import difflib
import os

import numpy as np
import oracledb
from langchain_community.embeddings import OCIGenAIEmbeddings
from rapidfuzz import fuzz


class SearchSimilarProduct:
    """Find catalog products similar to a free-text description.

    On construction, every row of the ``embeddings_products`` table is loaded
    into memory (id, code, description and a float32 embedding vector).
    Queries are embedded with ``OCIGenAIEmbeddings`` and ranked by Euclidean
    distance against the loaded vectors; when no vector is closer than
    ``minimal_distance`` a fuzzy string match over the descriptions is
    returned instead.

    The instance owns a database connection. Call :meth:`close` when done, or
    use the instance as a context manager.
    """

    # NOTE(review): the defaults below embed a developer-specific wallet path
    # and placeholder credentials. They are kept so existing callers keep
    # working, but real deployments should pass these values explicitly.
    def __init__(
        self,
        top_k=5,
        minimal_distance=1.0,
        model_id="cohere.embed-english-light-v3.0",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
        auth_profile="DEFAULT",
        wallet_path="/Users/cristianohoshikawa/Dropbox/ORACLE/MyWork/Projects/ocigenaillm/Wallet_oradb23ai",
        db_alias="oradb23ai_high",
        username="USER",
        password="password",
        wallet_password=None,
    ):
        """Connect to the database, build the embedding client and load vectors.

        Args:
            top_k: Maximum number of results returned per search.
            minimal_distance: Semantic matches must have a Euclidean distance
                strictly below this value.
            model_id: Embedding model passed to ``OCIGenAIEmbeddings``.
            service_endpoint: Endpoint passed to ``OCIGenAIEmbeddings``.
            compartment_id: Compartment passed to ``OCIGenAIEmbeddings``.
            auth_profile: Auth profile passed to ``OCIGenAIEmbeddings``.
            wallet_path: Directory used as ``TNS_ADMIN``, ``config_dir`` and
                ``wallet_location`` for the connection.
            db_alias: DSN alias passed to ``oracledb.connect``.
            username: Database user.
            password: Database password.
            wallet_password: Wallet password; when ``None`` the database
                password is used, which is the historical behavior.
        """
        if wallet_password is None:
            wallet_password = password

        os.environ["TNS_ADMIN"] = wallet_path
        self.conn = oracledb.connect(
            user=username,
            password=password,
            dsn=db_alias,
            config_dir=wallet_path,
            wallet_location=wallet_path,
            wallet_password=wallet_password,
        )
        try:
            self.top_k = top_k
            self.minimal_distance = minimal_distance
            self.embedding = OCIGenAIEmbeddings(
                model_id=model_id,
                service_endpoint=service_endpoint,
                compartment_id=compartment_id,
                auth_profile=auth_profile,
            )
            print("📦 Loading Oracle Vectors...")
            self._load_embeddings()
        except Exception:
            # Do not leak the connection when setup fails half-way.
            self.close()
            raise

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database connection when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the database connection; calling it again is a no-op."""
        conn = getattr(self, "conn", None)
        if conn is not None:
            self.conn = None
            conn.close()

    def _load_embeddings(self):
        """Load all products and their vectors from ``embeddings_products``.

        Populates ``self.products`` (list of dicts with ``id``, ``code`` and
        ``description``) and ``self.vectors`` (one row per product, decoded
        from the raw bytes of the ``vector`` column as float32).
        """
        vectors = []
        products = []
        with self.conn.cursor() as cursor:
            cursor.execute(
                "SELECT id, code, description, vector FROM embeddings_products"
            )
            for id_, code, description, blob in cursor:
                # The column arrives as a LOB object by default, or as raw
                # bytes when the driver is configured not to fetch LOBs.
                raw = blob.read() if hasattr(blob, "read") else blob
                vectors.append(np.frombuffer(raw, dtype=np.float32))
                products.append(
                    {"id": id_, "code": code, "description": description}
                )

        self.products = products
        # Keep the matrix 2-D even when the table is empty so that axis=1
        # operations in the search never see a 1-D array.
        if vectors:
            self.vectors = np.array(vectors)
        else:
            self.vectors = np.empty((0, 0), dtype=np.float32)

    def _correct_input(self, input_user):
        """Return the closest known description, or the input unchanged.

        Uses ``difflib.get_close_matches`` with a 0.6 cutoff against all
        loaded descriptions; rows without a description are ignored.
        """
        descriptions = [
            p["description"] for p in self.products if p["description"]
        ]
        suggestions = difflib.get_close_matches(
            input_user, descriptions, n=1, cutoff=0.6
        )
        return suggestions[0] if suggestions else input_user

    def search_similar_products(self, description_input):
        """Search products similar to ``description_input``.

        Args:
            description_input: Free-text product description.

        Returns:
            A dict with keys ``consult_original`` (stripped input),
            ``consult_used`` (input after typo correction), ``semantics``
            (up to ``top_k`` vector matches closer than ``minimal_distance``,
            nearest first) and ``fallback_fuzzy`` (up to ``top_k`` fuzzy
            string matches, filled only when ``semantics`` is empty).

        Raises:
            ValueError: If the query embedding dimension differs from the
                dimension of the stored vectors.
        """
        description_input = description_input.strip()
        description_corrected = self._correct_input(description_input)

        results = {
            "consult_original": description_input,
            "consult_used": description_corrected,
            "semantics": [],
            "fallback_fuzzy": [],
        }

        # Nothing to compare against: skip the remote embedding call.
        if not self.products:
            return results

        consult_emb = np.asarray(
            self.embedding.embed_query(description_corrected), dtype=np.float64
        )
        if consult_emb.ndim != 1 or self.vectors.shape[1] != consult_emb.shape[0]:
            raise ValueError(
                f"Query embedding shape {consult_emb.shape} does not match "
                f"stored vectors of dimension {self.vectors.shape[1]}"
            )

        # Euclidean distance calculation
        dists = np.linalg.norm(self.vectors - consult_emb, axis=1)
        top_indices = np.argsort(dists)[:self.top_k]

        for idx in top_indices:
            # Plain floats keep the result free of NumPy scalar types.
            dist = float(dists[idx])
            if dist < self.minimal_distance:
                match = self.products[idx]
                similarity = 1 / (1 + dist)
                results["semantics"].append({
                    "id": match["id"],
                    "code": match["code"],
                    "description": match["description"],
                    "similarity": round(similarity * 100, 2),
                    "distance": round(dist, 4),
                })

        if not results["semantics"]:
            scored = [
                (
                    product,
                    fuzz.token_sort_ratio(
                        description_corrected, product["description"]
                    ),
                )
                for product in self.products
            ]
            scored.sort(key=lambda pair: pair[1], reverse=True)
            for product, score in scored[:self.top_k]:
                results["fallback_fuzzy"].append({
                    "id": product["id"],
                    "code": product["code"],
                    "description": product["description"],
                    "score_fuzzy": round(score, 2),
                })

        return results