from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from tqdm import tqdm
import os
import pickle
import re
import atexit
import oracledb

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai"  # Folder where the downloaded Autonomous Database wallet was unzipped
DB_ALIAS = "oradb23ai_high"       # Your database name plus _high, as defined in the wallet's tnsnames.ora
USERNAME = "USERNAME"             # Your database username
PASSWORD = "PASSWORD"             # Your database password (also used as the wallet password below)

os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "my_graph"

oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD
)
atexit.register(lambda: oracle_conn.close())

# =========================
# Oracle Graph Client
# =========================
def create_tables_if_not_exist(conn):
    cursor = conn.cursor()
    try:
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE ENTITIES (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NAME VARCHAR2(500)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE RELATIONS (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        RELATION_TYPE VARCHAR2(100),
                        SOURCE_TEXT VARCHAR2(4000)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        conn.commit()
        print("āœ… ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()

create_tables_if_not_exist(oracle_conn)

# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"

# =========================
# LLM Definitions
# =========================
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)
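
# ---------------------------------------------------------------------------
# Optional sanity check (illustrative sketch, not part of the original flow).
# It is never called by the script; you could run it manually to confirm that
# the OCI Gen AI endpoint, compartment OCID, and "DEFAULT" auth profile above
# actually respond before spending time on indexing. The test prompt is
# arbitrary.
# ---------------------------------------------------------------------------
def _sanity_check_genai():
    try:
        reply = llm_for_rag.invoke("Reply with the single word OK.")
        print(f"āœ… OCI Gen AI reachable: {reply.content.strip()}")
    except Exception as e:
        print(f"[SANITY CHECK] OCI Gen AI call failed: {e}")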

def create_knowledge_graph(chunks):
    cursor = oracle_conn.cursor()

    # Creates the graph if it does not exist
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                        VERTEX TABLES (
                            ENTITIES KEY (ID) LABEL ENTITIES PROPERTIES (NAME)
                        )
                        EDGE TABLES (
                            RELATIONS KEY (ID)
                                SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID)
                                DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID)
                                LABEL RELATIONS PROPERTIES (RELATION_TYPE, SOURCE_TEXT)
                        )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -55358 THEN  -- ORA-55358: Graph already exists
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")

    # Inserting vertices and edges into the tables
    for doc in chunks:
        text = doc.page_content
        source = doc.metadata.get("source", "unknown")
        if not text.strip():
            continue

        prompt = f"""
        You are an expert in knowledge extraction. Given the following technical text:

        {text}

        Extract key entities and relationships in the format:
        - Entity1 -[RELATION]-> Entity2

        Use UPPERCASE for RELATION types. Return 'NONE' if nothing found.
        """

        try:
            response = llm_for_rag.invoke(prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            continue

        if result.upper() == "NONE":
            continue

        triples = result.splitlines()
        for triple in triples:
            parts = triple.split("-[")
            if len(parts) != 2:
                continue
            right_part = parts[1].split("]->")
            if len(right_part) != 2:
                continue
            raw_relation, entity2 = right_part
            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
            entity1 = parts[0].strip()
            entity2 = entity2.strip()

            try:
                # Insertion of entities (with existence check)
                cursor.execute(
                    "MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src "
                    "ON (e.name = src.name) "
                    "WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)",
                    [entity1, entity1]
                )
                cursor.execute(
                    "MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src "
                    "ON (e.name = src.name) "
                    "WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)",
                    [entity2, entity2]
                )

                # Retrieve the IDs
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity1])
                source_id = cursor.fetchone()[0]
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity2])
                target_id = cursor.fetchone()[0]

                # Create relations
                cursor.execute("""
                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                    VALUES (:src, :tgt, :rel, :txt)
                """, [source_id, target_id, relation, source])

                print(f"āœ… {entity1} -[{relation}]-> {entity2}")
            except Exception as e:
                print(f"[INSERT ERROR] {e}")

    oracle_conn.commit()
    cursor.close()
    print("šŸ’¾ Knowledge graph updated.")
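
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the extraction loop in
# create_knowledge_graph() expects the LLM to answer with lines shaped like
# "Entity1 -[RELATION]-> Entity2". The uncalled helper below walks one
# hypothetical triple through the same split/normalisation steps, so the
# expected format is easy to verify in isolation. The sample entities are
# made up.
# ---------------------------------------------------------------------------
def _parse_triple_example():
    sample = "API Gateway -[ROUTES_TO]-> Oracle Functions"
    left, right = sample.split("-[")            # "API Gateway ", "ROUTES_TO]-> Oracle Functions"
    raw_relation, entity2 = right.split("]->")  # "ROUTES_TO", " Oracle Functions"
    relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
    print(f"{left.strip()} -[{relation}]-> {entity2.strip()}")
    # Expected output: API Gateway -[ROUTES_TO]-> Oracle Functions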

def extract_graph_keywords(question: str) -> str:
    prompt = f"""
    Based on the question below, extract relevant keywords (1 to 2 words per term)
    that can be used to search for entities and relationships in a technical knowledge graph.

    Question: "{question}"

    Rules:
    - Split compound terms (e.g., "API Gateway" → "API", "Gateway")
    - Remove duplicates
    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
    - Return only the keywords, separated by commas. No explanations.

    Result:
    """
    try:
        resp = llm_for_rag.invoke(prompt)
        keywords_raw = resp.content.strip()
        # Additional post-processing: remove duplicates, normalize
        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
        keywords = [kw for kw in keywords if kw]  # remove empty strings
        return ", ".join(sorted(keywords))
    except Exception as e:
        print(f"[KEYWORD EXTRACTION ERROR] {e}")
        return ""


def query_knowledge_graph(query_text):
    cursor = oracle_conn.cursor()
    sanitized_text = query_text.lower()

    pgql = f"""
        SELECT from_entity, relation_type, to_entity
        FROM GRAPH_TABLE(
            {GRAPH_NAME}
            MATCH (e1 is ENTITIES)-[r is RELATIONS]->(e2 is ENTITIES)
            WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
               OR CONTAINS(e2.name, '{sanitized_text}') > 0
               OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
            COLUMNS (
                e1.name AS from_entity,
                r.RELATION_TYPE AS relation_type,
                e2.name AS to_entity
            )
        )
        FETCH FIRST 20 ROWS ONLY
    """
    # Show the query formulated for the graph
    print(pgql)

    try:
        cursor.execute(pgql)
        rows = cursor.fetchall()
        if not rows:
            return "āš ļø No relationships found in the graph."
        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)
    except Exception as e:
        return f"[PGQL ERROR] {e}"
    finally:
        cursor.close()


def split_llm_output_into_chapters(llm_text):
    chapters = []
    current_chapter = []
    lines = llm_text.splitlines()

    for line in lines:
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())

    return chapters
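
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): chapter_separator_regex
# is intended to treat Markdown headings ("# Title" up to "###### Title") and
# fully bolded lines ("**Title**") as chapter boundaries. The uncalled helper
# below uses made-up sample lines only to show which ones the regex considers
# a heading.
# ---------------------------------------------------------------------------
def _chapter_regex_example():
    for line in ["## Networking", "**Overview**", "A plain paragraph line."]:
        print(f"{line!r} -> heading: {bool(re.match(chapter_separator_regex, line))}")
    # Expected: the first two lines match, the plain paragraph does not.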

def semantic_chunking(text):
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end)
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    """
    # Retry until the Gen AI call succeeds
    get_out = False
    while not get_out:
        try:
            response = llm_for_rag.invoke(prompt)
            get_out = True
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
    return response


def read_pdfs(pdf_path):
    # Choose the loader based on the file name convention
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    full_text = "\n".join([page.page_content for page in doc_pages])
    return full_text


def smart_split_text(text, max_chunk_size=10_000):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        split_point = max(
            text.rfind('.', start, end),
            text.rfind('!', start, end),
            text.rfind('?', start, end),
            text.rfind('\n\n', start, end)
        )
        if split_point == -1 or split_point <= start:
            split_point = end
        else:
            split_point += 1

        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)
        start = split_point

    return chunks


def load_previously_indexed_docs():
    if os.path.exists(PROCESSED_DOCS_FILE):
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)
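
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): smart_split_text()
# prefers to cut at sentence ends or blank lines rather than at a hard
# character limit. The uncalled helper below uses a made-up string and a tiny
# max_chunk_size only to make that behaviour visible.
# ---------------------------------------------------------------------------
def _smart_split_example():
    demo = "First sentence. Second sentence! Third sentence? Fourth."
    for i, chunk in enumerate(smart_split_text(demo, max_chunk_size=20), start=1):
        print(i, repr(chunk))
    # Expected: four chunks, each ending exactly at a sentence boundary.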

# =========================
# Main Function
# =========================
def chat():
    pdf_paths = ['AAAAAAAAAA.pdf']  # Your PDF files used as the knowledge base

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("āœ”ļø FAISS index loaded.")
    except Exception:
        print("āš ļø FAISS index not found, creating a new one.")
        vectorstore = None

    new_chunks = []
    for pdf_path in tqdm(pdf_paths, desc="šŸ“„ Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"āœ… Document already indexed: {pdf_path}")
            continue

        full_text = read_pdfs(pdf_path=pdf_path)
        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)

        overflow_buffer = ""
        for chunk in tqdm(text_chunks, desc="šŸ“„ Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk
            treated_text = semantic_chunking(current_text)

            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)
                last_chapter = chapters[-1] if chapters else ""

                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("šŸ“Œ Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
                    new_chunks.append(doc)
                    print(f"āœ… New chapter indexed:\n{chapter_text}...\n")
            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)
        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"šŸ’¾ {len(new_chunks)} chunks added to FAISS index.")
        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)
    else:
        print("šŸ“ No new documents to index.")

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})

    template = """
    Document context:
    {context}

    Graph context:
    {graph_context}

    Question: {input}

    Interpretation rules:
    - You can search for a step-by-step tutorial about a subject
    - You can search for a concept description about a subject
    - You can search for a list of components about a subject
    """
    prompt = PromptTemplate.from_template(template)

    def get_context(x):
        query = x.get("input") if isinstance(x, dict) else x
        return retriever.invoke(query)

    chain = (
        RunnableMap({
            "context": RunnableLambda(get_context),
            "graph_context": RunnableLambda(
                lambda x: query_knowledge_graph(
                    extract_graph_keywords(x.get("input") if isinstance(x, dict) else x)
                )
            ),
            "input": lambda x: x.get("input") if isinstance(x, dict) else x
        })
        | prompt
        | llm
        | StrOutputParser()
    )

    print("āœ… READY")
    while True:
        query = input("ā“ Question (or 'quit' to exit): ")
        if query.lower() == "quit":
            break
        response = chain.invoke(query)
        print("\nšŸ“œ RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")


# šŸš€ Run
if __name__ == "__main__":
    chat()
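
# ---------------------------------------------------------------------------
# Usage notes (assumptions, not stated in the original script):
# - The ChatOCIGenAI / OCIGenAIEmbeddings clients above use the "DEFAULT"
#   profile, which is typically an API-key entry in ~/.oci/config.
# - Likely Python dependencies (exact package names may differ per setup):
#   pip install langchain langchain-community oci oracledb faiss-cpu \
#       pymupdf "unstructured[pdf]" tqdm
# - Replace WALLET_PATH, DB_ALIAS, USERNAME, PASSWORD, the compartment OCID,
#   and the pdf_paths list in chat() with your own values before running.
# ---------------------------------------------------------------------------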