From 48c08be647dd18de60f42865bd613aae576701b6 Mon Sep 17 00:00:00 2001 From: Marko Budiselic Date: Mon, 10 Nov 2025 15:46:50 -0800 Subject: [PATCH 1/2] Add unstructured2graph improvements --- unstructured2graph/examples/graphrag.py | 2 +- unstructured2graph/examples/loading.py | 4 ++-- unstructured2graph/examples/sources.py | 16 +++++++++++++++- .../src/unstructured2graph/loaders.py | 18 ++++++++++++------ .../src/unstructured2graph/memgraph.py | 6 +++++- uv.lock | 12 +++++++++++- 6 files changed, 46 insertions(+), 12 deletions(-) diff --git a/unstructured2graph/examples/graphrag.py b/unstructured2graph/examples/graphrag.py index facc8c4..1585e73 100644 --- a/unstructured2graph/examples/graphrag.py +++ b/unstructured2graph/examples/graphrag.py @@ -17,7 +17,7 @@ f""" CALL embeddings.text(['Hello world prompt']) YIELD embeddings, success CALL vector_search.search('vs_name', 10, embeddings[0]) YIELD distance, node, similarity - MATCH (node)-[r*bfs]-(dst) + MATCH (node)-[r*bfs]-(dst:Chunk) WITH DISTINCT dst, degree(dst) AS degree ORDER BY degree DESC RETURN dst LIMIT 10; """ diff --git a/unstructured2graph/examples/loading.py b/unstructured2graph/examples/loading.py index ebb15f0..8e7abf0 100644 --- a/unstructured2graph/examples/loading.py +++ b/unstructured2graph/examples/loading.py @@ -7,7 +7,7 @@ from memgraph_toolbox.api.memgraph import Memgraph from unstructured2graph import from_unstructured, create_index -from sources import SOURCES +import sources as SOURCES SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) LIGHTRAG_DIR = os.path.join(SCRIPT_DIR, "..", "lightrag_storage.out") @@ -34,7 +34,7 @@ async def from_unstructured_with_prep(): await lightrag_wrapper.initialize(working_dir=LIGHTRAG_DIR) await from_unstructured( - SOURCES, memgraph, lightrag_wrapper, only_chunks=False, link_chunks=True + SOURCES.MEMGRAPH_DOCS_GITHUB_LATEST_RAW, memgraph, lightrag_wrapper, only_chunks=False, link_chunks=True ) await lightrag_wrapper.afinalize() diff --git a/unstructured2graph/examples/sources.py b/unstructured2graph/examples/sources.py index afa3747..eba2b74 100644 --- a/unstructured2graph/examples/sources.py +++ b/unstructured2graph/examples/sources.py @@ -6,7 +6,7 @@ pypdf_samples_dir = os.path.join(SCRIPT_DIR, "..", "sample-data", "pdf", "sample-files") docx_samples_dir = os.path.join(SCRIPT_DIR, "..", "sample-data", "doc") xls_samples_dir = os.path.join(SCRIPT_DIR, "..", "sample-data", "xls") -SOURCES = [ +RANDOM = [ os.path.join( pypdf_samples_dir, "011-google-doc-document", "google-doc-document.pdf" ), @@ -14,3 +14,17 @@ # os.path.join(xls_samples_dir, "financial-sample.xlsx"), # "https://memgraph.com/docs/ai-ecosystem/graph-rag", ] +MEMGRAPH_DOCS = [ + "https://memgraph.com/docs/querying/clauses", + "https://memgraph.com/docs/clustering/high-availability" +] + +MEMGRAPH_DOCS_GITHUB_LATEST = [ + "https://github.com/memgraph/documentation/pull/1452/files" +] + +MEMGRAPH_DOCS_GITHUB_LATEST_RAW = [ + "https://raw.githubusercontent.com/memgraph/documentation/f6f165649b89efc51fa4153fffc08ff5304ca0c9/pages/database-management/authentication-and-authorization/mlbac-migration-guide.mdx", + # "https://raw.githubusercontent.com/memgraph/documentation/f6f165649b89efc51fa4153fffc08ff5304ca0c9/pages/database-management/authentication-and-authorization/role-based-access-control.mdx", + # "https://raw.githubusercontent.com/memgraph/documentation/40ab6644f7113aa5cb86faa48961d2cb2c34f2cc/pages/data-migration/parquet.mdx", +] diff --git a/unstructured2graph/src/unstructured2graph/loaders.py b/unstructured2graph/src/unstructured2graph/loaders.py index c48662c..63c7101 100644 --- a/unstructured2graph/src/unstructured2graph/loaders.py +++ b/unstructured2graph/src/unstructured2graph/loaders.py @@ -132,12 +132,10 @@ async def from_unstructured( ) memgraph_node_props = [] for chunk in document.chunks: - if not only_chunks: - await lightrag_wrapper.ainsert( - input=chunk.text, file_paths=[chunk.hash] - ) + logger.info(f"Chunk: {chunk.hash} - {chunk.text}") memgraph_node_props.append({"hash": chunk.hash, "text": chunk.text}) create_nodes_from_list(memgraph, memgraph_node_props, "Chunk", 100) + if link_chunks: hash_pairs = [ (document.chunks[i].hash, document.chunks[i + 1].hash) @@ -149,6 +147,15 @@ async def from_unstructured( for from_hash, to_hash in hash_pairs ] link_nodes_in_order(memgraph, "Chunk", "hash", relationships, "NEXT") + + for chunk in document.chunks: + if not only_chunks: + await lightrag_wrapper.ainsert( + input=chunk.text, file_paths=[chunk.hash] + ) + if not only_chunks: + connect_chunks_to_entities(memgraph, "Chunk", "base") + processed_chunks += len(document.chunks) elapsed_time = time.time() - start_time estimated_time_remaining = ( @@ -168,5 +175,4 @@ async def from_unstructured( logger.info( f"Processed {processed_chunks} chunks out of {total_chunks}. Estimated time remaining: {time_str}" ) - if not only_chunks: - connect_chunks_to_entities(memgraph, "Chunk", "base") + diff --git a/unstructured2graph/src/unstructured2graph/memgraph.py b/unstructured2graph/src/unstructured2graph/memgraph.py index de644c9..0e58e97 100644 --- a/unstructured2graph/src/unstructured2graph/memgraph.py +++ b/unstructured2graph/src/unstructured2graph/memgraph.py @@ -45,7 +45,11 @@ def create_nodes_from_list( def connect_chunks_to_entities(memgraph: Memgraph, chunk_label: str, entity_label: str): memgraph.query( - f"MATCH (n:{entity_label}), (m:{chunk_label}) WHERE n.file_path = m.hash CREATE (n)-[:MENTIONED_IN]->(m);" + f""" + MATCH (n:{entity_label}), (m:{chunk_label}) + WHERE n.file_path = m.hash + MERGE (n)-[:MENTIONED_IN]->(m); + """ ) diff --git a/uv.lock b/uv.lock index 689d00b..1908184 100644 --- a/uv.lock +++ b/uv.lock @@ -1600,6 +1600,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358 }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550 }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126 }, + { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904 }, + { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228 }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654 }, { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305 }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472 }, @@ -1609,6 +1611,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684 }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647 }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073 }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385 }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329 }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100 }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079 }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997 }, @@ -1618,6 +1622,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586 }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281 }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142 }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846 }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814 }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899 }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814 }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073 }, @@ -1627,6 +1633,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497 }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662 }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210 }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759 }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288 }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685 }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586 }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346 }, @@ -1634,6 +1642,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659 }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355 }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512 }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508 }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760 }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425 }, ] @@ -6466,7 +6476,7 @@ wheels = [ [[package]] name = "unstructured2graph" -version = "0.1.1" +version = "0.1.2" source = { editable = "unstructured2graph" } dependencies = [ { name = "lightrag-memgraph" }, From ca5fcea8c406b17d37a983d9602092d19c37ecce Mon Sep 17 00:00:00 2001 From: Marko Budiselic Date: Mon, 10 Nov 2025 16:21:13 -0800 Subject: [PATCH 2/2] Refactor the GraphRAG example + add the LLM summarization --- unstructured2graph/examples/graphrag.py | 72 +++++++++++++++---- unstructured2graph/examples/loading.py | 6 +- .../examples/prompt_templates.py | 17 +++++ unstructured2graph/examples/sources.py | 2 +- .../src/unstructured2graph/loaders.py | 1 - 5 files changed, 81 insertions(+), 17 deletions(-) create mode 100644 unstructured2graph/examples/prompt_templates.py diff --git a/unstructured2graph/examples/graphrag.py b/unstructured2graph/examples/graphrag.py index 1585e73..8732a59 100644 --- a/unstructured2graph/examples/graphrag.py +++ b/unstructured2graph/examples/graphrag.py @@ -1,32 +1,76 @@ +import argparse +import asyncio +import logging +import os + from memgraph_toolbox.api.memgraph import Memgraph +from openai import OpenAI from unstructured2graph import compute_embeddings, create_vector_search_index +from loading import from_unstructured_with_prep +import prompt_templates -if __name__ == "__main__": +async def full_graphrag(args): #### INGESTION - # TODO(gitbuda): Add the import here. memgraph = Memgraph() - compute_embeddings(memgraph, "Chunk") - create_vector_search_index(memgraph, "Chunk", "embedding") + if args.ingestion: + await from_unstructured_with_prep() + compute_embeddings(memgraph, "Chunk") + create_vector_search_index(memgraph, "Chunk", "embedding") - #### RETRIEVAL / GRAPHRAG - # The Native/One-query GraphRAG! - # TODO(gitbuda): In the current small graph, the Chunks are not connected via the entity graph. + #### RETRIEVAL / GRAPHRAG -> The Native/One-query GraphRAG! + prompt = "What is different under v3.7 compared to v3.6?" + retrieved_chunks = [] for row in memgraph.query( f""" - CALL embeddings.text(['Hello world prompt']) YIELD embeddings, success - CALL vector_search.search('vs_name', 10, embeddings[0]) YIELD distance, node, similarity + CALL embeddings.text(['{prompt}']) YIELD embeddings, success + CALL vector_search.search('vs_name', 5, embeddings[0]) YIELD distance, node, similarity MATCH (node)-[r*bfs]-(dst:Chunk) WITH DISTINCT dst, degree(dst) AS degree ORDER BY degree DESC - RETURN dst LIMIT 10; + RETURN dst LIMIT 5; """ ): if "description" in row["dst"]: - print(row["dst"]["description"]) + retrieved_chunks.append(row["dst"]["description"]) if "text" in row["dst"]: - print(row["dst"]["text"]) - print("----") + retrieved_chunks.append(row["dst"]["text"]) #### SUMMARIZATION - # TODO(gitbuda): Call LLM to generate the final answer. + if not retrieved_chunks: + print("No chunks retrieved. Cannot generate answer.") + else: + context = "\n\n".join(retrieved_chunks) + system_message = prompt_templates.system_message + user_message = prompt_templates.user_message(context, prompt) + if not os.environ.get("OPENAI_API_KEY"): + raise ValueError( + "OPENAI_API_KEY environment variable is not set. Please set your OpenAI API key." + ) + client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + completion = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": user_message}, + ], + temperature=0.1, + ) + answer = completion.choices[0].message.content + print(f"\nQuestion: {prompt}") + print(f"\nAnswer:\n{answer}") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser( + description="GraphRAG: Retrieve and answer questions using graph-based RAG" + ) + parser.add_argument( + "--ingestion", + action="store_true", + help="Run data ingestion (load documents, compute embeddings, create vector index). By default, ingestion is skipped.", + ) + args = parser.parse_args() + + asyncio.run(full_graphrag(args)) diff --git a/unstructured2graph/examples/loading.py b/unstructured2graph/examples/loading.py index 8e7abf0..9042e3f 100644 --- a/unstructured2graph/examples/loading.py +++ b/unstructured2graph/examples/loading.py @@ -34,7 +34,11 @@ async def from_unstructured_with_prep(): await lightrag_wrapper.initialize(working_dir=LIGHTRAG_DIR) await from_unstructured( - SOURCES.MEMGRAPH_DOCS_GITHUB_LATEST_RAW, memgraph, lightrag_wrapper, only_chunks=False, link_chunks=True + SOURCES.MEMGRAPH_DOCS_GITHUB_LATEST_RAW, + memgraph, + lightrag_wrapper, + only_chunks=False, + link_chunks=True, ) await lightrag_wrapper.afinalize() diff --git a/unstructured2graph/examples/prompt_templates.py b/unstructured2graph/examples/prompt_templates.py new file mode 100644 index 0000000..277f4f3 --- /dev/null +++ b/unstructured2graph/examples/prompt_templates.py @@ -0,0 +1,17 @@ +system_message = f""" + You are a helpful assistant that answers questions based on the provided context. + Use only the information from the context to answer the question. + If the context doesn't contain enough information to answer the question, say so. +""" + +# Create the prompt with context +user_message = ( + lambda context, prompt: f""" + Based on the following context, please answer the question. + + Context: {context} + + Question: {prompt} + + Answer:""" +) diff --git a/unstructured2graph/examples/sources.py b/unstructured2graph/examples/sources.py index eba2b74..92bd089 100644 --- a/unstructured2graph/examples/sources.py +++ b/unstructured2graph/examples/sources.py @@ -16,7 +16,7 @@ ] MEMGRAPH_DOCS = [ "https://memgraph.com/docs/querying/clauses", - "https://memgraph.com/docs/clustering/high-availability" + "https://memgraph.com/docs/clustering/high-availability", ] MEMGRAPH_DOCS_GITHUB_LATEST = [ diff --git a/unstructured2graph/src/unstructured2graph/loaders.py b/unstructured2graph/src/unstructured2graph/loaders.py index 63c7101..888d196 100644 --- a/unstructured2graph/src/unstructured2graph/loaders.py +++ b/unstructured2graph/src/unstructured2graph/loaders.py @@ -175,4 +175,3 @@ async def from_unstructured( logger.info( f"Processed {processed_chunks} chunks out of {total_chunks}. Estimated time remaining: {time_str}" ) -