diff --git a/msft_graphrag/artifacts.zip b/msft_graphrag/artifacts.zip index 5707c5c..cb60770 100644 Binary files a/msft_graphrag/artifacts.zip and b/msft_graphrag/artifacts.zip differ diff --git a/msft_graphrag/ms_graphrag_import.ipynb b/msft_graphrag/ms_graphrag_import.ipynb index a6b05eb..1e7448c 100644 --- a/msft_graphrag/ms_graphrag_import.ipynb +++ b/msft_graphrag/ms_graphrag_import.ipynb @@ -8,6 +8,8 @@ "## Neo4j Import of GraphRAG Result Parquet files\n", "This notebook imports the results of the GraphRAG indexing process into the Neo4j Graph database for further processing, analysis or visualization.\n", "\n", + "You can create a free Neo4j Graph database (free AuraDB) [here](https://neo4j.com/product/auradb/).\n", + "\n", "### How does it work?\n", "The notebook loads the parquet files from the output folder of your indexing process and loads them into Pandas dataframes. It then uses a batching approach to send a slice of the data into Neo4j to create nodes and relationships and add relevant properties. The id-arrays on most entities are turned into relationships.\n", "\n", @@ -33,18 +35,9 @@ { "cell_type": "code", "execution_count": 2, - "id": "3eeee95f-e4f2-4052-94fb-a5dc8ab542ae", + "id": "54481183", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n", - " from pandas.core import (\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from neo4j import GraphDatabase\n", @@ -53,17 +46,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "b6c15443-4acb-4f91-88ea-4e08abaa4c29", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "NEO4J_URI=\"bolt://localhost\"\n", + "NEO4J_URI=\"neo4j+s://name.databases.neo4j.io\"\n", "NEO4J_USERNAME=\"neo4j\"\n", "NEO4J_PASSWORD=\"password\"\n", "NEO4J_DATABASE=\"neo4j\"\n", "\n", - "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" + "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))\n", + "driver" ] }, { @@ -76,17 +81,18 @@ "def batched_import(statement, df, batch_size=1000):\n", " \"\"\"\n", " Import a dataframe into Neo4j using a batched approach.\n", - " Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.\n", + " Parameters: statement is the Cypher query to execute, df is the dataframe to import, \n", + " and batch_size is the number of rows to import in each batch.\n", " \"\"\"\n", " total = len(df)\n", " start_s = time.time()\n", - " for start in range(0,total, batch_size):\n", - " batch = df.iloc[start: min(start+batch_size,total)]\n", + " for start in range(0, total, batch_size):\n", + " batch = df.iloc[start: min(start+batch_size, total)]\n", " result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n", " rows=batch.to_dict('records'),\n", " database_=NEO4J_DATABASE)\n", " print(result.summary.counters)\n", - " print(f'{total} rows in { time.time() - start_s} s.') \n", + " print(f'{total} rows in {time.time() - start_s} s.') \n", " return total" ] }, @@ -173,6 +179,56 @@ "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "id", + "rawType": "object", + "type": "string" + }, + { + "name": "human_readable_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "title", + "rawType": "object", + "type": "string" + }, + { + "name": "creation_date", + "rawType": "object", + "type": "string" + }, + { + "name": "metadata", + "rawType": "object", + "type": "unknown" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "0c649b88-e220-4cd1-a303-8bed88b107ef", + "rows": [ + [ + "0", + "6852a66a08b38d4d412f043ca78ada50c66b2bb5e4724a4a94fb2849d62ad29f0000613f83d8dea22b5462a419f011e4d5c5b9948c8e743dbb64c64fd090dda6", + "1", + "penitencia.txt", + "2025-03-26 21:49:16 +0100", + null + ] + ], + "shape": { + "columns": 5, + "rows": 1 + } + }, "text/html": [ "
\n", "