deusebio · deusebio · Apr 5, 2025 · Feb 23, 2025 · Feb 23, 2025 · Feb 23, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -63,9 +63,14 @@ jobs:
 
           docker network create my-network
 
-          # Start Neo4j if we are testing chapter 10
+          # Start Neo4j and JanusGraph if we are testing chapter 10
           if [ "${{ matrix.chapter.name }}" == "chap10" ];
           then
+            docker run --rm --detach --name janusgraph \
+              --publish=8182:8182 \
+              janusgraph/janusgraph:1.1.0
+            docker network connect my-network janusgraph
+
             docker run --rm --detach --name neo4j \
               --publish=7474:7474 --publish=7687:7687 \
               --user="$(id -u):$(id -g)" \
@@ -83,10 +88,16 @@ jobs:
             --env KAGGLE_USERNAME=${KAGGLE_USERNAME} \
             --env KAGGLE_KEY=${KAGGLE_TOKEN} \
             --env NEO4J_HOST=neo4j \
+            --env JANUSGRAPH_HOST=janusgraph \
             graph-machine-learning:latest
           docker network connect my-network graph-machine-learning-box
 
           # Run tests
           cd docker
 
-          ./tests.sh ${{ matrix.chapter.folder }}
+          ./tests.sh ${{ matrix.chapter.folder }}
+
+      - name: tmate session if tests fail
+        if: failure() && github.event_name == 'workflow_dispatch'
+        uses: mxschmitt/action-tmate@v3
+
diff --git a/Chapter10/00_Data_Conversion.ipynb b/Chapter10/00_Data_Conversion.ipynb
@@ -0,0 +1,305 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "750472d1-ddd0-4a69-914c-77ac2ee9025c",
+   "metadata": {},
+   "source": [
+    "# Data conversion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "551da007-54e2-43b4-aad4-1ee33330510e",
+   "metadata": {},
+   "source": [
+    "This script allows the data conversion of the Cypher query to import the Movie dataset into Neo4j into two dataframes (nodes, and edges) that are imported in JanusGraph."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2afbf7aa-8903-4f0a-b622-2ed8ff947dfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"movieCreationQuery.txt\", \"r\") as fid:\n",
+    "    lines = fid.readlines()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b7e2b1d6-260b-4c51-aa50-8ce7573b7563",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "is_edge_regex = re.compile(\"\\[.*\\]\")\n",
+    "\n",
+    "def is_edge(line):\n",
+    "    if is_edge_regex.search(line):\n",
+    "        return True\n",
+    "    return False\n",
+    "\n",
+    "is_valid_regex = re.compile(\"\\(.*\\)\")\n",
+    "\n",
+    "def is_valid(line):\n",
+    "    if is_valid_regex.search(line):\n",
+    "        return True\n",
+    "    return False    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e2eb2bcc-ca2a-4204-95a0-55073ac6148d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json \n",
+    "\n",
+    "def dict_serializer(input_dict):\n",
+    "    return {\n",
+    "        k: json.dumps(v) if isinstance(v, dict) or isinstance(v, list) else v\n",
+    "        for k, v in input_dict.items()\n",
+    "    }\n",
+    "\n",
+    "single_quote_parser = lambda props: dict_serializer(json.loads(re.sub(\"(\\w+):\", r\"'\\1':\",  props).replace(\"'\",\"\\\"\")))\n",
+    "double_quote_parser = lambda props: dict_serializer(json.loads(re.sub(\"(\\w+):\", r'\"\\1\":',  props)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "12620971-08dd-4eb2-98f9-a351e44e412d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "\n",
+    "@dataclass\n",
+    "class Node:\n",
+    "    id: str\n",
+    "    label: str\n",
+    "    props: dict\n",
+    "\n",
+    "@dataclass\n",
+    "class Edge:\n",
+    "    id: tuple[str, str]\n",
+    "    label: str\n",
+    "    props: dict\n",
+    "\n",
+    "node_regex = re.compile(r'.*\\((\\w*):(\\w*).*({.*})\\).*')\n",
+    "\n",
+    "def parse_node_line(line):\n",
+    "\n",
+    "    name, label, props = node_regex.match(line).groups()\n",
+    "\n",
+    "    parser = double_quote_parser if '\"' in props else single_quote_parser\n",
+    "\n",
+    "    return Node(name, label, parser(props))\n",
+    "\n",
+    "edge_regex = re.compile(r'^[a-zA-Z\\s]*\\((\\w+)\\)-\\[:(\\w+) ({.*})\\]->\\((\\w+)\\).*')\n",
+    "edge_regex_noprop = re.compile(r'^[a-zA-Z\\s]*\\((\\w+)\\)-\\[:(\\w+)]->\\((\\w+)\\).*')\n",
+    "\n",
+    "def parse_edge_line(line):\n",
+    "\n",
+    "    try:\n",
+    "        source, rel_type, props, target = edge_regex.match(line).groups()\n",
+    "    except:\n",
+    "        source, rel_type, target = edge_regex_noprop.match(line).groups()\n",
+    "        props=\"{}\"\n",
+    "\n",
+    "    parser = double_quote_parser if '\"' in props else single_quote_parser\n",
+    "\n",
+    "    return Edge((source, target), rel_type, parser(props))\n",
+    "\n",
+    "\n",
+    "def parse_line(line: str):\n",
+    "    if not is_valid(line):\n",
+    "        return None\n",
+    "    \n",
+    "    if is_edge(line):\n",
+    "        return parse_edge_line(line)\n",
+    "    return parse_node_line(line)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bedf909d-1132-4be6-a4fc-ce8897e3bbae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "line=lines[18]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2d0b79ff-d34b-43db-9e3f-ba7dcbabd85f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Edge(id=('Emil', 'TheMatrix'), label='ACTED_IN', props={'roles': '[\"Emil\"]'})"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parse_line(line)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f7b03500-8d09-4d41-9b3c-f07f91ce8f22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "parsed_outout = [parse_line(line) for line in lines]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "147efe7d-6044-4711-be5c-caf4c5d65bda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nodes = [item for item in parsed_outout if isinstance(item, Node)]\n",
+    "edges = [item for item in parsed_outout if isinstance(item, Edge)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6bd3f299-569d-4661-a085-0b9a03dd9ede",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "171"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(nodes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "e6190a56-e7f5-4682-b773-ca8fdb3fa46d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "254"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(edges)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "58b8e97a-a774-4d77-a261-aeea4a112bb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "368f624d-1f3a-42a8-b747-80d39ec71cae",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Node(id='TheMatrix', label='Movie', props={'title': 'The Matrix', 'released': 1999, 'tagline': 'Welcome to the Real World'}),\n",
+       " Node(id='Keanu', label='Person', props={'name': 'Keanu Reeves', 'born': 1964}),\n",
+       " Node(id='Carrie', label='Person', props={'name': 'Carrie-Anne Moss', 'born': 1967}),\n",
+       " Node(id='Laurence', label='Person', props={'name': 'Laurence Fishburne', 'born': 1961}),\n",
+       " Node(id='Hugo', label='Person', props={'name': 'Hugo Weaving', 'born': 1960}),\n",
+       " Node(id='LillyW', label='Person', props={'name': 'Lilly Wachowski', 'born': 1967}),\n",
+       " Node(id='LanaW', label='Person', props={'name': 'Lana Wachowski', 'born': 1965}),\n",
+       " Node(id='JoelS', label='Person', props={'name': 'Joel Silver', 'born': 1952}),\n",
+       " Node(id='Emil', label='Person', props={'name': 'Emil Eifrem', 'born': 1978}),\n",
+       " Node(id='TheMatrixReloaded', label='Movie', props={'title': 'The Matrix Reloaded', 'released': 2003, 'tagline': 'Free your mind'})]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nodes[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ab66884a-5174-437c-91a2-d822366ba16b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame.from_records([{\"id\": node.id, \"label\": node.label, \"props\": node.props} for node in nodes]).set_index(\"id\").to_pickle(\"nodes.pkl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "188e259e-b56f-4c81-9a5a-a23f63ba1146",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame.from_records([{\"id\": edge.id, \"label\": edge.label, \"props\": edge.props} for edge in edges]).set_index(\"id\").to_pickle(\"edges.pkl\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "chap10",
+   "language": "python",
+   "name": "chap10"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}