Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,14 @@ jobs:

docker network create my-network

# Start Neo4j if we are testing chapter 10
# Start Neo4j and JanusGraph if we are testing chapter 10
if [ "${{ matrix.chapter.name }}" == "chap10" ];
then
docker run --rm --detach --name janusgraph \
--publish=8182:8182 \
janusgraph/janusgraph:1.1.0
docker network connect my-network janusgraph

docker run --rm --detach --name neo4j \
--publish=7474:7474 --publish=7687:7687 \
--user="$(id -u):$(id -g)" \
Expand All @@ -83,10 +88,16 @@ jobs:
--env KAGGLE_USERNAME=${KAGGLE_USERNAME} \
--env KAGGLE_KEY=${KAGGLE_TOKEN} \
--env NEO4J_HOST=neo4j \
--env JANUSGRAPH_HOST=janusgraph \
graph-machine-learning:latest
docker network connect my-network graph-machine-learning-box

# Run tests
cd docker

./tests.sh ${{ matrix.chapter.folder }}
./tests.sh ${{ matrix.chapter.folder }}

- name: tmate session if tests fail
if: failure() && github.event_name == 'workflow_dispatch'
uses: mxschmitt/action-tmate@v3

305 changes: 305 additions & 0 deletions Chapter10/00_Data_Conversion.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "750472d1-ddd0-4a69-914c-77ac2ee9025c",
"metadata": {},
"source": [
"# Data conversion"
]
},
{
"cell_type": "markdown",
"id": "551da007-54e2-43b4-aad4-1ee33330510e",
"metadata": {},
"source": [
"This script allows the data conversion of the Cypher query to import the Movie dataset into Neo4j into two dataframes (nodes, and edges) that are imported in JanusGraph."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2afbf7aa-8903-4f0a-b622-2ed8ff947dfa",
"metadata": {},
"outputs": [],
"source": [
"with open(\"movieCreationQuery.txt\", \"r\") as fid:\n",
" lines = fid.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b7e2b1d6-260b-4c51-aa50-8ce7573b7563",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"is_edge_regex = re.compile(\"\\[.*\\]\")\n",
"\n",
"def is_edge(line):\n",
" if is_edge_regex.search(line):\n",
" return True\n",
" return False\n",
"\n",
"is_valid_regex = re.compile(\"\\(.*\\)\")\n",
"\n",
"def is_valid(line):\n",
" if is_valid_regex.search(line):\n",
" return True\n",
" return False "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e2eb2bcc-ca2a-4204-95a0-55073ac6148d",
"metadata": {},
"outputs": [],
"source": [
"import json \n",
"\n",
"def dict_serializer(input_dict):\n",
" return {\n",
" k: json.dumps(v) if isinstance(v, dict) or isinstance(v, list) else v\n",
" for k, v in input_dict.items()\n",
" }\n",
"\n",
"single_quote_parser = lambda props: dict_serializer(json.loads(re.sub(\"(\\w+):\", r\"'\\1':\", props).replace(\"'\",\"\\\"\")))\n",
"double_quote_parser = lambda props: dict_serializer(json.loads(re.sub(\"(\\w+):\", r'\"\\1\":', props)))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "12620971-08dd-4eb2-98f9-a351e44e412d",
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"\n",
"@dataclass\n",
"class Node:\n",
" id: str\n",
" label: str\n",
" props: dict\n",
"\n",
"@dataclass\n",
"class Edge:\n",
" id: tuple[str, str]\n",
" label: str\n",
" props: dict\n",
"\n",
"node_regex = re.compile(r'.*\\((\\w*):(\\w*).*({.*})\\).*')\n",
"\n",
"def parse_node_line(line):\n",
"\n",
" name, label, props = node_regex.match(line).groups()\n",
"\n",
" parser = double_quote_parser if '\"' in props else single_quote_parser\n",
"\n",
" return Node(name, label, parser(props))\n",
"\n",
"edge_regex = re.compile(r'^[a-zA-Z\\s]*\\((\\w+)\\)-\\[:(\\w+) ({.*})\\]->\\((\\w+)\\).*')\n",
"edge_regex_noprop = re.compile(r'^[a-zA-Z\\s]*\\((\\w+)\\)-\\[:(\\w+)]->\\((\\w+)\\).*')\n",
"\n",
"def parse_edge_line(line):\n",
"\n",
" try:\n",
" source, rel_type, props, target = edge_regex.match(line).groups()\n",
" except:\n",
" source, rel_type, target = edge_regex_noprop.match(line).groups()\n",
" props=\"{}\"\n",
"\n",
" parser = double_quote_parser if '\"' in props else single_quote_parser\n",
"\n",
" return Edge((source, target), rel_type, parser(props))\n",
"\n",
"\n",
"def parse_line(line: str):\n",
" if not is_valid(line):\n",
" return None\n",
" \n",
" if is_edge(line):\n",
" return parse_edge_line(line)\n",
" return parse_node_line(line)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bedf909d-1132-4be6-a4fc-ce8897e3bbae",
"metadata": {},
"outputs": [],
"source": [
"line=lines[18]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2d0b79ff-d34b-43db-9e3f-ba7dcbabd85f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Edge(id=('Emil', 'TheMatrix'), label='ACTED_IN', props={'roles': '[\"Emil\"]'})"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parse_line(line)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f7b03500-8d09-4d41-9b3c-f07f91ce8f22",
"metadata": {},
"outputs": [],
"source": [
"parsed_outout = [parse_line(line) for line in lines]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "147efe7d-6044-4711-be5c-caf4c5d65bda",
"metadata": {},
"outputs": [],
"source": [
"nodes = [item for item in parsed_outout if isinstance(item, Node)]\n",
"edges = [item for item in parsed_outout if isinstance(item, Edge)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6bd3f299-569d-4661-a085-0b9a03dd9ede",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"171"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(nodes)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e6190a56-e7f5-4682-b773-ca8fdb3fa46d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"254"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(edges)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "58b8e97a-a774-4d77-a261-aeea4a112bb8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "368f624d-1f3a-42a8-b747-80d39ec71cae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Node(id='TheMatrix', label='Movie', props={'title': 'The Matrix', 'released': 1999, 'tagline': 'Welcome to the Real World'}),\n",
" Node(id='Keanu', label='Person', props={'name': 'Keanu Reeves', 'born': 1964}),\n",
" Node(id='Carrie', label='Person', props={'name': 'Carrie-Anne Moss', 'born': 1967}),\n",
" Node(id='Laurence', label='Person', props={'name': 'Laurence Fishburne', 'born': 1961}),\n",
" Node(id='Hugo', label='Person', props={'name': 'Hugo Weaving', 'born': 1960}),\n",
" Node(id='LillyW', label='Person', props={'name': 'Lilly Wachowski', 'born': 1967}),\n",
" Node(id='LanaW', label='Person', props={'name': 'Lana Wachowski', 'born': 1965}),\n",
" Node(id='JoelS', label='Person', props={'name': 'Joel Silver', 'born': 1952}),\n",
" Node(id='Emil', label='Person', props={'name': 'Emil Eifrem', 'born': 1978}),\n",
" Node(id='TheMatrixReloaded', label='Movie', props={'title': 'The Matrix Reloaded', 'released': 2003, 'tagline': 'Free your mind'})]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nodes[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ab66884a-5174-437c-91a2-d822366ba16b",
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame.from_records([{\"id\": node.id, \"label\": node.label, \"props\": node.props} for node in nodes]).set_index(\"id\").to_pickle(\"nodes.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "188e259e-b56f-4c81-9a5a-a23f63ba1146",
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame.from_records([{\"id\": edge.id, \"label\": edge.label, \"props\": edge.props} for edge in edges]).set_index(\"id\").to_pickle(\"edges.pkl\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "chap10",
"language": "python",
"name": "chap10"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading