From e4023c0348fc829a298980b0ef2aa595017a4e9c Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 30 Apr 2019 11:25:01 +0200 Subject: [PATCH 001/216] test of commitment in CLion --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index dfef3803..40b8661a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ recentMorphStoreProjectConf.log Dockerfile .DS_Store doc/doxygen/latex -doc/doxygen/html \ No newline at end of file +doc/doxygen/html +test \ No newline at end of file From b496864214014221873292702faaf7ce68a585ab Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 30 Apr 2019 11:29:20 +0200 Subject: [PATCH 002/216] remove test commitment in gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 40b8661a..c1de3bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,4 @@ recentMorphStoreProjectConf.log Dockerfile .DS_Store doc/doxygen/latex -doc/doxygen/html -test \ No newline at end of file +doc/doxygen/html \ No newline at end of file From 1272bd8a25131a828769f8122a236e3a73470558 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 May 2019 15:35:47 +0200 Subject: [PATCH 003/216] added graph.h with structs, cmakelists.txt --- include/core/storage/graph.h | 55 ++++++++++++++++++++++++++++++ test/graph/CMakeLists.txt | 15 ++++++++ test/graph/generate_ldbc_graph.cpp | 31 +++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 include/core/storage/graph.h create mode 100644 test/graph/CMakeLists.txt create mode 100644 test/graph/generate_ldbc_graph.cpp diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h new file mode 100644 index 00000000..3a0044b6 --- /dev/null +++ b/include/core/storage/graph.h @@ -0,0 +1,55 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph.h + * @brief Graph storage format -> adjacency Lists + * @todo Add property structure to Vertex and Edge + */ + +#ifndef MORPHSTORE_GRAPH_H +#define MORPHSTORE_GRAPH_H + +#include +#include + +namespace graph{ + + struct Vertex; + struct Edge; + + struct Vertex{ + uint64_t id; + uint64_t ldbc_id; + int entity; + vector adjList; + }; + + struct Edge{ + Vertex* target; + int relation; + }; + + struct graph{ + unordered_map vertices; + void addVertex(); + void addEdge; + }; + +} + +#endif //MORPHSTORE_GRAPH_H diff --git a/test/graph/CMakeLists.txt b/test/graph/CMakeLists.txt new file mode 100644 index 00000000..708bb9af --- /dev/null +++ b/test/graph/CMakeLists.txt @@ -0,0 +1,15 @@ +if ( CTEST_ALL OR CTEST_STORAGE ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/graph/ldbc_graph_test_app ) + + add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp ) + target_compile_options( ldbc_graph_test_app PRIVATE + -Werror + -Wall + -Wextra + -pedantic + -fstack-protector-all + $<$:-DDEBUG> ) + target_link_libraries( ldbc_graph_test_app PRIVATE "-ldl" ) + + add_test( ldbc_graph_test ldbc_graph_test_app ) +endif() \ No newline at end of file diff --git a/test/graph/generate_ldbc_graph.cpp b/test/graph/generate_ldbc_graph.cpp new file mode 100644 index 00000000..b535040f --- /dev/null +++ b/test/graph/generate_ldbc_graph.cpp @@ -0,0 +1,31 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file generate_ldbc_graph.cpp + * @brief Test for generating social network graph from LDBC files + * @todo TODOS? + */ + +#include + +int main( void ){ + + std::cout << "Generating LDBC social network ..." << endl; + std::cout.flush(); + return 0; +} \ No newline at end of file From 7160b742f51efb4945fd481a5235efdfdb5f43b2 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 May 2019 15:40:54 +0200 Subject: [PATCH 004/216] first test file with print --- test/graph/generate_ldbc_graph.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/graph/generate_ldbc_graph.cpp b/test/graph/generate_ldbc_graph.cpp index b535040f..bf596c23 100644 --- a/test/graph/generate_ldbc_graph.cpp +++ b/test/graph/generate_ldbc_graph.cpp @@ -27,5 +27,6 @@ int main( void ){ std::cout << "Generating LDBC social network ..." << endl; std::cout.flush(); + return 0; } \ No newline at end of file From 50c0446a4d122cc355acf63ab481ccaa03c6d2a5 Mon Sep 17 00:00:00 2001 From: Alexander Krause Date: Thu, 2 May 2019 15:50:34 +0200 Subject: [PATCH 005/216] Added subdir for graph test in cmakelists --- test/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8580fab2..f83e3c49 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,4 +5,6 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) +add_subdirectory( graph ) + add_subdirectory(vector) \ No newline at end of file From e1125ce4d909807341324230743f6ed8ce3b0eb2 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 May 2019 16:03:49 +0200 Subject: [PATCH 006/216] change dir of graph test file, cmakelists changes.. --- test/CMakeLists.txt | 2 +- test/{ => core/storage}/graph/CMakeLists.txt | 4 ++-- test/{ => core/storage}/graph/generate_ldbc_graph.cpp | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename test/{ => core/storage}/graph/CMakeLists.txt (71%) rename test/{ => core/storage}/graph/generate_ldbc_graph.cpp (100%) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f83e3c49..29695891 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,6 +5,6 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) -add_subdirectory( graph ) +add_subdirectory( core/storage/graph ) add_subdirectory(vector) \ No newline at end of file diff --git a/test/graph/CMakeLists.txt b/test/core/storage/graph/CMakeLists.txt similarity index 71% rename from test/graph/CMakeLists.txt rename to test/core/storage/graph/CMakeLists.txt index 708bb9af..335469f9 100644 --- a/test/graph/CMakeLists.txt +++ b/test/core/storage/graph/CMakeLists.txt @@ -1,7 +1,7 @@ if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/graph/ldbc_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc_graph_test_app ) - add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp ) + add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp) target_compile_options( ldbc_graph_test_app PRIVATE -Werror -Wall diff --git a/test/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp similarity index 100% rename from test/graph/generate_ldbc_graph.cpp rename to test/core/storage/graph/generate_ldbc_graph.cpp From 79c4c239323a2f2d66d08be140dc459a1c7cc665 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 May 2019 17:15:29 +0200 Subject: [PATCH 007/216] read LDBC files into dicts (intermediate dictionary) --- include/core/storage/graph.h | 10 +- .../storage/graph/generate_ldbc_graph.cpp | 218 +++++++++++++++++- 2 files changed, 220 insertions(+), 8 deletions(-) diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index 3a0044b6..5f7e6708 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -33,10 +33,10 @@ namespace graph{ struct Edge; struct Vertex{ - uint64_t id; - uint64_t ldbc_id; + unsigned long int id; + unsigned long int ldbc_id; int entity; - vector adjList; + std::vector adjList; }; struct Edge{ @@ -45,9 +45,9 @@ namespace graph{ }; struct graph{ - unordered_map vertices; + std::unordered_map vertices; void addVertex(); - void addEdge; + void addEdge(); }; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index bf596c23..908ba566 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -23,10 +23,222 @@ #include -int main( void ){ +#include +#include +#include +#include + +using namespace std; - std::cout << "Generating LDBC social network ..." << endl; +struct Relation{ + unsigned long int fromID; + unsigned long int toID; + int relID; +}; + +void importDataLookup(string address, unordered_map &rLookup){ + cout << "Reading Lookups from " << address; std::cout.flush(); - + + char* buffer; + ifstream data(address, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening + uint64_t fileSize = 0; + + if(!data){ + cerr << "\nError, opening file. "; + exit(EXIT_FAILURE); + } + + if (data.is_open()) { + fileSize = data.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + data.clear(); + data.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory with the filesize and the char size + buffer = (char*) malloc( fileSize * sizeof( char ) ); + data.read(buffer, fileSize); // read data as one big block + size_t start = 0; + string delimiter = "\t"; + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + + // get a row into string form buffer with start- and end-point and do stuff ... + string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != string::npos){ + row.erase(0,1); + } + + string relationName = row.substr(0, row.find(delimiter)); + row.erase(0, row.find(delimiter) + delimiter.length()); + string relID_str = row.substr(0, row.find(delimiter)); + + // convert string data to needed types + int relID = stoi(relID_str, nullptr, 10); + + // put into lookup data structure + rLookup.insert(make_pair(relID, relationName)); + + start = i; // set new starting point (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + data.close(); + + cout << " --> DONE" << endl; +} + +void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ + + cout << "Reading Vertices from " << vertexFile; + std::cout.flush(); + + char* buffer; + ifstream graph(vertexFile, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening + uint64_t fileSize = 0; + + if(!graph){ + cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + if (graph.is_open()) { + fileSize = graph.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + graph.clear(); + graph.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory with the filesize and the char size + buffer = (char*) malloc( fileSize * sizeof( char ) ); + graph.read(buffer, fileSize); // read data as one big block + size_t start = 0; + string delimiter = "\t"; + int entityIndex = 0; + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point and do stuff ... + string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != string::npos){ + row.erase(0,1); + } + + // for entities we have to look that there is NO '\t' in the string + if(row.find(delimiter) == string::npos){ + eLookup.insert(make_pair(entityIndex, row)); + entityIndex++; + }else{ + + // acutal data: first is ldbc_id and second global_id + string ldbc_str = row.substr(0, row.find(delimiter)); + string global_str = row.erase(0, row.find(delimiter) + delimiter.length()); // erase from row (...)\t[data] + + // convert string to long int + unsigned long int ldbc_id = stol(ldbc_str,nullptr,10); + unsigned long int global_id = stol(global_str,nullptr,10); + + vDict.insert({global_id, make_pair(entityIndex-1, ldbc_id)}); + + } + start = i; // set new starting point (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + graph.close(); + + cout << " --> DONE" << endl; +} + +void importDataRelations(string relationsFile, vector &rList){ + + cout << "Reading Relations from " << relationsFile; + std::cout.flush(); + + char* buffer; + ifstream graph(relationsFile, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening + uint64_t fileSize = 0; + + if(!graph){ + cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + if (graph.is_open()) { + fileSize = graph.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + graph.clear(); + graph.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory with the filesize and the char size + buffer = (char*) malloc( fileSize * sizeof( char ) ); + graph.read(buffer, fileSize); // read data as one big block + size_t start = 0; + string delimiter = "\t"; + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + + // get a row into string form buffer with start- and end-point and do stuff ... + string row(&buffer[start], &buffer[i]); + + string fromID_str = row.substr(0, row.find(delimiter)); + row.erase(0, row.find(delimiter) + delimiter.length()); + string toID_str = row.substr(0, row.find(delimiter)); + string relID_str = row.erase(0, row.find(delimiter) + delimiter.length()); + + // convert string data to needed types + unsigned long int fromID = stol(fromID_str,nullptr,10); + if(toID_str == "-1") toID_str = fromID_str; // if the toID is -1 --> loop to itself; refers to the multiple attributes + unsigned long int toID = stol(toID_str,nullptr,10); + int relID = stoi(relID_str, nullptr, 10); + + // write to relationDict data structure + Relation r; + r.fromID = fromID; + r.toID = toID; + r.relID = relID; + rList.push_back(r); + + start = i; // set new starting point (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + graph.close(); + + cout << " --> DONE" << endl; +} + +int main( void ){ + + // -------------------------------- Reading data from LDBC-tsv-files -------------------------------- + + // TODO: change intermediate results[] tsv -> [dicts] -> vertices to direct computation? + // Lookups for entity and relation: (e.g. (0 -> knows), (1 -> isLocatedIn), ... ) + unordered_map entityLookup; + unordered_map relationLookup; + + // Vertex data from tsv-files: unordered_map { global_id -> (entity.id, ldbc.id) } + unordered_map> vertexDict; + + // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) + vector relationDict; + + // TODO: get base directory with cin -> user input + string base = "/home/tim/Documents/TUD/(8) Informatik SS 2019/LDBC_Graph_Generating/LDBC_Python_Files/"; + + importDataLookup(base + "relationLookup.tsv", relationLookup); + importDataVertex(base + "entityDict.tsv", vertexDict, entityLookup); // entityLookup is built within the function automatically + importDataRelations(base + "relationDict.tsv", relationDict); + + // --------------------------------------- Generating the graph --------------------------------------- + return 0; } \ No newline at end of file From 934c2bd41ffb5dd6f0c4219ecad0c2d825200578 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 3 May 2019 16:16:21 +0200 Subject: [PATCH 008/216] added graph structure with class graph, vertex, edge; generated graph in test --- include/core/storage/edge.h | 65 ++++++++++++++ include/core/storage/graph.h | 85 ++++++++++++++---- include/core/storage/vertex.h | 88 +++++++++++++++++++ .../storage/graph/generate_ldbc_graph.cpp | 43 ++++++++- 4 files changed, 261 insertions(+), 20 deletions(-) create mode 100644 include/core/storage/edge.h create mode 100644 include/core/storage/vertex.h diff --git a/include/core/storage/edge.h b/include/core/storage/edge.h new file mode 100644 index 00000000..356bc6da --- /dev/null +++ b/include/core/storage/edge.h @@ -0,0 +1,65 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertex.h + * @brief edge class and its functions + * @todo Add data structure for property +*/ + +#ifndef MORPHSTORE_EDGE_H +#define MORPHSTORE_EDGE_H + +#include + +namespace graph{ + + class Vertex; + + class Edge{ + + private: + Vertex* target; + int relation; + + public: + Edge(Vertex* target, int relation){ + target = target; + relation = relation; + } + + Vertex* getTarget() const{ + return target; + } + + int getRelation() const{ + return relation; + } + + void setTarget(Vertex* targetVertex){ + this->target = targetVertex; + } + + void setRelation(int rel){ + this->relation = rel; + } + }; +} + + + +#endif //MORPHSTORE_EDGE_H diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index 5f7e6708..9d0090ba 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -24,30 +24,83 @@ #ifndef MORPHSTORE_GRAPH_H #define MORPHSTORE_GRAPH_H +#include + #include #include +#include + namespace graph{ - struct Vertex; - struct Edge; + class Graph{ - struct Vertex{ - unsigned long int id; - unsigned long int ldbc_id; - int entity; - std::vector adjList; - }; + private: + // mapping global id -> vertex + std::unordered_map vertices; - struct Edge{ - Vertex* target; - int relation; - }; + public: + + void addVertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + // if key is not present -> create vertex + if(vertices.find(id) == vertices.end()){ + Vertex v(id, ldbc_id, entity); + vertices.insert(std::make_pair(id, v)); + }else{ + std::cout << "Vertex with ID " << id << " already exists!"; + } + } + + void addEdge(unsigned long int sourceID, unsigned long int targetID, int relation){ + if(existID(vertices, sourceID) && existID(vertices, targetID)){ + Vertex* sourceV = &vertices.at(sourceID); + Vertex* targetV = &vertices.at(targetID); + Edge e(targetV, relation); + sourceV->addEdge(e); + }else{ + std::cout << "Source-/Target-Vertex-ID do not exist!"; + } + } + + // Function to check if the ID is present or not + bool existID(std::unordered_map& v, unsigned long int id){ + if(v.find(id) == v.end()){ + return false; + } + return true; + } + + int getTotalNumberOfEdges(){ + int totalNumberEdges = 0; + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + totalNumberEdges += it->second.getAdjList().size(); + } + return totalNumberEdges; + } + + void statistics(){ + std::cout << "---------------- Statistics ----------------" << std::endl; + std::cout << "Number of vertices: " << vertices.size() << std::endl; + std::cout << "Number of relations/edges: " << getTotalNumberOfEdges() << std::endl; + std::cout << "--------------------------------------------" << std::endl; + } + + void printVertexByID(unsigned long int id){ + Vertex* v = &vertices.at(id); + std::cout << "Vertex-ID: "<< v->getId() << std::endl; + std::cout << " LDBC-ID: "<< v->getLDBC_Id() << std::endl; + std::cout << " Entity-ID: "<< v->getEntity() << std::endl; + std::cout << " #Edges: " << v->getAdjList().size() << std::endl; + std::cout << " Adj.List: "; + + const std::vector& adjList = v->getAdjList(); + for(const auto& e : adjList){ + // TODO: print edges of vertex: (target, rel.id) -> Problem: (SEGFAULT) when doing e.getTarget()->getId() -> No access to target + std::cout << "(" << /* e.getTarget()->getId() << "," <<*/ e.getRelation() << ") "; + } + std::cout << "\n"; + } - struct graph{ - std::unordered_map vertices; - void addVertex(); - void addEdge(); }; } diff --git a/include/core/storage/vertex.h b/include/core/storage/vertex.h new file mode 100644 index 00000000..451afb84 --- /dev/null +++ b/include/core/storage/vertex.h @@ -0,0 +1,88 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertex.h + * @brief vertex class and its functions + * @todo Add data structure for properties +*/ + +#ifndef MORPHSTORE_VERTEX_H +#define MORPHSTORE_VERTEX_H + +#include + +#include +#include + + +namespace graph{ + + class Vertex{ + + private: + unsigned long int id; + unsigned long int ldbc_id; + int entity; + std::vector adjList; + + public: + + Vertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + SetVertex(id, ldbc_id, entity); + } + + void SetVertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + this->id = id; + this->ldbc_id = ldbc_id; + this->entity = entity; + } + + unsigned long int getId() const{ + return id; + } + + unsigned long int getLDBC_Id(){ + return ldbc_id; + } + + int getEntity(){ + return entity; + } + + const std::vector& getAdjList() const{ + return adjList; + } + + + void setEntity(int newEntity){ + entity = newEntity; + } + + bool deleteAdjList(){ + adjList.clear(); + if(adjList.size() == 0) return true; + return false; + } + + void addEdge(Edge e){ + this->adjList.push_back(e); + } + }; +} + +#endif //MORPHSTORE_VERTEX_H diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 908ba566..0a63204d 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -89,7 +89,7 @@ void importDataLookup(string address, unordered_map &rLookup){ delete[] buffer; // free memory data.close(); - cout << " --> DONE" << endl; + cout << " --> done" << endl; } void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ @@ -153,7 +153,7 @@ void importDataVertex(string vertexFile, unordered_map DONE" << endl; + cout << " --> done" << endl; } void importDataRelations(string relationsFile, vector &rList){ @@ -213,7 +213,36 @@ void importDataRelations(string relationsFile, vector &rList){ delete[] buffer; // free memory graph.close(); - cout << " --> DONE" << endl; + cout << " --> done" << endl; +} + +void generateVertices(unordered_map>& vertexDict, graph::Graph& g){ + + cout << "Generating Vertices ..."; + std::cout.flush(); + + // iterate through vertex-dict. and generate the vertices (objects) in the graph + for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ + unsigned long int id = it->first; + unsigned long int ldbc_id = it->second.second; + int entity = it->second.first; + g.addVertex(id, ldbc_id, entity); + } + + cout << " --> done" << endl; +} + +void generateEdges(vector& rDict, graph::Graph& g){ + + cout << "Generating Relations ..."; + std::cout.flush(); + + // iterate through relationDict and add (target.id, rel.id) to the vertex adj.-list + for(std::vector::iterator it = rDict.begin(); it != rDict.end(); ++it){ + g.addEdge(it->fromID, it->toID, it->relID); + } + + cout << " --> done" << endl; } int main( void ){ @@ -226,7 +255,7 @@ int main( void ){ unordered_map relationLookup; // Vertex data from tsv-files: unordered_map { global_id -> (entity.id, ldbc.id) } - unordered_map> vertexDict; + unordered_map> vertexDict; // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) vector relationDict; @@ -240,5 +269,11 @@ int main( void ){ // --------------------------------------- Generating the graph --------------------------------------- + graph::Graph ldbc_graph; + generateVertices(vertexDict, ldbc_graph); + generateEdges(relationDict, ldbc_graph); + ldbc_graph.printVertexByID(90563); + ldbc_graph.statistics(); + return 0; } \ No newline at end of file From f79e8ce71688b0364fb79f3d300a7790b10ebea3 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 3 May 2019 17:34:39 +0200 Subject: [PATCH 009/216] removed edge.h (edge is struct in vertex.h); fixed printing vertex infos --- include/core/storage/edge.h | 65 ------------------- include/core/storage/graph.h | 21 +++--- include/core/storage/vertex.h | 20 ++++-- .../storage/graph/generate_ldbc_graph.cpp | 3 +- 4 files changed, 28 insertions(+), 81 deletions(-) delete mode 100644 include/core/storage/edge.h diff --git a/include/core/storage/edge.h b/include/core/storage/edge.h deleted file mode 100644 index 356bc6da..00000000 --- a/include/core/storage/edge.h +++ /dev/null @@ -1,65 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file vertex.h - * @brief edge class and its functions - * @todo Add data structure for property -*/ - -#ifndef MORPHSTORE_EDGE_H -#define MORPHSTORE_EDGE_H - -#include - -namespace graph{ - - class Vertex; - - class Edge{ - - private: - Vertex* target; - int relation; - - public: - Edge(Vertex* target, int relation){ - target = target; - relation = relation; - } - - Vertex* getTarget() const{ - return target; - } - - int getRelation() const{ - return relation; - } - - void setTarget(Vertex* targetVertex){ - this->target = targetVertex; - } - - void setRelation(int rel){ - this->relation = rel; - } - }; -} - - - -#endif //MORPHSTORE_EDGE_H diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index 9d0090ba..d3f3a82a 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -55,10 +55,9 @@ namespace graph{ if(existID(vertices, sourceID) && existID(vertices, targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); - Edge e(targetV, relation); - sourceV->addEdge(e); + sourceV->addEdge(targetV, relation); }else{ - std::cout << "Source-/Target-Vertex-ID do not exist!"; + std::cout << "Source-/Target-Vertex-ID does not exist!"; } } @@ -73,7 +72,7 @@ namespace graph{ int getTotalNumberOfEdges(){ int totalNumberEdges = 0; for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - totalNumberEdges += it->second.getAdjList().size(); + totalNumberEdges += it->second.getNumberOfEdges(); } return totalNumberEdges; } @@ -86,17 +85,17 @@ namespace graph{ } void printVertexByID(unsigned long int id){ + std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); - std::cout << "Vertex-ID: "<< v->getId() << std::endl; - std::cout << " LDBC-ID: "<< v->getLDBC_Id() << std::endl; - std::cout << " Entity-ID: "<< v->getEntity() << std::endl; - std::cout << " #Edges: " << v->getAdjList().size() << std::endl; - std::cout << " Adj.List: "; + std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; + std::cout << "LDBC-ID: \t"<< v->getLDBC_Id() << std::endl; + std::cout << "Entity-ID: \t"<< v->getEntity() << std::endl; + std::cout << "#Edges: \t" << v->getAdjList().size() << std::endl; + std::cout << "Adj.List: "; const std::vector& adjList = v->getAdjList(); for(const auto& e : adjList){ - // TODO: print edges of vertex: (target, rel.id) -> Problem: (SEGFAULT) when doing e.getTarget()->getId() -> No access to target - std::cout << "(" << /* e.getTarget()->getId() << "," <<*/ e.getRelation() << ") "; + std::cout << "(" << e.target->getId() << "," << e.relation << ") "; } std::cout << "\n"; } diff --git a/include/core/storage/vertex.h b/include/core/storage/vertex.h index 451afb84..3189327e 100644 --- a/include/core/storage/vertex.h +++ b/include/core/storage/vertex.h @@ -17,21 +17,26 @@ /** * @file vertex.h - * @brief vertex class and its functions + * @brief vertex class and its functions + Edge struct * @todo Add data structure for properties */ #ifndef MORPHSTORE_VERTEX_H #define MORPHSTORE_VERTEX_H -#include - #include #include namespace graph{ + class Vertex; + + struct Edge{ + Vertex* target; + int relation; + }; + class Vertex{ private: @@ -79,9 +84,16 @@ namespace graph{ return false; } - void addEdge(Edge e){ + void addEdge(Vertex* target, int rel){ + Edge e; + e.relation = rel; + e.target = target; this->adjList.push_back(e); } + + int getNumberOfEdges(){ + return adjList.size(); + } }; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 0a63204d..1e6575a2 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -272,7 +272,8 @@ int main( void ){ graph::Graph ldbc_graph; generateVertices(vertexDict, ldbc_graph); generateEdges(relationDict, ldbc_graph); - ldbc_graph.printVertexByID(90563); + + //ldbc_graph.printVertexByID(90563); ldbc_graph.statistics(); return 0; From 29b77fd65677cbd61784065986fdd19b216aef14 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 3 May 2019 17:52:46 +0200 Subject: [PATCH 010/216] added some comments... --- include/core/storage/graph.h | 17 +++++++++++------ include/core/storage/vertex.h | 11 +++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index d3f3a82a..ced63124 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -36,14 +36,15 @@ namespace graph{ class Graph{ private: - // mapping global id -> vertex + // main data structure: mapping global id -> vertex std::unordered_map vertices; public: + // function to add a new (ldbc) vertex to the graph void addVertex(unsigned long int id, unsigned long int ldbc_id, int entity){ // if key is not present -> create vertex - if(vertices.find(id) == vertices.end()){ + if(existID(id)){ Vertex v(id, ldbc_id, entity); vertices.insert(std::make_pair(id, v)); }else{ @@ -51,8 +52,9 @@ namespace graph{ } } + // function that creates a new relation/edge between two (existing) vertices void addEdge(unsigned long int sourceID, unsigned long int targetID, int relation){ - if(existID(vertices, sourceID) && existID(vertices, targetID)){ + if(existID(sourceID) && existID(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); sourceV->addEdge(targetV, relation); @@ -61,14 +63,15 @@ namespace graph{ } } - // Function to check if the ID is present or not - bool existID(std::unordered_map& v, unsigned long int id){ - if(v.find(id) == v.end()){ + // function to check if the ID is present or not + bool existID(unsigned long int id){ + if(vertices.find(id) == vertices.end()){ return false; } return true; } + // this function returns the total number of edges in the graph int getTotalNumberOfEdges(){ int totalNumberEdges = 0; for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ @@ -77,6 +80,7 @@ namespace graph{ return totalNumberEdges; } + // for debbuging void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << vertices.size() << std::endl; @@ -84,6 +88,7 @@ namespace graph{ std::cout << "--------------------------------------------" << std::endl; } + // for debugging void printVertexByID(unsigned long int id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); diff --git a/include/core/storage/vertex.h b/include/core/storage/vertex.h index 3189327e..de1c4f88 100644 --- a/include/core/storage/vertex.h +++ b/include/core/storage/vertex.h @@ -32,6 +32,7 @@ namespace graph{ class Vertex; + // this struct represents a relation to a target vertex; relation is the number in the lookup table struct Edge{ Vertex* target; int relation; @@ -40,6 +41,7 @@ namespace graph{ class Vertex{ private: + // Vertex contains a (global) id; (old) ldbc id; entity number for lookup; vector adjList for the adjacency List unsigned long int id; unsigned long int ldbc_id; int entity; @@ -47,6 +49,7 @@ namespace graph{ public: + // constrcutor without the adjList (Vertex can contain no edges int the graph) Vertex(unsigned long int id, unsigned long int ldbc_id, int entity){ SetVertex(id, ldbc_id, entity); } @@ -69,6 +72,7 @@ namespace graph{ return entity; } + // returns a reference (read-only) of the adjacency list const std::vector& getAdjList() const{ return adjList; } @@ -78,12 +82,7 @@ namespace graph{ entity = newEntity; } - bool deleteAdjList(){ - adjList.clear(); - if(adjList.size() == 0) return true; - return false; - } - + // function to add new neighbor vertex void addEdge(Vertex* target, int rel){ Edge e; e.relation = rel; From c7205e70a2841b815f5e1b215b255843c2450bd8 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 6 May 2019 13:06:16 +0200 Subject: [PATCH 011/216] changed unsigned long int's to size_t --- include/core/storage/graph.h | 17 +++++---- include/core/storage/vertex.h | 13 +++---- .../storage/graph/generate_ldbc_graph.cpp | 35 ++++++++++--------- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index ced63124..fca99f02 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -37,14 +37,14 @@ namespace graph{ private: // main data structure: mapping global id -> vertex - std::unordered_map vertices; + std::unordered_map vertices; public: // function to add a new (ldbc) vertex to the graph - void addVertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + void addVertex(size_t id, size_t ldbc_id, int entity){ // if key is not present -> create vertex - if(existID(id)){ + if(!existID(id)){ Vertex v(id, ldbc_id, entity); vertices.insert(std::make_pair(id, v)); }else{ @@ -53,7 +53,7 @@ namespace graph{ } // function that creates a new relation/edge between two (existing) vertices - void addEdge(unsigned long int sourceID, unsigned long int targetID, int relation){ + void addEdge(size_t sourceID, size_t targetID, int relation){ if(existID(sourceID) && existID(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -64,7 +64,7 @@ namespace graph{ } // function to check if the ID is present or not - bool existID(unsigned long int id){ + bool existID(size_t id){ if(vertices.find(id) == vertices.end()){ return false; } @@ -73,8 +73,8 @@ namespace graph{ // this function returns the total number of edges in the graph int getTotalNumberOfEdges(){ - int totalNumberEdges = 0; - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size_t totalNumberEdges = 0; + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ totalNumberEdges += it->second.getNumberOfEdges(); } return totalNumberEdges; @@ -89,7 +89,7 @@ namespace graph{ } // for debugging - void printVertexByID(unsigned long int id){ + void printVertexByID(size_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; @@ -104,7 +104,6 @@ namespace graph{ } std::cout << "\n"; } - }; } diff --git a/include/core/storage/vertex.h b/include/core/storage/vertex.h index de1c4f88..31d50a29 100644 --- a/include/core/storage/vertex.h +++ b/include/core/storage/vertex.h @@ -42,29 +42,30 @@ namespace graph{ private: // Vertex contains a (global) id; (old) ldbc id; entity number for lookup; vector adjList for the adjacency List - unsigned long int id; - unsigned long int ldbc_id; + size_t id; + // TODO: remove ldbc_id from Vertex schema (to get more general structure without ldbc-dependency) + size_t ldbc_id; int entity; std::vector adjList; public: // constrcutor without the adjList (Vertex can contain no edges int the graph) - Vertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + Vertex(size_t id, size_t ldbc_id, int entity){ SetVertex(id, ldbc_id, entity); } - void SetVertex(unsigned long int id, unsigned long int ldbc_id, int entity){ + void SetVertex(size_t id, size_t ldbc_id, int entity){ this->id = id; this->ldbc_id = ldbc_id; this->entity = entity; } - unsigned long int getId() const{ + size_t getId() const{ return id; } - unsigned long int getLDBC_Id(){ + size_t getLDBC_Id(){ return ldbc_id; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 1e6575a2..c64dd21f 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -27,17 +27,18 @@ #include #include #include +#include using namespace std; struct Relation{ - unsigned long int fromID; - unsigned long int toID; + size_t fromID; + size_t toID; int relID; }; void importDataLookup(string address, unordered_map &rLookup){ - cout << "Reading Lookups from " << address; + cout << "Reading LDBC-Lookups ..."; std::cout.flush(); char* buffer; @@ -92,9 +93,9 @@ void importDataLookup(string address, unordered_map &rLookup){ cout << " --> done" << endl; } -void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ +void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ - cout << "Reading Vertices from " << vertexFile; + cout << "Reading LDBC-Vertices ..."; std::cout.flush(); char* buffer; @@ -139,9 +140,9 @@ void importDataVertex(string vertexFile, unordered_map size_t + size_t ldbc_id = stoul(ldbc_str,nullptr,10); + size_t global_id = stoul(global_str,nullptr,10); vDict.insert({global_id, make_pair(entityIndex-1, ldbc_id)}); @@ -158,7 +159,7 @@ void importDataVertex(string vertexFile, unordered_map &rList){ - cout << "Reading Relations from " << relationsFile; + cout << "Reading LDBC-Relations ..."; std::cout.flush(); char* buffer; @@ -194,9 +195,9 @@ void importDataRelations(string relationsFile, vector &rList){ string relID_str = row.erase(0, row.find(delimiter) + delimiter.length()); // convert string data to needed types - unsigned long int fromID = stol(fromID_str,nullptr,10); + size_t fromID = stoul(fromID_str,nullptr,10); if(toID_str == "-1") toID_str = fromID_str; // if the toID is -1 --> loop to itself; refers to the multiple attributes - unsigned long int toID = stol(toID_str,nullptr,10); + size_t toID = stoul(toID_str,nullptr,10); int relID = stoi(relID_str, nullptr, 10); // write to relationDict data structure @@ -216,15 +217,15 @@ void importDataRelations(string relationsFile, vector &rList){ cout << " --> done" << endl; } -void generateVertices(unordered_map>& vertexDict, graph::Graph& g){ +void generateVertices(unordered_map>& vertexDict, graph::Graph& g){ cout << "Generating Vertices ..."; std::cout.flush(); // iterate through vertex-dict. and generate the vertices (objects) in the graph - for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ - unsigned long int id = it->first; - unsigned long int ldbc_id = it->second.second; + for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ + size_t id = it->first; + size_t ldbc_id = it->second.second; int entity = it->second.first; g.addVertex(id, ldbc_id, entity); } @@ -249,13 +250,13 @@ int main( void ){ // -------------------------------- Reading data from LDBC-tsv-files -------------------------------- - // TODO: change intermediate results[] tsv -> [dicts] -> vertices to direct computation? + // TODO: change intermediate results[] tsv -> [dicts] -> vertices to direct computation? (but then we lose the ldbc_id, if we remove it in Vertex class) // Lookups for entity and relation: (e.g. (0 -> knows), (1 -> isLocatedIn), ... ) unordered_map entityLookup; unordered_map relationLookup; // Vertex data from tsv-files: unordered_map { global_id -> (entity.id, ldbc.id) } - unordered_map> vertexDict; + unordered_map> vertexDict; // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) vector relationDict; From f8ec509c36c716cff5c761d592f829b568c90f24 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 13 May 2019 10:36:08 +0200 Subject: [PATCH 012/216] changed namespace 'graph -> morphstore'; replace size_t with uint64_t --- include/core/storage/graph.h | 16 +++++------ include/core/storage/vertex.h | 14 +++++----- .../storage/graph/generate_ldbc_graph.cpp | 28 +++++++++---------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/include/core/storage/graph.h b/include/core/storage/graph.h index fca99f02..89b825df 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph.h @@ -31,18 +31,18 @@ #include -namespace graph{ +namespace morphstore{ class Graph{ private: // main data structure: mapping global id -> vertex - std::unordered_map vertices; + std::unordered_map vertices; public: // function to add a new (ldbc) vertex to the graph - void addVertex(size_t id, size_t ldbc_id, int entity){ + void addVertex(uint64_t id, uint64_t ldbc_id, int entity){ // if key is not present -> create vertex if(!existID(id)){ Vertex v(id, ldbc_id, entity); @@ -53,7 +53,7 @@ namespace graph{ } // function that creates a new relation/edge between two (existing) vertices - void addEdge(size_t sourceID, size_t targetID, int relation){ + void addEdge(uint64_t sourceID, uint64_t targetID, int relation){ if(existID(sourceID) && existID(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -64,7 +64,7 @@ namespace graph{ } // function to check if the ID is present or not - bool existID(size_t id){ + bool existID(uint64_t id){ if(vertices.find(id) == vertices.end()){ return false; } @@ -73,8 +73,8 @@ namespace graph{ // this function returns the total number of edges in the graph int getTotalNumberOfEdges(){ - size_t totalNumberEdges = 0; - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + uint64_t totalNumberEdges = 0; + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ totalNumberEdges += it->second.getNumberOfEdges(); } return totalNumberEdges; @@ -89,7 +89,7 @@ namespace graph{ } // for debugging - void printVertexByID(size_t id){ + void printVertexByID(uint64_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; diff --git a/include/core/storage/vertex.h b/include/core/storage/vertex.h index 31d50a29..9a9043cf 100644 --- a/include/core/storage/vertex.h +++ b/include/core/storage/vertex.h @@ -28,7 +28,7 @@ #include -namespace graph{ +namespace morphstore{ class Vertex; @@ -42,30 +42,30 @@ namespace graph{ private: // Vertex contains a (global) id; (old) ldbc id; entity number for lookup; vector adjList for the adjacency List - size_t id; + uint64_t id; // TODO: remove ldbc_id from Vertex schema (to get more general structure without ldbc-dependency) - size_t ldbc_id; + uint64_t ldbc_id; int entity; std::vector adjList; public: // constrcutor without the adjList (Vertex can contain no edges int the graph) - Vertex(size_t id, size_t ldbc_id, int entity){ + Vertex(uint64_t id, uint64_t ldbc_id, int entity){ SetVertex(id, ldbc_id, entity); } - void SetVertex(size_t id, size_t ldbc_id, int entity){ + void SetVertex(uint64_t id, uint64_t ldbc_id, int entity){ this->id = id; this->ldbc_id = ldbc_id; this->entity = entity; } - size_t getId() const{ + uint64_t getId() const{ return id; } - size_t getLDBC_Id(){ + uint64_t getLDBC_Id(){ return ldbc_id; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index c64dd21f..366ea1f4 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -32,8 +32,8 @@ using namespace std; struct Relation{ - size_t fromID; - size_t toID; + uint64_t fromID; + uint64_t toID; int relID; }; @@ -93,7 +93,7 @@ void importDataLookup(string address, unordered_map &rLookup){ cout << " --> done" << endl; } -void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ +void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ cout << "Reading LDBC-Vertices ..."; std::cout.flush(); @@ -141,8 +141,8 @@ void importDataVertex(string vertexFile, unordered_map size_t - size_t ldbc_id = stoul(ldbc_str,nullptr,10); - size_t global_id = stoul(global_str,nullptr,10); + uint64_t ldbc_id = stoul(ldbc_str,nullptr,10); + uint64_t global_id = stoul(global_str,nullptr,10); vDict.insert({global_id, make_pair(entityIndex-1, ldbc_id)}); @@ -195,9 +195,9 @@ void importDataRelations(string relationsFile, vector &rList){ string relID_str = row.erase(0, row.find(delimiter) + delimiter.length()); // convert string data to needed types - size_t fromID = stoul(fromID_str,nullptr,10); + uint64_t fromID = stoul(fromID_str,nullptr,10); if(toID_str == "-1") toID_str = fromID_str; // if the toID is -1 --> loop to itself; refers to the multiple attributes - size_t toID = stoul(toID_str,nullptr,10); + uint64_t toID = stoul(toID_str,nullptr,10); int relID = stoi(relID_str, nullptr, 10); // write to relationDict data structure @@ -217,15 +217,15 @@ void importDataRelations(string relationsFile, vector &rList){ cout << " --> done" << endl; } -void generateVertices(unordered_map>& vertexDict, graph::Graph& g){ +void generateVertices(unordered_map>& vertexDict, morphstore::Graph& g){ cout << "Generating Vertices ..."; std::cout.flush(); // iterate through vertex-dict. and generate the vertices (objects) in the graph - for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ - size_t id = it->first; - size_t ldbc_id = it->second.second; + for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ + uint64_t id = it->first; + uint64_t ldbc_id = it->second.second; int entity = it->second.first; g.addVertex(id, ldbc_id, entity); } @@ -233,7 +233,7 @@ void generateVertices(unordered_map>& vertexDict, grap cout << " --> done" << endl; } -void generateEdges(vector& rDict, graph::Graph& g){ +void generateEdges(vector& rDict, morphstore::Graph& g){ cout << "Generating Relations ..."; std::cout.flush(); @@ -256,7 +256,7 @@ int main( void ){ unordered_map relationLookup; // Vertex data from tsv-files: unordered_map { global_id -> (entity.id, ldbc.id) } - unordered_map> vertexDict; + unordered_map> vertexDict; // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) vector relationDict; @@ -270,7 +270,7 @@ int main( void ){ // --------------------------------------- Generating the graph --------------------------------------- - graph::Graph ldbc_graph; + morphstore::Graph ldbc_graph; generateVertices(vertexDict, ldbc_graph); generateEdges(relationDict, ldbc_graph); From 6f5c62c40ab7f21ece8c6d16d986f97e129be051 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 13 May 2019 10:40:48 +0200 Subject: [PATCH 013/216] new directory in Engine/core/storage --- include/core/storage/{ => graph}/graph.h | 2 +- include/core/storage/{ => graph}/vertex.h | 0 test/core/storage/graph/generate_ldbc_graph.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename include/core/storage/{ => graph}/graph.h (99%) rename include/core/storage/{ => graph}/vertex.h (100%) diff --git a/include/core/storage/graph.h b/include/core/storage/graph/graph.h similarity index 99% rename from include/core/storage/graph.h rename to include/core/storage/graph/graph.h index 89b825df..ae65615d 100644 --- a/include/core/storage/graph.h +++ b/include/core/storage/graph/graph.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_GRAPH_H #define MORPHSTORE_GRAPH_H -#include +#include #include #include diff --git a/include/core/storage/vertex.h b/include/core/storage/graph/vertex.h similarity index 100% rename from include/core/storage/vertex.h rename to include/core/storage/graph/vertex.h diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 366ea1f4..a9aa5b3d 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -21,7 +21,7 @@ * @todo TODOS? */ -#include +#include #include #include From f5da8f5377806233ce774a921dbecd7a5c05969c Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 14 May 2019 16:26:13 +0200 Subject: [PATCH 014/216] started ldbc_import.h; changed cmake to handle experimental::filesystem --- include/core/storage/graph/graph.h | 20 ++-- include/core/storage/graph/ldbc_import.h | 94 +++++++++++++++++++ include/core/storage/graph/vertex.h | 8 +- test/core/storage/graph/CMakeLists.txt | 4 +- .../storage/graph/generate_ldbc_graph.cpp | 11 ++- 5 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 include/core/storage/graph/ldbc_import.h diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index ae65615d..1f21d473 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -42,9 +42,9 @@ namespace morphstore{ public: // function to add a new (ldbc) vertex to the graph - void addVertex(uint64_t id, uint64_t ldbc_id, int entity){ + void add_vertex(uint64_t id, uint64_t ldbc_id, int entity){ // if key is not present -> create vertex - if(!existID(id)){ + if(!exist_id(id)){ Vertex v(id, ldbc_id, entity); vertices.insert(std::make_pair(id, v)); }else{ @@ -53,18 +53,18 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices - void addEdge(uint64_t sourceID, uint64_t targetID, int relation){ - if(existID(sourceID) && existID(targetID)){ + void add_edge(uint64_t sourceID, uint64_t targetID, int relation){ + if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); - sourceV->addEdge(targetV, relation); + sourceV->add_edge(targetV, relation); }else{ std::cout << "Source-/Target-Vertex-ID does not exist!"; } } // function to check if the ID is present or not - bool existID(uint64_t id){ + bool exist_id(const uint64_t id){ if(vertices.find(id) == vertices.end()){ return false; } @@ -72,10 +72,10 @@ namespace morphstore{ } // this function returns the total number of edges in the graph - int getTotalNumberOfEdges(){ + int get_total_number_of_edges(){ uint64_t totalNumberEdges = 0; for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - totalNumberEdges += it->second.getNumberOfEdges(); + totalNumberEdges += it->second.get_number_of_edges(); } return totalNumberEdges; } @@ -84,12 +84,12 @@ namespace morphstore{ void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << vertices.size() << std::endl; - std::cout << "Number of relations/edges: " << getTotalNumberOfEdges() << std::endl; + std::cout << "Number of relations/edges: " << get_total_number_of_edges() << std::endl; std::cout << "--------------------------------------------" << std::endl; } // for debugging - void printVertexByID(uint64_t id){ + void print_vertex_by_id(uint64_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h new file mode 100644 index 00000000..35c9df94 --- /dev/null +++ b/include/core/storage/graph/ldbc_import.h @@ -0,0 +1,94 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file ldbc_import.h + * @brief this class reads the ldbc files and generates the graph + * @todo Any TODOS? +*/ + +#ifndef MORPHSTORE_LDBC_IMPORT_H +#define MORPHSTORE_LDBC_IMPORT_H + +#include +#include +#include +#include + +namespace morphstore{ + + class LDBC_Import{ + + private: + std::string directory; + std::vector verticesPaths; + std::vector relationsPaths; + + + public: + // constructor + LDBC_Import(std::string dir){ + directory = dir; + insert_file_names(dir); + } + + void insert_file_names(std::string dir){ + for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ + // ignore files starting with a '.' + if(entry.path().string()[dir.size()] == '.'){ + continue; + }else{ + // insert file path to vertices or relations vector + differentiate(entry.path().string(), dir); + } + } + } + + // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector + void differentiate(std::string path, std::string dir){ + // if the string contains a '_' -> it's a relation file; otherwise a vertex file + // remove dir name to remain only the *.csv + if(path.substr(dir.size()).find('_') != std::string::npos ){ + relationsPaths.push_back(path); + }else{ + verticesPaths.push_back(path); + } + } + + // for debugging + void print_file_names(){ + std::cout << "Vertices-Files: " << std::endl; + for(auto& v : verticesPaths){ + std::cout << "\t" << v << std::endl; + } + + std::cout << "Relations-Files: " << std::endl; + for(auto& rel : relationsPaths){ + std::cout << "\t" << rel << std::endl; + } + + } + + + + }; + + + +} + +#endif //MORPHSTORE_LDBC_IMPORT_H diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 9a9043cf..fc8e28ab 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -26,6 +26,7 @@ #include #include +#include namespace morphstore{ @@ -48,6 +49,9 @@ namespace morphstore{ int entity; std::vector adjList; + // properties + std::unordered_map properties; + public: // constrcutor without the adjList (Vertex can contain no edges int the graph) @@ -84,14 +88,14 @@ namespace morphstore{ } // function to add new neighbor vertex - void addEdge(Vertex* target, int rel){ + void add_edge(Vertex *target, int rel){ Edge e; e.relation = rel; e.target = target; this->adjList.push_back(e); } - int getNumberOfEdges(){ + int get_number_of_edges(){ return adjList.size(); } }; diff --git a/test/core/storage/graph/CMakeLists.txt b/test/core/storage/graph/CMakeLists.txt index 335469f9..ad5c2387 100644 --- a/test/core/storage/graph/CMakeLists.txt +++ b/test/core/storage/graph/CMakeLists.txt @@ -1,7 +1,7 @@ if ( CTEST_ALL OR CTEST_STORAGE ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc_graph_test_app ) - add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp) + add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp ) target_compile_options( ldbc_graph_test_app PRIVATE -Werror -Wall @@ -9,7 +9,7 @@ if ( CTEST_ALL OR CTEST_STORAGE ) -pedantic -fstack-protector-all $<$:-DDEBUG> ) - target_link_libraries( ldbc_graph_test_app PRIVATE "-ldl" ) + target_link_libraries( ldbc_graph_test_app PRIVATE "-ldl" stdc++fs) add_test( ldbc_graph_test ldbc_graph_test_app ) endif() \ No newline at end of file diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index a9aa5b3d..a75efbde 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -22,6 +22,7 @@ */ #include +#include #include #include @@ -31,6 +32,7 @@ using namespace std; + struct Relation{ uint64_t fromID; uint64_t toID; @@ -227,7 +229,7 @@ void generateVertices(unordered_map>& vertexDict, uint64_t id = it->first; uint64_t ldbc_id = it->second.second; int entity = it->second.first; - g.addVertex(id, ldbc_id, entity); + g.add_vertex(id, ldbc_id, entity); } cout << " --> done" << endl; @@ -240,7 +242,7 @@ void generateEdges(vector& rDict, morphstore::Graph& g){ // iterate through relationDict and add (target.id, rel.id) to the vertex adj.-list for(std::vector::iterator it = rDict.begin(); it != rDict.end(); ++it){ - g.addEdge(it->fromID, it->toID, it->relID); + g.add_edge(it->fromID, it->toID, it->relID); } cout << " --> done" << endl; @@ -277,5 +279,10 @@ int main( void ){ //ldbc_graph.printVertexByID(90563); ldbc_graph.statistics(); + // NEW LDBC-IMPORT TEST + morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); + ldbcImport.print_file_names(); + + return 0; } \ No newline at end of file From 11afae82231fd4d414d0bdbacc082847c0d47188 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 14 May 2019 16:38:31 +0200 Subject: [PATCH 015/216] little changes... --- include/core/storage/graph/ldbc_import.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 35c9df94..0debbc33 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -72,12 +72,12 @@ namespace morphstore{ // for debugging void print_file_names(){ std::cout << "Vertices-Files: " << std::endl; - for(auto& v : verticesPaths){ + for(const auto& v : verticesPaths){ std::cout << "\t" << v << std::endl; } std::cout << "Relations-Files: " << std::endl; - for(auto& rel : relationsPaths){ + for(const auto& rel : relationsPaths){ std::cout << "\t" << rel << std::endl; } From 3100ea9fe495ad7829392b523a97610a94605334 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 May 2019 13:39:23 +0200 Subject: [PATCH 016/216] buffer allocation for vertex files --- include/core/storage/graph/ldbc_import.h | 97 ++++++++++++++++++- .../storage/graph/generate_ldbc_graph.cpp | 5 + 2 files changed, 98 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 0debbc33..6c28ca14 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include namespace morphstore{ @@ -41,9 +43,13 @@ namespace morphstore{ public: // constructor - LDBC_Import(std::string dir){ + LDBC_Import(const std::string& dir){ directory = dir; - insert_file_names(dir); + insert_file_names(directory); + } + + std::string getDirectory() const{ + return directory; } void insert_file_names(std::string dir){ @@ -69,6 +75,91 @@ namespace morphstore{ } } + // This function generates the the vertices from the vertex-vector + void generate_Vertices(){ + // data structure for attributes: entity -> (attributes), e.g. tagclass -> (id, name, url) + std::unordered_map> attributes; + + if(!verticesPaths.empty()) { + std::cout << "Generating LDBC-Vertices ..." << std::endl; + std::cout.flush(); + + // (1) calculate global size to allocate + for (const auto &address : verticesPaths) { + + // get the entity from address ([...path...] / [entity-name].csv) and put key into attributes map + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + attributes[entity]; + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + if (vertexFile.is_open()) { + fileSize = vertexFile.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // (2) allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; + + // (3) do actual work with data in buffer ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point and do stuff ... + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + // handle first line of *.csv: contains the attributes; first attribute is ldbc_id -> important for edge-generation + if(row.rfind("id", 0) == 0){ + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + size_t last = 0; + size_t next = 0; + while ((next = row.find(delimiter, last)) != std::string::npos){ + attributes[entity].push_back(row.substr(last, next-last)); + last = next + 1; + } + // last attribute + attributes[entity].push_back(row.substr(last)); + }else{ + // (4) generate vertex with properties and write to graph + } + + start = i; // set new starting point (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + vertexFile.close(); + } + + // Verify + for( std::unordered_map >::const_iterator ptr=attributes.begin(); ptr!=attributes.end(); ptr++) { + std::cout << ptr->first << ": "; + for( std::vector::const_iterator eptr=ptr->second.begin(); eptr!=ptr->second.end(); eptr++){ + std::cout << *eptr << " "; + } + std::cout << std::endl; + } + + } + } + // for debugging void print_file_names(){ std::cout << "Vertices-Files: " << std::endl; @@ -83,8 +174,6 @@ namespace morphstore{ } - - }; diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index a75efbde..92f3f3a3 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -266,10 +266,12 @@ int main( void ){ // TODO: get base directory with cin -> user input string base = "/home/tim/Documents/TUD/(8) Informatik SS 2019/LDBC_Graph_Generating/LDBC_Python_Files/"; + /* importDataLookup(base + "relationLookup.tsv", relationLookup); importDataVertex(base + "entityDict.tsv", vertexDict, entityLookup); // entityLookup is built within the function automatically importDataRelations(base + "relationDict.tsv", relationDict); + // --------------------------------------- Generating the graph --------------------------------------- morphstore::Graph ldbc_graph; @@ -279,9 +281,12 @@ int main( void ){ //ldbc_graph.printVertexByID(90563); ldbc_graph.statistics(); + */ + // NEW LDBC-IMPORT TEST morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); ldbcImport.print_file_names(); + ldbcImport.generate_Vertices(); return 0; From a17146f5793dea1a7e864a81f7336948c9eb7adf Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 May 2019 15:19:36 +0200 Subject: [PATCH 017/216] vertex data structure for intermediate results in ldbc importer --- include/core/storage/graph/ldbc_import.h | 38 +++++++++++++------ .../storage/graph/generate_ldbc_graph.cpp | 2 +- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 6c28ca14..050848c4 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -39,6 +39,7 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; + std::unordered_map> vertices; // intermediate data structure for vertices public: @@ -87,9 +88,14 @@ namespace morphstore{ // (1) calculate global size to allocate for (const auto &address : verticesPaths) { + // data strcuture for attributes e.g. taglass hsa id, name, url + std::vector attributes; + // get the entity from address ([...path...] / [entity-name].csv) and put key into attributes map std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - attributes[entity]; + + std::cout << "\t{Processing " + entity + ".csv}" << std::endl; + std::cout.flush(); char* buffer; @@ -131,13 +137,30 @@ namespace morphstore{ size_t last = 0; size_t next = 0; while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes[entity].push_back(row.substr(last, next-last)); + attributes.push_back(row.substr(last, next-last)); last = next + 1; } // last attribute - attributes[entity].push_back(row.substr(last)); + attributes.push_back(row.substr(last)); }else{ // (4) generate vertex with properties and write to graph + // (4) generate vertex with properties and write to graph + std::unordered_map properties; + + size_t last = 0; + size_t next = 0; + size_t attrIndex = 0; + while ((next = row.find(delimiter, last)) != std::string::npos){ + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); + last = next + 1; + attrIndex++; + } + // last attribute + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + // add entity + properties.insert(std::make_pair("entity", entity)); + vertices.insert(std::make_pair( std::stoull(row.substr(0, row.find(delimiter))), properties)); + properties.clear(); } start = i; // set new starting point (otherwise it's concatenated) @@ -148,15 +171,6 @@ namespace morphstore{ vertexFile.close(); } - // Verify - for( std::unordered_map >::const_iterator ptr=attributes.begin(); ptr!=attributes.end(); ptr++) { - std::cout << ptr->first << ": "; - for( std::vector::const_iterator eptr=ptr->second.begin(); eptr!=ptr->second.end(); eptr++){ - std::cout << *eptr << " "; - } - std::cout << std::endl; - } - } } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 92f3f3a3..92f77f14 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -285,7 +285,7 @@ int main( void ){ // NEW LDBC-IMPORT TEST morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); - ldbcImport.print_file_names(); + //ldbcImport.print_file_names(); ldbcImport.generate_Vertices(); From cab04464344e9874383843c8c8d5b6092ee67bea Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 May 2019 17:22:54 +0200 Subject: [PATCH 018/216] first attempt of loading ldbc-vertex-data into graph structure (i.e generate vertices) -> works --- include/core/storage/graph/graph.h | 17 ++- include/core/storage/graph/ldbc_import.h | 28 ++++- include/core/storage/graph/vertex.h | 21 ++-- .../storage/graph/generate_ldbc_graph.cpp | 108 +----------------- 4 files changed, 52 insertions(+), 122 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 1f21d473..3132d0d4 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -42,10 +42,22 @@ namespace morphstore{ public: // function to add a new (ldbc) vertex to the graph - void add_vertex(uint64_t id, uint64_t ldbc_id, int entity){ + void add_vertex(uint64_t id, uint64_t ldbc_id){ // if key is not present -> create vertex if(!exist_id(id)){ - Vertex v(id, ldbc_id, entity); + Vertex v(id, ldbc_id); + vertices.insert(std::make_pair(id, v)); + }else{ + std::cout << "Vertex with ID " << id << " already exists!"; + } + } + + // function to add a new (ldbc) vertex to the graph + void add_vertex_with_properties(uint64_t id, uint64_t ldbc_id, std::unordered_map& props ){ + // if key is not present -> create vertex + if(!exist_id(id)){ + Vertex v(id, ldbc_id); + v.setProperties(props); vertices.insert(std::make_pair(id, v)); }else{ std::cout << "Vertex with ID " << id << " already exists!"; @@ -94,7 +106,6 @@ namespace morphstore{ Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; std::cout << "LDBC-ID: \t"<< v->getLDBC_Id() << std::endl; - std::cout << "Entity-ID: \t"<< v->getEntity() << std::endl; std::cout << "#Edges: \t" << v->getAdjList().size() << std::endl; std::cout << "Adj.List: "; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 050848c4..df802702 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -39,10 +39,11 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - std::unordered_map> vertices; // intermediate data structure for vertices + std::unordered_map> verticesMap; // intermediate data structure for vertices public: + // constructor LDBC_Import(const std::string& dir){ directory = dir; @@ -77,7 +78,7 @@ namespace morphstore{ } // This function generates the the vertices from the vertex-vector - void generate_Vertices(){ + void read_data_vertices(){ // data structure for attributes: entity -> (attributes), e.g. tagclass -> (id, name, url) std::unordered_map> attributes; @@ -143,8 +144,7 @@ namespace morphstore{ // last attribute attributes.push_back(row.substr(last)); }else{ - // (4) generate vertex with properties and write to graph - // (4) generate vertex with properties and write to graph + // (4) get properties from buffer/row and store them std::unordered_map properties; size_t last = 0; @@ -159,7 +159,8 @@ namespace morphstore{ properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); // add entity properties.insert(std::make_pair("entity", entity)); - vertices.insert(std::make_pair( std::stoull(row.substr(0, row.find(delimiter))), properties)); + + verticesMap.insert(std::make_pair( std::stoull(row.substr(0, row.find(delimiter))), properties)); properties.clear(); } @@ -174,6 +175,13 @@ namespace morphstore{ } } + void generate_vertices_in_graph(Graph& graph){ + for(const auto& v : verticesMap){ + std::unordered_map props = verticesMap.at(v.first); + graph.add_vertex_with_properties(v.first, v.first, props); + } + } + // for debugging void print_file_names(){ std::cout << "Vertices-Files: " << std::endl; @@ -188,6 +196,16 @@ namespace morphstore{ } + // debugging + void printVertexAt(uint64_t ldbc_id){ + std::unordered_map searchedObject = verticesMap.at(ldbc_id); + std::cout << "Vertex={ ldbc_id=" << ldbc_id << " "; + for(const auto& attr : searchedObject) { + std::cout << attr.first << "=" << attr.second << " "; + } + std::cout << " }" << std::endl; + } + }; diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index fc8e28ab..989196f4 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -46,7 +46,6 @@ namespace morphstore{ uint64_t id; // TODO: remove ldbc_id from Vertex schema (to get more general structure without ldbc-dependency) uint64_t ldbc_id; - int entity; std::vector adjList; // properties @@ -55,14 +54,13 @@ namespace morphstore{ public: // constrcutor without the adjList (Vertex can contain no edges int the graph) - Vertex(uint64_t id, uint64_t ldbc_id, int entity){ - SetVertex(id, ldbc_id, entity); + Vertex(uint64_t id, uint64_t ldbc_id){ + SetVertex(id, ldbc_id); } - void SetVertex(uint64_t id, uint64_t ldbc_id, int entity){ + void SetVertex(uint64_t id, uint64_t ldbc_id){ this->id = id; this->ldbc_id = ldbc_id; - this->entity = entity; } uint64_t getId() const{ @@ -73,18 +71,17 @@ namespace morphstore{ return ldbc_id; } - int getEntity(){ - return entity; - } - // returns a reference (read-only) of the adjacency list const std::vector& getAdjList() const{ return adjList; } - - void setEntity(int newEntity){ - entity = newEntity; + void setProperties(std::unordered_map& properties){ + if(!properties.empty()){ + this->properties = properties; + }else{ + std::cout << "The properties-list is empty!" << std::endl; + } } // function to add new neighbor vertex diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 92f77f14..d4b1e148 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -95,70 +95,6 @@ void importDataLookup(string address, unordered_map &rLookup){ cout << " --> done" << endl; } -void importDataVertex(string vertexFile, unordered_map> &vDict, unordered_map &eLookup){ - - cout << "Reading LDBC-Vertices ..."; - std::cout.flush(); - - char* buffer; - ifstream graph(vertexFile, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening - uint64_t fileSize = 0; - - if(!graph){ - cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - if (graph.is_open()) { - fileSize = graph.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - graph.clear(); - graph.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory with the filesize and the char size - buffer = (char*) malloc( fileSize * sizeof( char ) ); - graph.read(buffer, fileSize); // read data as one big block - size_t start = 0; - string delimiter = "\t"; - int entityIndex = 0; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point and do stuff ... - string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != string::npos){ - row.erase(0,1); - } - - // for entities we have to look that there is NO '\t' in the string - if(row.find(delimiter) == string::npos){ - eLookup.insert(make_pair(entityIndex, row)); - entityIndex++; - }else{ - - // acutal data: first is ldbc_id and second global_id - string ldbc_str = row.substr(0, row.find(delimiter)); - string global_str = row.erase(0, row.find(delimiter) + delimiter.length()); // erase from row (...)\t[data] - - // convert string to long int // TODO: is stoul enough for string -> size_t - uint64_t ldbc_id = stoul(ldbc_str,nullptr,10); - uint64_t global_id = stoul(global_str,nullptr,10); - - vDict.insert({global_id, make_pair(entityIndex-1, ldbc_id)}); - - } - start = i; // set new starting point (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - graph.close(); - - cout << " --> done" << endl; -} - void importDataRelations(string relationsFile, vector &rList){ cout << "Reading LDBC-Relations ..."; @@ -219,22 +155,6 @@ void importDataRelations(string relationsFile, vector &rList){ cout << " --> done" << endl; } -void generateVertices(unordered_map>& vertexDict, morphstore::Graph& g){ - - cout << "Generating Vertices ..."; - std::cout.flush(); - - // iterate through vertex-dict. and generate the vertices (objects) in the graph - for(std::unordered_map>::iterator it = vertexDict.begin(); it != vertexDict.end(); ++it){ - uint64_t id = it->first; - uint64_t ldbc_id = it->second.second; - int entity = it->second.first; - g.add_vertex(id, ldbc_id, entity); - } - - cout << " --> done" << endl; -} - void generateEdges(vector& rDict, morphstore::Graph& g){ cout << "Generating Relations ..."; @@ -263,31 +183,15 @@ int main( void ){ // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) vector relationDict; + // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ // TODO: get base directory with cin -> user input - string base = "/home/tim/Documents/TUD/(8) Informatik SS 2019/LDBC_Graph_Generating/LDBC_Python_Files/"; - - /* - importDataLookup(base + "relationLookup.tsv", relationLookup); - importDataVertex(base + "entityDict.tsv", vertexDict, entityLookup); // entityLookup is built within the function automatically - importDataRelations(base + "relationDict.tsv", relationDict); - - - // --------------------------------------- Generating the graph --------------------------------------- - - morphstore::Graph ldbc_graph; - generateVertices(vertexDict, ldbc_graph); - generateEdges(relationDict, ldbc_graph); - - //ldbc_graph.printVertexByID(90563); - ldbc_graph.statistics(); - - */ - - // NEW LDBC-IMPORT TEST morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); //ldbcImport.print_file_names(); - ldbcImport.generate_Vertices(); - + //ldbcImport.read_data_vertices(); + //ldbcImport.printVertexAt(2199024637094); + morphstore::Graph socialGraph; + ldbcImport.generate_vertices_in_graph(socialGraph); + socialGraph.statistics(); return 0; } \ No newline at end of file From 5c652bf0cc841b79a52c37bdf58a7c33e3c10cc7 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 17 May 2019 12:29:14 +0200 Subject: [PATCH 019/216] little changes; added comments --- include/core/storage/graph/graph.h | 16 +-- include/core/storage/graph/ldbc_import.h | 115 +++++++++++++----- include/core/storage/graph/vertex.h | 14 ++- .../storage/graph/generate_ldbc_graph.cpp | 82 +------------ 4 files changed, 110 insertions(+), 117 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 3132d0d4..be35d83a 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -48,7 +48,7 @@ namespace morphstore{ Vertex v(id, ldbc_id); vertices.insert(std::make_pair(id, v)); }else{ - std::cout << "Vertex with ID " << id << " already exists!"; + std::cout << "Vertex with ID " << id << " already exists in the database!"; } } @@ -57,21 +57,21 @@ namespace morphstore{ // if key is not present -> create vertex if(!exist_id(id)){ Vertex v(id, ldbc_id); - v.setProperties(props); + v.set_properties(props); vertices.insert(std::make_pair(id, v)); }else{ - std::cout << "Vertex with ID " << id << " already exists!"; + std::cout << "Vertex with ID " << id << " already exists in the database!"; } } // function that creates a new relation/edge between two (existing) vertices - void add_edge(uint64_t sourceID, uint64_t targetID, int relation){ + void add_edge(uint64_t sourceID, uint64_t targetID, std::string rel){ if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); - sourceV->add_edge(targetV, relation); + sourceV->add_edge(targetV, rel); }else{ - std::cout << "Source-/Target-Vertex-ID does not exist!"; + std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; } } @@ -106,10 +106,10 @@ namespace morphstore{ Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; std::cout << "LDBC-ID: \t"<< v->getLDBC_Id() << std::endl; - std::cout << "#Edges: \t" << v->getAdjList().size() << std::endl; + std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; std::cout << "Adj.List: "; - const std::vector& adjList = v->getAdjList(); + const std::vector& adjList = v->get_adjList(); for(const auto& e : adjList){ std::cout << "(" << e.target->getId() << "," << e.relation << ") "; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index df802702..7ca828f4 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -39,7 +38,7 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - std::unordered_map> verticesMap; // intermediate data structure for vertices + std::unordered_map> verticesMap; // intermediate results public: @@ -54,6 +53,7 @@ namespace morphstore{ return directory; } + // function which iterates through directory to receive file names (entire path) void insert_file_names(std::string dir){ for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ // ignore files starting with a '.' @@ -77,27 +77,22 @@ namespace morphstore{ } } - // This function generates the the vertices from the vertex-vector + // this function reads the vertices-files and write it to the intermediate map verticesMap void read_data_vertices(){ - // data structure for attributes: entity -> (attributes), e.g. tagclass -> (id, name, url) - std::unordered_map> attributes; if(!verticesPaths.empty()) { - std::cout << "Generating LDBC-Vertices ..." << std::endl; + std::cout << "Generating LDBC-Vertices ..."; std::cout.flush(); - // (1) calculate global size to allocate + // iterate through vector of vertex-addresses for (const auto &address : verticesPaths) { - // data strcuture for attributes e.g. taglass hsa id, name, url + // data structure for attributes of entity, e.g. taglass -> id, name, url std::vector attributes; - // get the entity from address ([...path...] / [entity-name].csv) and put key into attributes map + // get the entity from address ([...path...] / [entity-name].csv) std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - std::cout << "\t{Processing " + entity + ".csv}" << std::endl; - std::cout.flush(); - char* buffer; uint64_t fileSize = 0; @@ -109,22 +104,23 @@ namespace morphstore{ exit(EXIT_FAILURE); } + // calculate file size if (vertexFile.is_open()) { fileSize = vertexFile.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. vertexFile.clear(); vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } - // (2) allocate memory + // allocate memory buffer = (char*) malloc( fileSize * sizeof( char ) ); vertexFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; - // (3) do actual work with data in buffer ... + // read buffer and do the magic ... for(size_t i = 0; i < fileSize; ++i){ if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point and do stuff ... + // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); // remove unnecessary '\n' at the beginning of a string @@ -132,11 +128,11 @@ namespace morphstore{ row.erase(0,1); } - // handle first line of *.csv: contains the attributes; first attribute is ldbc_id -> important for edge-generation + size_t last = 0; + size_t next = 0; + // first line of *.csv contains the attributes -> write to attributes vector if(row.rfind("id", 0) == 0){ // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - size_t last = 0; - size_t next = 0; while ((next = row.find(delimiter, last)) != std::string::npos){ attributes.push_back(row.substr(last, next-last)); last = next + 1; @@ -144,42 +140,105 @@ namespace morphstore{ // last attribute attributes.push_back(row.substr(last)); }else{ - // (4) get properties from buffer/row and store them + // actual data: write to intermediate properties map std::unordered_map properties; - - size_t last = 0; - size_t next = 0; size_t attrIndex = 0; while ((next = row.find(delimiter, last)) != std::string::npos){ properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); last = next + 1; - attrIndex++; + ++attrIndex; } // last attribute properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); // add entity properties.insert(std::make_pair("entity", entity)); - + //insert into main importer data structure verticesMap.insert(std::make_pair( std::stoull(row.substr(0, row.find(delimiter))), properties)); - properties.clear(); + properties.clear(); // free memory } - start = i; // set new starting point (otherwise it's concatenated) + start = i; // set new starting point for buffer (otherwise it's concatenated) } } delete[] buffer; // free memory vertexFile.close(); } - + std::cout << " --> done" << std::endl; } } + // function which generates the vertices to a given graph void generate_vertices_in_graph(Graph& graph){ + // for every vertex in the intermediate verticesMap, get properties map and insert into graph for(const auto& v : verticesMap){ std::unordered_map props = verticesMap.at(v.first); graph.add_vertex_with_properties(v.first, v.first, props); } + // clear vector + verticesMap.clear(); + } + + // this function reads the relation-files and write it to the intermediate map verticesMap + void read_data_edges(){ + + if(!verticesPaths.empty()) { + std::cout << "Generating LDBC-Edges ..."; + std::cout.flush(); + + // iterate through vector of vertex-addresses + for (const auto &address : relationsPaths) { + + // data structure for attributes of entity, e.g. taglass -> id, name, url + std::vector attributes; + + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (vertexFile.is_open()) { + fileSize = vertexFile.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + vertexFile.close(); + } + std::cout << " --> done" << std::endl; + } } // for debugging @@ -197,7 +256,7 @@ namespace morphstore{ } // debugging - void printVertexAt(uint64_t ldbc_id){ + void print_vertex_at(uint64_t ldbc_id){ std::unordered_map searchedObject = verticesMap.at(ldbc_id); std::cout << "Vertex={ ldbc_id=" << ldbc_id << " "; for(const auto& attr : searchedObject) { diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 989196f4..6e89f6aa 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -36,7 +36,7 @@ namespace morphstore{ // this struct represents a relation to a target vertex; relation is the number in the lookup table struct Edge{ Vertex* target; - int relation; + std::string relation; }; class Vertex{ @@ -72,11 +72,11 @@ namespace morphstore{ } // returns a reference (read-only) of the adjacency list - const std::vector& getAdjList() const{ + const std::vector& get_adjList() const{ return adjList; } - void setProperties(std::unordered_map& properties){ + void set_properties(std::unordered_map &properties){ if(!properties.empty()){ this->properties = properties; }else{ @@ -85,13 +85,17 @@ namespace morphstore{ } // function to add new neighbor vertex - void add_edge(Vertex *target, int rel){ + void add_edge(Vertex *target, std::string relation){ Edge e; - e.relation = rel; e.target = target; + e.relation = relation; this->adjList.push_back(e); } + void add_edge_with_property(){ + // TODO + } + int get_number_of_edges(){ return adjList.size(); } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index d4b1e148..820f97ae 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -33,67 +33,7 @@ using namespace std; -struct Relation{ - uint64_t fromID; - uint64_t toID; - int relID; -}; - -void importDataLookup(string address, unordered_map &rLookup){ - cout << "Reading LDBC-Lookups ..."; - std::cout.flush(); - - char* buffer; - ifstream data(address, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening - uint64_t fileSize = 0; - - if(!data){ - cerr << "\nError, opening file. "; - exit(EXIT_FAILURE); - } - - if (data.is_open()) { - fileSize = data.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - data.clear(); - data.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory with the filesize and the char size - buffer = (char*) malloc( fileSize * sizeof( char ) ); - data.read(buffer, fileSize); // read data as one big block - size_t start = 0; - string delimiter = "\t"; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - - // get a row into string form buffer with start- and end-point and do stuff ... - string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != string::npos){ - row.erase(0,1); - } - - string relationName = row.substr(0, row.find(delimiter)); - row.erase(0, row.find(delimiter) + delimiter.length()); - string relID_str = row.substr(0, row.find(delimiter)); - - // convert string data to needed types - int relID = stoi(relID_str, nullptr, 10); - - // put into lookup data structure - rLookup.insert(make_pair(relID, relationName)); - - start = i; // set new starting point (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - data.close(); - - cout << " --> done" << endl; -} +/* void importDataRelations(string relationsFile, vector &rList){ @@ -168,27 +108,17 @@ void generateEdges(vector& rDict, morphstore::Graph& g){ cout << " --> done" << endl; } -int main( void ){ - - // -------------------------------- Reading data from LDBC-tsv-files -------------------------------- - - // TODO: change intermediate results[] tsv -> [dicts] -> vertices to direct computation? (but then we lose the ldbc_id, if we remove it in Vertex class) - // Lookups for entity and relation: (e.g. (0 -> knows), (1 -> isLocatedIn), ... ) - unordered_map entityLookup; - unordered_map relationLookup; - - // Vertex data from tsv-files: unordered_map { global_id -> (entity.id, ldbc.id) } - unordered_map> vertexDict; + */ - // Relationship data from tsv-files: vector of struct Relation (fromID, ToID, rel.id) - vector relationDict; +int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + // TODO: get base directory with cin -> user input morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); //ldbcImport.print_file_names(); - //ldbcImport.read_data_vertices(); - //ldbcImport.printVertexAt(2199024637094); + ldbcImport.read_data_vertices(); + //ldbcImport.print_vertex_at(2199024637094); morphstore::Graph socialGraph; ldbcImport.generate_vertices_in_graph(socialGraph); socialGraph.statistics(); From a197aad96e62dc9beb785e082021b2590be417fc Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 17 May 2019 15:00:53 +0200 Subject: [PATCH 020/216] added hash function for pair as key in unordered map --- include/core/storage/graph/graph.h | 26 ++++++-------- include/core/storage/graph/ldbc_import.h | 35 +++++++++++++------ include/core/storage/graph/vertex.h | 20 +++-------- .../storage/graph/generate_ldbc_graph.cpp | 4 +-- 4 files changed, 43 insertions(+), 42 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index be35d83a..9bde6396 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -42,25 +42,22 @@ namespace morphstore{ public: // function to add a new (ldbc) vertex to the graph - void add_vertex(uint64_t id, uint64_t ldbc_id){ - // if key is not present -> create vertex - if(!exist_id(id)){ - Vertex v(id, ldbc_id); - vertices.insert(std::make_pair(id, v)); + void add_vertex(const Vertex& v){ + if(!exist_id(v.getId())){ + Vertex v; + vertices.insert(std::make_pair(v.getId(), v)); }else{ - std::cout << "Vertex with ID " << id << " already exists in the database!"; + std::cout << "Vertex with ID " << v.getId() << " already exists in the database!"; } } // function to add a new (ldbc) vertex to the graph - void add_vertex_with_properties(uint64_t id, uint64_t ldbc_id, std::unordered_map& props ){ - // if key is not present -> create vertex - if(!exist_id(id)){ - Vertex v(id, ldbc_id); + void add_vertex_with_properties(Vertex& v, std::unordered_map& props ){ + if(!exist_id(v.getId())){ v.set_properties(props); - vertices.insert(std::make_pair(id, v)); - }else{ - std::cout << "Vertex with ID " << id << " already exists in the database!"; + vertices.insert(std::make_pair(v.getId(), v)); + } else{ + std::cout << "Vertex with ID " << v.getId() << " already exists in the database!"; } } @@ -89,7 +86,7 @@ namespace morphstore{ for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ totalNumberEdges += it->second.get_number_of_edges(); } - return totalNumberEdges; + return static_cast(totalNumberEdges); } // for debbuging @@ -105,7 +102,6 @@ namespace morphstore{ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; - std::cout << "LDBC-ID: \t"<< v->getLDBC_Id() << std::endl; std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; std::cout << "Adj.List: "; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 7ca828f4..1b4c180f 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -30,6 +30,17 @@ #include #include +// hash function used to hash a pair of any kind using XOR (for verticesMap) +struct hash_pair { + template + size_t operator()(const std::pair& p) const + { + auto hash1 = std::hash{}(p.first); + auto hash2 = std::hash{}(p.second); + return hash1 ^ hash2; + } +}; + namespace morphstore{ class LDBC_Import{ @@ -38,7 +49,9 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - std::unordered_map> verticesMap; // intermediate results + + // intermediate vertex data structure: (entity, ldbc_id) -> properties [key is pair because we have to handle the local ids -> identification] + std::unordered_map< std::pair , std::unordered_map, hash_pair> verticesMap; public: @@ -81,7 +94,7 @@ namespace morphstore{ void read_data_vertices(){ if(!verticesPaths.empty()) { - std::cout << "Generating LDBC-Vertices ..."; + std::cout << "Reading LDBC-Vertices ..."; std::cout.flush(); // iterate through vector of vertex-addresses @@ -106,7 +119,7 @@ namespace morphstore{ // calculate file size if (vertexFile.is_open()) { - fileSize = vertexFile.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. vertexFile.clear(); vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } @@ -153,7 +166,7 @@ namespace morphstore{ // add entity properties.insert(std::make_pair("entity", entity)); //insert into main importer data structure - verticesMap.insert(std::make_pair( std::stoull(row.substr(0, row.find(delimiter))), properties)); + verticesMap.insert({{entity, row.substr(0, row.find(delimiter))}, properties}); properties.clear(); // free memory } @@ -171,9 +184,10 @@ namespace morphstore{ // function which generates the vertices to a given graph void generate_vertices_in_graph(Graph& graph){ // for every vertex in the intermediate verticesMap, get properties map and insert into graph - for(const auto& v : verticesMap){ - std::unordered_map props = verticesMap.at(v.first); - graph.add_vertex_with_properties(v.first, v.first, props); + for(const auto& vertex : verticesMap){ + std::unordered_map props = verticesMap.at(vertex.first); + Vertex v; + graph.add_vertex_with_properties(v, props); } // clear vector verticesMap.clear(); @@ -208,7 +222,7 @@ namespace morphstore{ // calculate file size if (vertexFile.is_open()) { - fileSize = vertexFile.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. vertexFile.clear(); vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } @@ -256,8 +270,9 @@ namespace morphstore{ } // debugging - void print_vertex_at(uint64_t ldbc_id){ - std::unordered_map searchedObject = verticesMap.at(ldbc_id); + void print_vertex_at(std::string entity, std::string ldbc_id){ + std::pair key = {entity, ldbc_id}; + std::unordered_map searchedObject = verticesMap.at(key); std::cout << "Vertex={ ldbc_id=" << ldbc_id << " "; for(const auto& attr : searchedObject) { std::cout << attr.first << "=" << attr.second << " "; diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 6e89f6aa..ab35542f 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -44,33 +44,23 @@ namespace morphstore{ private: // Vertex contains a (global) id; (old) ldbc id; entity number for lookup; vector adjList for the adjacency List uint64_t id; - // TODO: remove ldbc_id from Vertex schema (to get more general structure without ldbc-dependency) - uint64_t ldbc_id; std::vector adjList; - // properties std::unordered_map properties; public: // constrcutor without the adjList (Vertex can contain no edges int the graph) - Vertex(uint64_t id, uint64_t ldbc_id){ - SetVertex(id, ldbc_id); - } - - void SetVertex(uint64_t id, uint64_t ldbc_id){ - this->id = id; - this->ldbc_id = ldbc_id; + Vertex(){ + // unique ID generation + static uint64_t startID = 0; + id = startID++; } uint64_t getId() const{ return id; } - uint64_t getLDBC_Id(){ - return ldbc_id; - } - // returns a reference (read-only) of the adjacency list const std::vector& get_adjList() const{ return adjList; @@ -97,7 +87,7 @@ namespace morphstore{ } int get_number_of_edges(){ - return adjList.size(); + return static_cast(adjList.size()); } }; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 820f97ae..385aed58 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -118,9 +118,9 @@ int main( void ){ morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); //ldbcImport.print_file_names(); ldbcImport.read_data_vertices(); - //ldbcImport.print_vertex_at(2199024637094); + morphstore::Graph socialGraph; - ldbcImport.generate_vertices_in_graph(socialGraph); + //ldbcImport.generate_vertices_in_graph(socialGraph); socialGraph.statistics(); return 0; From dfcecb01bbee8a98768fcfad507f5261232e6f54 Mon Sep 17 00:00:00 2001 From: Tim Date: Sat, 18 May 2019 12:18:23 +0200 Subject: [PATCH 021/216] added global id lookup when ldbc generating --- include/core/storage/graph/ldbc_import.h | 41 ++++++------------- .../storage/graph/generate_ldbc_graph.cpp | 6 +-- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 1b4c180f..b8736fa5 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -50,8 +50,8 @@ namespace morphstore{ std::vector verticesPaths; std::vector relationsPaths; - // intermediate vertex data structure: (entity, ldbc_id) -> properties [key is pair because we have to handle the local ids -> identification] - std::unordered_map< std::pair , std::unordered_map, hash_pair> verticesMap; + // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id + std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; public: @@ -90,8 +90,8 @@ namespace morphstore{ } } - // this function reads the vertices-files and write it to the intermediate map verticesMap - void read_data_vertices(){ + // this function reads the vertices-files and creates vertices in a graph + void generate_vertices(Graph &graph){ if(!verticesPaths.empty()) { std::cout << "Reading LDBC-Vertices ..."; @@ -156,6 +156,7 @@ namespace morphstore{ // actual data: write to intermediate properties map std::unordered_map properties; size_t attrIndex = 0; + std::string ldbcID = row.substr(0, row.find(delimiter)); while ((next = row.find(delimiter, last)) != std::string::npos){ properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); last = next + 1; @@ -165,8 +166,13 @@ namespace morphstore{ properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); // add entity properties.insert(std::make_pair("entity", entity)); - //insert into main importer data structure - verticesMap.insert({{entity, row.substr(0, row.find(delimiter))}, properties}); + //----------------------------------------------------- + // create vertex and insert into graph with properties + Vertex v; + graph.add_vertex_with_properties(v, properties); + // map entity and ldbc id to system generated id + globalIdLookupMap.insert({{entity, ldbcID}, v.getId()}); + //----------------------------------------------------- properties.clear(); // free memory } @@ -181,18 +187,6 @@ namespace morphstore{ } } - // function which generates the vertices to a given graph - void generate_vertices_in_graph(Graph& graph){ - // for every vertex in the intermediate verticesMap, get properties map and insert into graph - for(const auto& vertex : verticesMap){ - std::unordered_map props = verticesMap.at(vertex.first); - Vertex v; - graph.add_vertex_with_properties(v, props); - } - // clear vector - verticesMap.clear(); - } - // this function reads the relation-files and write it to the intermediate map verticesMap void read_data_edges(){ @@ -269,17 +263,6 @@ namespace morphstore{ } - // debugging - void print_vertex_at(std::string entity, std::string ldbc_id){ - std::pair key = {entity, ldbc_id}; - std::unordered_map searchedObject = verticesMap.at(key); - std::cout << "Vertex={ ldbc_id=" << ldbc_id << " "; - for(const auto& attr : searchedObject) { - std::cout << attr.first << "=" << attr.second << " "; - } - std::cout << " }" << std::endl; - } - }; diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 385aed58..811307ee 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -116,10 +116,10 @@ int main( void ){ // TODO: get base directory with cin -> user input morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); - //ldbcImport.print_file_names(); - ldbcImport.read_data_vertices(); - morphstore::Graph socialGraph; + + //ldbcImport.print_file_names(); + ldbcImport.generate_vertices(socialGraph); //ldbcImport.generate_vertices_in_graph(socialGraph); socialGraph.statistics(); From d5b72bb567013a0bf272d4b111ba857c51ee169b Mon Sep 17 00:00:00 2001 From: Tim Date: Sat, 18 May 2019 13:35:13 +0200 Subject: [PATCH 022/216] little changes; started relation-import functionality (to be continued) --- include/core/storage/graph/ldbc_import.h | 47 ++++++++++++-- .../storage/graph/generate_ldbc_graph.cpp | 63 ------------------- 2 files changed, 41 insertions(+), 69 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index b8736fa5..44a74f07 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -29,6 +29,7 @@ #include #include #include +#include // hash function used to hash a pair of any kind using XOR (for verticesMap) struct hash_pair { @@ -49,7 +50,7 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - + std::vector entities; // for the multi-value attributes (lookup) // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; @@ -91,10 +92,10 @@ namespace morphstore{ } // this function reads the vertices-files and creates vertices in a graph - void generate_vertices(Graph &graph){ + void generate_vertices(morphstore::Graph &graph){ if(!verticesPaths.empty()) { - std::cout << "Reading LDBC-Vertices ..."; + std::cout << "Generating LDBC-Vertices ..."; std::cout.flush(); // iterate through vector of vertex-addresses @@ -143,8 +144,9 @@ namespace morphstore{ size_t last = 0; size_t next = 0; + // first line of *.csv contains the attributes -> write to attributes vector - if(row.rfind("id", 0) == 0){ + if(start == 0){ // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector while ((next = row.find(delimiter, last)) != std::string::npos){ attributes.push_back(row.substr(last, next-last)); @@ -182,13 +184,25 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); + // insert entity into vector + entities.push_back(entity); } std::cout << " --> done" << std::endl; } } - // this function reads the relation-files and write it to the intermediate map verticesMap - void read_data_edges(){ + // function which returns true, if parameter is a entity in ldbc-files + bool isEntity(const std::string& entity){ + // iterate through entities vector to look up for paramater + if (std::find(entities.begin(), entities.end(), entity) != entities.end()){ + return true; + } + return false; + } + + + // this function reads the relation-files and generates edges in graph + void generate_edges(/*morphstore::Graph& graph*/){ if(!verticesPaths.empty()) { std::cout << "Generating LDBC-Edges ..."; @@ -238,6 +252,26 @@ namespace morphstore{ row.erase(0,1); } + size_t last = 0; + size_t next = 0; + + // TODO: continue here {read first line and check if its relation or multi attribute file + generate edges in graph} + // first line of *.csv: Differentiate whether it's + // (1) relation without properties: e.g. Person.id|Person.id -> number = 2 + // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> number = 3 + // (3) multiple attribute: e.g. Person.id|email -> number = 2 + isEntity("email") == false + if(start == 0){ + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos){ + attributes.push_back(row.substr(last, next-last)); + last = next + 1; + } + // last attribute + attributes.push_back(row.substr(last)); + }else{ + // actual data ... do stuff here + } + start = i; // set new starting point for buffer (otherwise it's concatenated) } } @@ -245,6 +279,7 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); } + globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory std::cout << " --> done" << std::endl; } } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 811307ee..179fe85b 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -32,69 +32,7 @@ using namespace std; - /* - -void importDataRelations(string relationsFile, vector &rList){ - - cout << "Reading LDBC-Relations ..."; - std::cout.flush(); - - char* buffer; - ifstream graph(relationsFile, std::ios::binary | std::ios::ate ); // 'ate' means: open and seek to end immediately after opening - uint64_t fileSize = 0; - - if(!graph){ - cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - if (graph.is_open()) { - fileSize = graph.tellg(); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - graph.clear(); - graph.seekg( 0, std::ios::beg ); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory with the filesize and the char size - buffer = (char*) malloc( fileSize * sizeof( char ) ); - graph.read(buffer, fileSize); // read data as one big block - size_t start = 0; - string delimiter = "\t"; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - - // get a row into string form buffer with start- and end-point and do stuff ... - string row(&buffer[start], &buffer[i]); - - string fromID_str = row.substr(0, row.find(delimiter)); - row.erase(0, row.find(delimiter) + delimiter.length()); - string toID_str = row.substr(0, row.find(delimiter)); - string relID_str = row.erase(0, row.find(delimiter) + delimiter.length()); - - // convert string data to needed types - uint64_t fromID = stoul(fromID_str,nullptr,10); - if(toID_str == "-1") toID_str = fromID_str; // if the toID is -1 --> loop to itself; refers to the multiple attributes - uint64_t toID = stoul(toID_str,nullptr,10); - int relID = stoi(relID_str, nullptr, 10); - - // write to relationDict data structure - Relation r; - r.fromID = fromID; - r.toID = toID; - r.relID = relID; - rList.push_back(r); - - start = i; // set new starting point (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - graph.close(); - - cout << " --> done" << endl; -} - void generateEdges(vector& rDict, morphstore::Graph& g){ cout << "Generating Relations ..."; @@ -120,7 +58,6 @@ int main( void ){ //ldbcImport.print_file_names(); ldbcImport.generate_vertices(socialGraph); - //ldbcImport.generate_vertices_in_graph(socialGraph); socialGraph.statistics(); return 0; From 24915f7503f265b8c56788f068f6df2f2f5a258d Mon Sep 17 00:00:00 2001 From: Tim Date: Sat, 18 May 2019 13:40:33 +0200 Subject: [PATCH 023/216] forgotten comments and little changes --- include/core/storage/graph/ldbc_import.h | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 44a74f07..0f151afa 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -204,7 +204,7 @@ namespace morphstore{ // this function reads the relation-files and generates edges in graph void generate_edges(/*morphstore::Graph& graph*/){ - if(!verticesPaths.empty()) { + if(!relationsPaths.empty()) { std::cout << "Generating LDBC-Edges ..."; std::cout.flush(); @@ -214,30 +214,33 @@ namespace morphstore{ // data structure for attributes of entity, e.g. taglass -> id, name, url std::vector attributes; - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string fromEntity = relation.substr(0, relation.find('_')); + std::string relationName = relation.substr(fromEntity.size() + 1, relation.find('_') - 1); + std::string toEntity = relation.substr(fromEntity.size() + relationName.size() + 2, relation.find('_')); char* buffer; uint64_t fileSize = 0; - std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!vertexFile) { + if (!relationFile) { std::cerr << "Error, opening file. "; exit(EXIT_FAILURE); } // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory buffer = (char*) malloc( fileSize * sizeof( char ) ); - vertexFile.read(buffer, fileSize); // read data as one big block + relationFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; @@ -277,7 +280,7 @@ namespace morphstore{ } delete[] buffer; // free memory - vertexFile.close(); + relationFile.close(); } globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory std::cout << " --> done" << std::endl; From abc475019d8883cf9ad9a237dd4b4fe38d04f248 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 20 May 2019 17:34:20 +0200 Subject: [PATCH 024/216] working edge-generating from ldbc files --- include/core/storage/graph/graph.h | 13 ++ include/core/storage/graph/ldbc_import.h | 148 +++++++++++------- include/core/storage/graph/vertex.h | 18 ++- .../storage/graph/generate_ldbc_graph.cpp | 19 +-- 4 files changed, 120 insertions(+), 78 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 9bde6396..fb8c19df 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -72,6 +72,17 @@ namespace morphstore{ } } + // function that creates a new relation/edge between two (existing) vertices WITH property + void add_edge_with_property(uint64_t sourceID, uint64_t targetID, std::string rel, std::pair property){ + if(exist_id(sourceID) && exist_id(targetID)){ + Vertex* sourceV = &vertices.at(sourceID); + Vertex* targetV = &vertices.at(targetID); + sourceV->add_edge_with_property(targetV, rel, property); + }else{ + std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; + } + } + // function to check if the ID is present or not bool exist_id(const uint64_t id){ if(vertices.find(id) == vertices.end()){ @@ -110,6 +121,8 @@ namespace morphstore{ std::cout << "(" << e.target->getId() << "," << e.relation << ") "; } std::cout << "\n"; + std::cout << "Properties: "; v->print_properties(); + std::cout << "\n"; } }; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 0f151afa..663c8078 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -95,7 +95,7 @@ namespace morphstore{ void generate_vertices(morphstore::Graph &graph){ if(!verticesPaths.empty()) { - std::cout << "Generating LDBC-Vertices ..."; + std::cout << "(1/2) Generating LDBC-Vertices ..."; std::cout.flush(); // iterate through vector of vertex-addresses @@ -155,7 +155,7 @@ namespace morphstore{ // last attribute attributes.push_back(row.substr(last)); }else{ - // actual data: write to intermediate properties map + // actual data: std::unordered_map properties; size_t attrIndex = 0; std::string ldbcID = row.substr(0, row.find(delimiter)); @@ -202,85 +202,119 @@ namespace morphstore{ // this function reads the relation-files and generates edges in graph - void generate_edges(/*morphstore::Graph& graph*/){ + void generate_edges(morphstore::Graph& graph){ if(!relationsPaths.empty()) { - std::cout << "Generating LDBC-Edges ..."; + std::cout << "(2/2) Generating LDBC-Edges ..."; std::cout.flush(); // iterate through vector of vertex-addresses for (const auto &address : relationsPaths) { - // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; - // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); std::string fromEntity = relation.substr(0, relation.find('_')); - std::string relationName = relation.substr(fromEntity.size() + 1, relation.find('_') - 1); - std::string toEntity = relation.substr(fromEntity.size() + relationName.size() + 2, relation.find('_')); + relation.erase(0, relation.find('_') + 1); - char* buffer; - - uint64_t fileSize = 0; + std::string relationName = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::string toEntity = relation; - if (!relationFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); + // check from file name whether it's a relation file or multi value attribute file + // TODO: change handling of multi-value attributes (now just skipping...) + if(!isEntity(toEntity)){ + // multiple attribute; toEntity in file-name is no entity -> e.g. isEntity("email") == false + std::cout << "\tFile is a multi-value attribute file. Skipping!" << std::endl; } + // handling of relation-files ... + else{ - // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } + char* buffer; - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - relationFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; + uint64_t fileSize = 0; - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } + if (!relationFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } - size_t last = 0; - size_t next = 0; + // calculate file size + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } - // TODO: continue here {read first line and check if its relation or multi attribute file + generate edges in graph} - // first line of *.csv: Differentiate whether it's - // (1) relation without properties: e.g. Person.id|Person.id -> number = 2 - // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> number = 3 - // (3) multiple attribute: e.g. Person.id|email -> number = 2 + isEntity("email") == false - if(start == 0){ - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes.push_back(row.substr(last, next-last)); - last = next + 1; + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + relationFile.read(buffer, fileSize); // read data as one big block + + size_t start = 0; + std::string delimiter = "|"; + bool hasProperties = false; + std::string propertyKey; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); } - // last attribute - attributes.push_back(row.substr(last)); - }else{ - // actual data ... do stuff here - } - start = i; // set new starting point for buffer (otherwise it's concatenated) + size_t last = 0; + size_t next = 0; + size_t count = 0; + + // first line of *.csv: Differentiate whether it's + // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 + // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 + if(start == 0){ + // if there are 2 delimiter ('|') -> relation file with properties + while ((next = row.find(delimiter, last)) != std::string::npos){ + last = next + 1; + ++count; + } + if(count == 2){ + hasProperties = true; + propertyKey = row.substr(last); + } + }else{ + // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property + // get the system-(global) id's from local ids + uint64_t fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + // remove from id from string + row.erase(0, row.find(delimiter) + delimiter.length()); + std::string value; + uint64_t toID; + if(!hasProperties){ + // WITHOUT properties: just from the first delimiter on + toID = globalIdLookupMap.at({toEntity, row}); + + // Generate edge in graph + graph.add_edge(fromID, toID, relationName); + }else{ + // with properties means: toID is until the next delimiter, and then the value for the property + toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); + row.erase(0, row.find(delimiter) + delimiter.length()); + value = row; + graph.add_edge_with_property(fromID, toID, relationName, {propertyKey, value}); + } + } + start = i; // set new starting point for buffer (otherwise it's concatenated) + } } - } - delete[] buffer; // free memory - relationFile.close(); + delete[] buffer; // free memory + relationFile.close(); + + } } globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory std::cout << " --> done" << std::endl; diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index ab35542f..704ce389 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -37,6 +37,7 @@ namespace morphstore{ struct Edge{ Vertex* target; std::string relation; + std::pair property; }; class Vertex{ @@ -74,7 +75,7 @@ namespace morphstore{ } } - // function to add new neighbor vertex + // function that creates a new relation/edge between two (existing) vertices withouht properties void add_edge(Vertex *target, std::string relation){ Edge e; e.target = target; @@ -82,13 +83,24 @@ namespace morphstore{ this->adjList.push_back(e); } - void add_edge_with_property(){ - // TODO + // add edge with properties to vertex + void add_edge_with_property(Vertex *target, std::string relation, std::pair property){ + Edge e; + e.target = target; + e.relation = relation; + e.property = property; + this->adjList.push_back(e); } int get_number_of_edges(){ return static_cast(adjList.size()); } + + void print_properties(){ + for(const auto& entry : properties){ + std::cout << "{" << entry.first << ": " << entry.second << "}"; + } + } }; } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 179fe85b..a94a579b 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -30,24 +30,6 @@ #include #include -using namespace std; - -/* -void generateEdges(vector& rDict, morphstore::Graph& g){ - - cout << "Generating Relations ..."; - std::cout.flush(); - - // iterate through relationDict and add (target.id, rel.id) to the vertex adj.-list - for(std::vector::iterator it = rDict.begin(); it != rDict.end(); ++it){ - g.add_edge(it->fromID, it->toID, it->relID); - } - - cout << " --> done" << endl; -} - - */ - int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ @@ -58,6 +40,7 @@ int main( void ){ //ldbcImport.print_file_names(); ldbcImport.generate_vertices(socialGraph); + ldbcImport.generate_edges(socialGraph); socialGraph.statistics(); return 0; From 6d9924ec6b9ea362464e2e07c8e367568cc93753 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 20 May 2019 17:49:41 +0200 Subject: [PATCH 025/216] little changes in ldbc importer --- include/core/storage/graph/ldbc_import.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 663c8078..d4cc8535 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -50,14 +50,13 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - std::vector entities; // for the multi-value attributes (lookup) + std::vector entities; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; public: - // constructor LDBC_Import(const std::string& dir){ directory = dir; insert_file_names(directory); @@ -310,10 +309,8 @@ namespace morphstore{ start = i; // set new starting point for buffer (otherwise it's concatenated) } } - delete[] buffer; // free memory relationFile.close(); - } } globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory @@ -327,7 +324,6 @@ namespace morphstore{ for(const auto& v : verticesPaths){ std::cout << "\t" << v << std::endl; } - std::cout << "Relations-Files: " << std::endl; for(const auto& rel : relationsPaths){ std::cout << "\t" << rel << std::endl; @@ -336,9 +332,6 @@ namespace morphstore{ } }; - - - } #endif //MORPHSTORE_LDBC_IMPORT_H From 1f3c739f9402ac76f68c8534397aadc6c9829f1a Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 20 May 2019 19:08:41 +0200 Subject: [PATCH 026/216] remove unused include's --- include/core/storage/graph/graph.h | 2 +- include/core/storage/graph/ldbc_import.h | 6 ++---- include/core/storage/graph/vertex.h | 2 +- test/core/storage/graph/generate_ldbc_graph.cpp | 8 +------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index fb8c19df..31ab69d2 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -18,7 +18,7 @@ /** * @file graph.h * @brief Graph storage format -> adjacency Lists - * @todo Add property structure to Vertex and Edge + * @todo */ #ifndef MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index d4cc8535..e46dbd58 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,7 +18,7 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph - * @todo Any TODOS? + * @todo process Multi-value attributes */ #ifndef MORPHSTORE_LDBC_IMPORT_H @@ -221,10 +221,8 @@ namespace morphstore{ std::string toEntity = relation; // check from file name whether it's a relation file or multi value attribute file - // TODO: change handling of multi-value attributes (now just skipping...) if(!isEntity(toEntity)){ - // multiple attribute; toEntity in file-name is no entity -> e.g. isEntity("email") == false - std::cout << "\tFile is a multi-value attribute file. Skipping!" << std::endl; + // TODO: add code here for multi value attr... } // handling of relation-files ... else{ diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 704ce389..4ef41a72 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -18,7 +18,7 @@ /** * @file vertex.h * @brief vertex class and its functions + Edge struct - * @todo Add data structure for properties + * @todo */ #ifndef MORPHSTORE_VERTEX_H diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index a94a579b..cc95d1a4 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -18,18 +18,12 @@ /** * @file generate_ldbc_graph.cpp * @brief Test for generating social network graph from LDBC files - * @todo TODOS? + * @todo */ #include #include -#include -#include -#include -#include -#include - int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ From 61d2ff90afb67214d8fd92854dd318712229db90 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 21 May 2019 13:53:59 +0200 Subject: [PATCH 027/216] added simply functionality for multi-value attributes (just write the last attribute) --- include/core/storage/graph/graph.h | 11 ++- include/core/storage/graph/ldbc_import.h | 85 +++++++++++++------ include/core/storage/graph/vertex.h | 8 +- .../storage/graph/generate_ldbc_graph.cpp | 8 +- 4 files changed, 84 insertions(+), 28 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 31ab69d2..8de57e3b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -54,7 +54,7 @@ namespace morphstore{ // function to add a new (ldbc) vertex to the graph void add_vertex_with_properties(Vertex& v, std::unordered_map& props ){ if(!exist_id(v.getId())){ - v.set_properties(props); + v.add_properties(props); vertices.insert(std::make_pair(v.getId(), v)); } else{ std::cout << "Vertex with ID " << v.getId() << " already exists in the database!"; @@ -83,6 +83,15 @@ namespace morphstore{ } } + // this adds a specific key-value pair (property) to a vertex given by its id + void add_property_to_vertex(uint64_t id, const std::pair& property){ + if(exist_id(id)){ + vertices.at(id).add_property(property); + }else{ + std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; + } + } + // function to check if the ID is present or not bool exist_id(const uint64_t id){ if(vertices.find(id) == vertices.end()){ diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e46dbd58..1fe8f7d8 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -220,39 +220,75 @@ namespace morphstore{ std::string toEntity = relation; + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!relationFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + relationFile.read(buffer, fileSize); // read data as one big block + + size_t start = 0; + std::string delimiter = "|"; + // check from file name whether it's a relation file or multi value attribute file if(!isEntity(toEntity)){ - // TODO: add code here for multi value attr... - } - // handling of relation-files ... - else{ + // Multi-value-attributes: just take the last recently one + std::string propertyKey; + std::unordered_map multiValueAttr; + uint64_t systemID; + std::string value; - char* buffer; + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); - uint64_t fileSize = 0; + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' + if(start == 0){ + propertyKey = row.substr(row.find(delimiter) + 1); + }else{ + // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) + systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + value = row.substr(row.find(delimiter) + 1); + multiValueAttr[systemID] = value; + } - if (!relationFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); + start = i; // set new starting point for buffer (otherwise it's concatenated) + } } - - // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + // iterate through multiValue map and assign property to vertex + for(const auto &pair : multiValueAttr){ + const std::pair& keyValuePair = {propertyKey, pair.second}; + graph.add_property_to_vertex(pair.first, keyValuePair); } - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - relationFile.read(buffer, fileSize); // read data as one big block + } + // handling of relation-files ... + else{ - size_t start = 0; - std::string delimiter = "|"; bool hasProperties = false; std::string propertyKey; + uint64_t fromID, toID; // read buffer and do the magic ... for(size_t i = 0; i < fileSize; ++i){ @@ -285,11 +321,10 @@ namespace morphstore{ }else{ // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property // get the system-(global) id's from local ids - uint64_t fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); // remove from id from string row.erase(0, row.find(delimiter) + delimiter.length()); std::string value; - uint64_t toID; if(!hasProperties){ // WITHOUT properties: just from the first delimiter on toID = globalIdLookupMap.at({toEntity, row}); @@ -307,9 +342,9 @@ namespace morphstore{ start = i; // set new starting point for buffer (otherwise it's concatenated) } } - delete[] buffer; // free memory - relationFile.close(); } + delete[] buffer; // free memory + relationFile.close(); } globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory std::cout << " --> done" << std::endl; diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 4ef41a72..17a6eb3c 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -67,7 +67,8 @@ namespace morphstore{ return adjList; } - void set_properties(std::unordered_map &properties){ + // this function adds a whole property map to a vertex + void add_properties(std::unordered_map &properties){ if(!properties.empty()){ this->properties = properties; }else{ @@ -75,6 +76,11 @@ namespace morphstore{ } } + // this adds one key-value pair to the vertex's property map + void add_property(const std::pair& property){ + this->properties[property.first] = property.second; + } + // function that creates a new relation/edge between two (existing) vertices withouht properties void add_edge(Vertex *target, std::string relation){ Edge e; diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index cc95d1a4..5f5124b6 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -32,10 +32,16 @@ int main( void ){ morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::Graph socialGraph; - //ldbcImport.print_file_names(); + // generate vertices & edges from LDBC files and insert into socialGraph ldbcImport.generate_vertices(socialGraph); ldbcImport.generate_edges(socialGraph); + socialGraph.statistics(); + // test vertices: + socialGraph.print_vertex_by_id(100454); + socialGraph.print_vertex_by_id(100450); + socialGraph.print_vertex_by_id(100168); + return 0; } \ No newline at end of file From 80f1c2cf0abd36847514f585ce46ac4b19f95d80 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 21 May 2019 13:59:01 +0200 Subject: [PATCH 028/216] comment-out stuff --- include/core/storage/graph/graph.h | 1 + test/core/storage/graph/generate_ldbc_graph.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 8de57e3b..afab91c2 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -132,6 +132,7 @@ namespace morphstore{ std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); std::cout << "\n"; + std::cout << "-----------------------------------------------" << std::endl; } }; diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 5f5124b6..9ae5179b 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -37,11 +37,11 @@ int main( void ){ ldbcImport.generate_edges(socialGraph); socialGraph.statistics(); - + /* // test vertices: socialGraph.print_vertex_by_id(100454); socialGraph.print_vertex_by_id(100450); socialGraph.print_vertex_by_id(100168); - + */ return 0; } \ No newline at end of file From 4da862fa86c5f0e0d639698f432233134cb1d724 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 23 May 2019 13:36:10 +0200 Subject: [PATCH 029/216] little changes; added time measuring --- include/core/storage/graph/graph.h | 37 ++++++++++--------- include/core/storage/graph/ldbc_import.h | 9 ++--- include/core/storage/graph/vertex.h | 8 ++++ .../storage/graph/generate_ldbc_graph.cpp | 8 ++++ 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index afab91c2..d9c70c4b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -41,24 +41,9 @@ namespace morphstore{ public: - // function to add a new (ldbc) vertex to the graph - void add_vertex(const Vertex& v){ - if(!exist_id(v.getId())){ - Vertex v; - vertices.insert(std::make_pair(v.getId(), v)); - }else{ - std::cout << "Vertex with ID " << v.getId() << " already exists in the database!"; - } - } - - // function to add a new (ldbc) vertex to the graph - void add_vertex_with_properties(Vertex& v, std::unordered_map& props ){ - if(!exist_id(v.getId())){ - v.add_properties(props); - vertices.insert(std::make_pair(v.getId(), v)); - } else{ - std::cout << "Vertex with ID " << v.getId() << " already exists in the database!"; - } + void add_vertex(){ + Vertex v; + vertices.insert(std::make_pair(v.getId(), v)); } // function that creates a new relation/edge between two (existing) vertices @@ -83,6 +68,14 @@ namespace morphstore{ } } + // function to add a new (ldbc) vertex to the graph and returns system-ID + uint64_t add_vertex_with_properties(std::unordered_map& props ){ + Vertex v; + v.add_properties(props); + vertices.insert(std::make_pair(v.getId(), v)); + return v.getId(); + } + // this adds a specific key-value pair (property) to a vertex given by its id void add_property_to_vertex(uint64_t id, const std::pair& property){ if(exist_id(id)){ @@ -92,6 +85,14 @@ namespace morphstore{ } } + void add_entity_to_vertex(uint64_t id, std::string entity){ + if(exist_id(id)){ + vertices.at(id).add_entity(entity); + }else{ + std::cout << "Vertex with ID " << id << " does not exist in the database!"; + } + } + // function to check if the ID is present or not bool exist_id(const uint64_t id){ if(vertices.find(id) == vertices.end()){ diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 1fe8f7d8..1aec2a57 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -165,14 +165,13 @@ namespace morphstore{ } // last attribute properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); - // add entity - properties.insert(std::make_pair("entity", entity)); //----------------------------------------------------- // create vertex and insert into graph with properties - Vertex v; - graph.add_vertex_with_properties(v, properties); + uint64_t systemID = graph.add_vertex_with_properties(properties); + // add entity to vertex + graph.add_entity_to_vertex(systemID, entity); // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, v.getId()}); + globalIdLookupMap.insert({{entity, ldbcID}, systemID}); //----------------------------------------------------- properties.clear(); // free memory } diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 17a6eb3c..b4b2c4cb 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -27,6 +27,7 @@ #include #include #include +#include namespace morphstore{ @@ -48,6 +49,8 @@ namespace morphstore{ std::vector adjList; // properties std::unordered_map properties; + // a vertex can have multiple entites + std::unordered_set entities; public: @@ -98,6 +101,11 @@ namespace morphstore{ this->adjList.push_back(e); } + // add entity to vertex + void add_entity(std::string e){ + this->entities.insert(e); + } + int get_number_of_edges(){ return static_cast(adjList.size()); } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 9ae5179b..7d7f24b3 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -23,10 +23,12 @@ #include #include +#include // for high_resolution_clock int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time // TODO: get base directory with cin -> user input morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); @@ -36,7 +38,13 @@ int main( void ){ ldbcImport.generate_vertices(socialGraph); ldbcImport.generate_edges(socialGraph); + // measuring time... + auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time + std::chrono::duration elapsed = finish - start; + socialGraph.statistics(); + std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; + /* // test vertices: socialGraph.print_vertex_by_id(100454); From 8e6a8816a439144f2ae007050f7e688f66347e6a Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 23 May 2019 14:03:38 +0200 Subject: [PATCH 030/216] declaring some function parameters as const --- include/core/storage/graph/graph.h | 14 +++++++------- include/core/storage/graph/ldbc_import.h | 2 +- include/core/storage/graph/vertex.h | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index d9c70c4b..a12655dd 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -47,7 +47,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices - void add_edge(uint64_t sourceID, uint64_t targetID, std::string rel){ + void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel){ if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -58,7 +58,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices WITH property - void add_edge_with_property(uint64_t sourceID, uint64_t targetID, std::string rel, std::pair property){ + void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property){ if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -69,7 +69,7 @@ namespace morphstore{ } // function to add a new (ldbc) vertex to the graph and returns system-ID - uint64_t add_vertex_with_properties(std::unordered_map& props ){ + uint64_t add_vertex_with_properties(const std::unordered_map& props ){ Vertex v; v.add_properties(props); vertices.insert(std::make_pair(v.getId(), v)); @@ -77,7 +77,7 @@ namespace morphstore{ } // this adds a specific key-value pair (property) to a vertex given by its id - void add_property_to_vertex(uint64_t id, const std::pair& property){ + void add_property_to_vertex(uint64_t id, const std::pair& property){ if(exist_id(id)){ vertices.at(id).add_property(property); }else{ @@ -85,7 +85,7 @@ namespace morphstore{ } } - void add_entity_to_vertex(uint64_t id, std::string entity){ + void add_entity_to_vertex(const uint64_t id, const std::string& entity){ if(exist_id(id)){ vertices.at(id).add_entity(entity); }else{ @@ -102,12 +102,12 @@ namespace morphstore{ } // this function returns the total number of edges in the graph - int get_total_number_of_edges(){ + uint64_t get_total_number_of_edges(){ uint64_t totalNumberEdges = 0; for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ totalNumberEdges += it->second.get_number_of_edges(); } - return static_cast(totalNumberEdges); + return totalNumberEdges; } // for debbuging diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 1aec2a57..8f8af379 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -269,7 +269,7 @@ namespace morphstore{ // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); value = row.substr(row.find(delimiter) + 1); - multiValueAttr[systemID] = value; + multiValueAttr[systemID] = std::move(value); } start = i; // set new starting point for buffer (otherwise it's concatenated) diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index b4b2c4cb..3758c132 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -71,7 +71,7 @@ namespace morphstore{ } // this function adds a whole property map to a vertex - void add_properties(std::unordered_map &properties){ + void add_properties(const std::unordered_map &properties){ if(!properties.empty()){ this->properties = properties; }else{ @@ -81,11 +81,11 @@ namespace morphstore{ // this adds one key-value pair to the vertex's property map void add_property(const std::pair& property){ - this->properties[property.first] = property.second; + this->properties[property.first] = std::move(property.second); } // function that creates a new relation/edge between two (existing) vertices withouht properties - void add_edge(Vertex *target, std::string relation){ + void add_edge(Vertex *target, const std::string& relation){ Edge e; e.target = target; e.relation = relation; @@ -93,7 +93,7 @@ namespace morphstore{ } // add edge with properties to vertex - void add_edge_with_property(Vertex *target, std::string relation, std::pair property){ + void add_edge_with_property(Vertex *target, const std::string& relation, const std::pair& property){ Edge e; e.target = target; e.relation = relation; @@ -102,12 +102,12 @@ namespace morphstore{ } // add entity to vertex - void add_entity(std::string e){ + void add_entity(const std::string& e){ this->entities.insert(e); } - int get_number_of_edges(){ - return static_cast(adjList.size()); + uint64_t get_number_of_edges(){ + return adjList.size(); } void print_properties(){ From af8003512cb371d1f048167f2aa04d6466fbdca2 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 24 May 2019 15:35:05 +0200 Subject: [PATCH 031/216] comment stuff --- include/core/storage/graph/vertex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 3758c132..8feb7afb 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -34,7 +34,7 @@ namespace morphstore{ class Vertex; - // this struct represents a relation to a target vertex; relation is the number in the lookup table + // this struct represents a relation to a target vertex; struct Edge{ Vertex* target; std::string relation; From c582382965822dd5c9725c6e498924a0c44eb884 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 29 May 2019 11:54:25 +0200 Subject: [PATCH 032/216] changed directory of LDBC files in test to /opt/... --- test/core/storage/graph/generate_ldbc_graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 7d7f24b3..5aa64129 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -31,7 +31,7 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time // TODO: get base directory with cin -> user input - morphstore::LDBC_Import ldbcImport("/home/tim/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::Graph socialGraph; // generate vertices & edges from LDBC files and insert into socialGraph From b7573cc7fa394052a9d018c8f4a0f3d73aa1977d Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Thu, 6 Jun 2019 11:10:46 +0200 Subject: [PATCH 033/216] added ldbc.import() -> generate_vertices() + generate_edges() --- include/core/storage/graph/ldbc_import.h | 23 ++++++++++++++----- .../storage/graph/generate_ldbc_graph.cpp | 3 +-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 8f8af379..701d3554 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -66,6 +66,17 @@ namespace morphstore{ return directory; } + // generate_vertices() + generate_edges() + void import(morphstore::Graph &graph){ + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); + // (1) generate vertices + generate_vertices(graph); + // (2) generate edges + generate_edges(graph); + std::cout << "--> done" << std::endl; + } + // function which iterates through directory to receive file names (entire path) void insert_file_names(std::string dir){ for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ @@ -94,8 +105,8 @@ namespace morphstore{ void generate_vertices(morphstore::Graph &graph){ if(!verticesPaths.empty()) { - std::cout << "(1/2) Generating LDBC-Vertices ..."; - std::cout.flush(); + //std::cout << "(1/2) Generating LDBC-Vertices ..."; + //std::cout.flush(); // iterate through vector of vertex-addresses for (const auto &address : verticesPaths) { @@ -185,7 +196,7 @@ namespace morphstore{ // insert entity into vector entities.push_back(entity); } - std::cout << " --> done" << std::endl; + //std::cout << " --> done" << std::endl; } } @@ -203,8 +214,8 @@ namespace morphstore{ void generate_edges(morphstore::Graph& graph){ if(!relationsPaths.empty()) { - std::cout << "(2/2) Generating LDBC-Edges ..."; - std::cout.flush(); + //std::cout << "(2/2) Generating LDBC-Edges ..."; + //std::cout.flush(); // iterate through vector of vertex-addresses for (const auto &address : relationsPaths) { @@ -346,7 +357,7 @@ namespace morphstore{ relationFile.close(); } globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory - std::cout << " --> done" << std::endl; + //std::cout << " --> done" << std::endl; } } diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 5aa64129..26b52c59 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -35,8 +35,7 @@ int main( void ){ morphstore::Graph socialGraph; // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.generate_vertices(socialGraph); - ldbcImport.generate_edges(socialGraph); + ldbcImport.import(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time From 8c6be74f6bc968dd376516536e41355130a19dde Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 11 Jun 2019 10:36:33 +0200 Subject: [PATCH 034/216] calculation of memory usage of graph --- include/core/storage/graph/graph.h | 10 ++++++++ include/core/storage/graph/vertex.h | 23 +++++++++++++++++++ .../storage/graph/generate_ldbc_graph.cpp | 4 ++++ 3 files changed, 37 insertions(+) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index a12655dd..700ab631 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -41,6 +41,16 @@ namespace morphstore{ public: + // calculate the graph size in bytes + size_t get_size_of_graph(){ + size_t size = 0; + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size += it->second.get_size_of_vertex(); + } + return size; + } + + // adds a vertex (without properties) void add_vertex(){ Vertex v; vertices.insert(std::make_pair(v.getId(), v)); diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 8feb7afb..7cbc231e 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -65,6 +65,29 @@ namespace morphstore{ return id; } + // calculate size of a vertex for memory usage in bytes + size_t get_size_of_vertex(){ + size_t size = 0; + size += sizeof(uint64_t); // id + // Adj.List: + for(const auto& e : adjList){ + size += sizeof(morphstore::Vertex*) + sizeof(std::string); + if(!e.property.first.empty()){ + size += (2 * sizeof(std::string)); + } + } + // properties: + for(std::unordered_map::iterator it = properties.begin(); it != properties.end(); ++it){ + size += (2 * sizeof(std::string)); + } + // entities: + for(std::unordered_set::iterator iter = entities.begin(); iter != entities.end(); ++iter){ + size += sizeof(std::string); + } + + return size; + } + // returns a reference (read-only) of the adjacency list const std::vector& get_adjList() const{ return adjList; diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 26b52c59..856539f6 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -50,5 +50,9 @@ int main( void ){ socialGraph.print_vertex_by_id(100450); socialGraph.print_vertex_by_id(100168); */ + + // calculate size of social graph + std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; + return 0; } \ No newline at end of file From 39fb1f5e58ac0d9a885e223bba59388b2a5e0755 Mon Sep 17 00:00:00 2001 From: Alexander Krause Date: Wed, 12 Jun 2019 10:09:26 +0200 Subject: [PATCH 035/216] Reworked size functions. Advise: Write custom allocator to record memory allocation sizes. --- include/core/storage/graph/graph.h | 1 + include/core/storage/graph/vertex.h | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 700ab631..eb91911e 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -44,6 +44,7 @@ namespace morphstore{ // calculate the graph size in bytes size_t get_size_of_graph(){ size_t size = 0; + size += sizeof(std::unordered_map); for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ size += it->second.get_size_of_vertex(); } diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index 7cbc231e..ea4ce3ea 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -39,6 +39,10 @@ namespace morphstore{ Vertex* target; std::string relation; std::pair property; + + size_t size_in_bytes() const { + return sizeof(Vertex*) + sizeof(char)*relation.length() + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); + }; }; class Vertex{ @@ -66,23 +70,22 @@ namespace morphstore{ } // calculate size of a vertex for memory usage in bytes - size_t get_size_of_vertex(){ + size_t get_size_of_vertex() { size_t size = 0; size += sizeof(uint64_t); // id // Adj.List: for(const auto& e : adjList){ - size += sizeof(morphstore::Vertex*) + sizeof(std::string); - if(!e.property.first.empty()){ - size += (2 * sizeof(std::string)); - } + size += e.size_in_bytes(); } - // properties: - for(std::unordered_map::iterator it = properties.begin(); it != properties.end(); ++it){ - size += (2 * sizeof(std::string)); + // properties: + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ + size += sizeof(char)*(property->first.length() + property->second.length()); } - // entities: + // entities: + size += sizeof( std::unordered_set ); for(std::unordered_set::iterator iter = entities.begin(); iter != entities.end(); ++iter){ - size += sizeof(std::string); + size += sizeof(char)*(*iter).length(); } return size; From 6de73b279f6fb9b63365b9e652b0d70c41019054 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 17 Jun 2019 11:23:07 +0200 Subject: [PATCH 036/216] updating comments and cleaning stuff --- include/core/storage/graph/graph.h | 1 + include/core/storage/graph/ldbc_import.h | 3 +++ include/core/storage/graph/vertex.h | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index eb91911e..6ba6c42b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -37,6 +37,7 @@ namespace morphstore{ private: // main data structure: mapping global id -> vertex + // unordered_map hast fast search time -> average = O(1); worst case = O(n): std::unordered_map vertices; public: diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 701d3554..e5763a5e 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -70,10 +70,12 @@ namespace morphstore{ void import(morphstore::Graph &graph){ std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); + // (1) generate vertices generate_vertices(graph); // (2) generate edges generate_edges(graph); + std::cout << "--> done" << std::endl; } @@ -176,6 +178,7 @@ namespace morphstore{ } // last attribute properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + //----------------------------------------------------- // create vertex and insert into graph with properties uint64_t systemID = graph.add_vertex_with_properties(properties); diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/vertex.h index ea4ce3ea..e5cdc3b4 100644 --- a/include/core/storage/graph/vertex.h +++ b/include/core/storage/graph/vertex.h @@ -48,7 +48,7 @@ namespace morphstore{ class Vertex{ private: - // Vertex contains a (global) id; (old) ldbc id; entity number for lookup; vector adjList for the adjacency List + // Vertex contains a (global) id; entity; vector adjList for the adjacency List uint64_t id; std::vector adjList; // properties From 2bb9e3404ef922c825625757022e976b732505b1 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 26 Jun 2019 12:39:14 +0200 Subject: [PATCH 037/216] new file structure; prep. for CSR format --- include/core/storage/graph/{ => adj_list}/graph.h | 2 +- include/core/storage/graph/{ => adj_list}/vertex.h | 0 test/core/storage/graph/generate_ldbc_graph.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename include/core/storage/graph/{ => adj_list}/graph.h (99%) rename include/core/storage/graph/{ => adj_list}/vertex.h (100%) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/adj_list/graph.h similarity index 99% rename from include/core/storage/graph/graph.h rename to include/core/storage/graph/adj_list/graph.h index 6ba6c42b..eeac2a22 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_GRAPH_H #define MORPHSTORE_GRAPH_H -#include +#include #include #include diff --git a/include/core/storage/graph/vertex.h b/include/core/storage/graph/adj_list/vertex.h similarity index 100% rename from include/core/storage/graph/vertex.h rename to include/core/storage/graph/adj_list/vertex.h diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph.cpp index 856539f6..0ab0afca 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph.cpp @@ -21,7 +21,7 @@ * @todo */ -#include +#include #include #include // for high_resolution_clock From f3ee80d1ff64c955c310070238f0b512ff1ea4aa Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 16 Jul 2019 16:05:30 +0200 Subject: [PATCH 038/216] added abstract graph class; cmakelist-update; preps for new CSR format --- include/core/storage/graph/adj_list/graph.h | 14 ++++++++++---- include/core/storage/graph/adj_list/vertex.h | 6 +++--- include/core/storage/graph/ldbc_import.h | 2 +- test/core/storage/graph/CMakeLists.txt | 10 +++++----- ..._graph.cpp => generate_ldbc_graph_adj_list.cpp} | 9 ++++++--- 5 files changed, 25 insertions(+), 16 deletions(-) rename test/core/storage/graph/{generate_ldbc_graph.cpp => generate_ldbc_graph_adj_list.cpp} (94%) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index eeac2a22..9e2c6455 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -21,19 +21,21 @@ * @todo */ -#ifndef MORPHSTORE_GRAPH_H -#define MORPHSTORE_GRAPH_H +#ifndef MORPHSTORE_GRAPH_ADJACENCY_LIST_H +#define MORPHSTORE_GRAPH_ADJACENCY_LIST_H #include +#include #include #include #include +#include namespace morphstore{ - class Graph{ + class AdjacencyList: public morphstore::Graph{ private: // main data structure: mapping global id -> vertex @@ -42,6 +44,10 @@ namespace morphstore{ public: + std::string getStorageFormat(){ + return "AdjacencyList"; + } + // calculate the graph size in bytes size_t get_size_of_graph(){ size_t size = 0; @@ -151,4 +157,4 @@ namespace morphstore{ } -#endif //MORPHSTORE_GRAPH_H +#endif //MORPHSTORE_GRAPH_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index e5cdc3b4..99b1a75e 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -21,8 +21,8 @@ * @todo */ -#ifndef MORPHSTORE_VERTEX_H -#define MORPHSTORE_VERTEX_H +#ifndef MORPHSTORE_VERTEX_ADJACENCY_LIST_H +#define MORPHSTORE_VERTEX_ADJACENCY_LIST_H #include #include @@ -144,4 +144,4 @@ namespace morphstore{ }; } -#endif //MORPHSTORE_VERTEX_H +#endif //MORPHSTORE_VERTEX_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e5763a5e..e645f093 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -214,7 +214,7 @@ namespace morphstore{ // this function reads the relation-files and generates edges in graph - void generate_edges(morphstore::Graph& graph){ + void generate_edges(morphstore::Graph &graph){ if(!relationsPaths.empty()) { //std::cout << "(2/2) Generating LDBC-Edges ..."; diff --git a/test/core/storage/graph/CMakeLists.txt b/test/core/storage/graph/CMakeLists.txt index ad5c2387..5393e17c 100644 --- a/test/core/storage/graph/CMakeLists.txt +++ b/test/core/storage/graph/CMakeLists.txt @@ -1,15 +1,15 @@ if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc_graph_adj_list_test_app ) - add_executable( ldbc_graph_test_app generate_ldbc_graph.cpp ) - target_compile_options( ldbc_graph_test_app PRIVATE + add_executable( ldbc_graph_adj_list_test_app generate_ldbc_graph_adj_list.cpp) + target_compile_options( ldbc_graph_adj_list_test_app PRIVATE -Werror -Wall -Wextra -pedantic -fstack-protector-all $<$:-DDEBUG> ) - target_link_libraries( ldbc_graph_test_app PRIVATE "-ldl" stdc++fs) + target_link_libraries( ldbc_graph_adj_list_test_app PRIVATE "-ldl" stdc++fs) - add_test( ldbc_graph_test ldbc_graph_test_app ) + add_test( ldbc_graph_adj_list_test ldbc_graph_adj_list_test_app ) endif() \ No newline at end of file diff --git a/test/core/storage/graph/generate_ldbc_graph.cpp b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp similarity index 94% rename from test/core/storage/graph/generate_ldbc_graph.cpp rename to test/core/storage/graph/generate_ldbc_graph_adj_list.cpp index 0ab0afca..43036b8e 100644 --- a/test/core/storage/graph/generate_ldbc_graph.cpp +++ b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp @@ -23,6 +23,7 @@ #include #include +#include #include // for high_resolution_clock int main( void ){ @@ -30,12 +31,14 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - // TODO: get base directory with cin -> user input morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::Graph socialGraph; + morphstore::AdjacencyList socialGraph; + morphstore::Graph *g; + g = &socialGraph; + // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.import(socialGraph); + ldbcImport.import(*g); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time From 163f51cb059fb5112a98bfc17a51678f18e3f911 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 16 Jul 2019 16:06:19 +0200 Subject: [PATCH 039/216] no idea whats that --- include/core/storage/graph/graph_abstract.h | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 include/core/storage/graph/graph_abstract.h diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h new file mode 100644 index 00000000..69cba7ad --- /dev/null +++ b/include/core/storage/graph/graph_abstract.h @@ -0,0 +1,45 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph_abstract.h + * @brief this abstract class is for the ldbc importer for polymorphism of different storage formats (pointer to derived classes) + * @todo add all used functions of graphs in ldbc importer class +*/ + +#ifndef MORPHSTORE_GRAPH_ABSTRACT_H +#define MORPHSTORE_GRAPH_ABSTRACT_H + +#include + +namespace morphstore{ + + class Graph{ + public: + //virtual ~Graph(); + virtual std::string getStorageFormat() = 0; + virtual void add_vertex() = 0; + virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel) = 0; + virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property) = 0; + virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; + virtual void add_entity_to_vertex(const uint64_t id, const std::string& entity) = 0; + virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; + + }; +} + +#endif //MORPHSTORE_GRAPH_ABSTRACT_H From 84acf81dd0bdc723a9fac94ae3a7cae4f0fc39d9 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 16 Jul 2019 16:36:24 +0200 Subject: [PATCH 040/216] cleaning up --- include/core/storage/graph/adj_list/graph.h | 4 +++- include/core/storage/graph/graph_abstract.h | 3 +++ test/core/storage/graph/generate_ldbc_graph_adj_list.cpp | 7 ++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index 9e2c6455..bd16747c 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -35,6 +35,8 @@ namespace morphstore{ + static const std::string storageFormat = "AdjacencyList"; + class AdjacencyList: public morphstore::Graph{ private: @@ -45,7 +47,7 @@ namespace morphstore{ public: std::string getStorageFormat(){ - return "AdjacencyList"; + return storageFormat; } // calculate the graph size in bytes diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h index 69cba7ad..a942733d 100644 --- a/include/core/storage/graph/graph_abstract.h +++ b/include/core/storage/graph/graph_abstract.h @@ -32,6 +32,9 @@ namespace morphstore{ public: //virtual ~Graph(); virtual std::string getStorageFormat() = 0; + virtual size_t get_size_of_graph() = 0; + + // AdjacecenyList functions for ldbc-importer: virtual void add_vertex() = 0; virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel) = 0; virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property) = 0; diff --git a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp index 43036b8e..b4e452f7 100644 --- a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp @@ -33,12 +33,13 @@ int main( void ){ morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::AdjacencyList socialGraph; - morphstore::Graph *g; - g = &socialGraph; + // create abstract pointer to adjc_list (ldbc importer just has to handle with one input class and not adjcancyList, CSR, ....) + morphstore::Graph *graph; + graph = &socialGraph; // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.import(*g); + ldbcImport.import(*graph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time From cf785bbcf87949abd5a3d36f42470b9b9869b9fa Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 17 Jul 2019 11:52:38 +0200 Subject: [PATCH 041/216] fixed vertex-entity-redundancy: now, entity-number to lookup --- include/core/storage/graph/adj_list/graph.h | 30 +++++++++++++++++-- include/core/storage/graph/adj_list/vertex.h | 23 +++++++------- include/core/storage/graph/graph_abstract.h | 6 ++-- include/core/storage/graph/ldbc_import.h | 26 +++++++++++----- .../graph/generate_ldbc_graph_adj_list.cpp | 6 +++- 5 files changed, 66 insertions(+), 25 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index bd16747c..d4267bed 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -43,6 +44,8 @@ namespace morphstore{ // main data structure: mapping global id -> vertex // unordered_map hast fast search time -> average = O(1); worst case = O(n): std::unordered_map vertices; + // lookup dictionary for entities of vertices, e.g. 0 -> Comment, ... + std::map entityDictionary; public: @@ -105,14 +108,28 @@ namespace morphstore{ } } - void add_entity_to_vertex(const uint64_t id, const std::string& entity){ + void add_entity_to_vertex(const uint64_t id, unsigned short int entity){ if(exist_id(id)){ - vertices.at(id).add_entity(entity); + vertices.at(id).setEntity(entity); }else{ std::cout << "Vertex with ID " << id << " does not exist in the database!"; } } + std::string get_entity_of_vertex_by_id(const uint64_t id){ + if(exist_id(id)){ + unsigned short int entity = vertices.at(id).getEntity(); + return entityDictionary.at(entity); + }else{ + return "No Matching of Vertex-ID in the database!"; + } + } + + void set_entity_dictionary(const std::map& entityList){ + this->entityDictionary = entityList; + } + + // function to check if the ID is present or not bool exist_id(const uint64_t id){ if(vertices.find(id) == vertices.end()){ @@ -138,13 +155,20 @@ namespace morphstore{ std::cout << "--------------------------------------------" << std::endl; } + void printEntities(){ + for(auto const& entity : entityDictionary){ + std::cout << entity.first << " -> " << entity.second << "\n"; + } + } + // for debugging void print_vertex_by_id(uint64_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; + std::cout << "Entity: \t"<< get_entity_of_vertex_by_id(v->getId()) << std::endl; std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; - std::cout << "Adj.List: "; + std::cout << "Adj_List: "; const std::vector& adjList = v->get_adjList(); for(const auto& e : adjList){ diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index 99b1a75e..0d7f45a6 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -53,8 +53,8 @@ namespace morphstore{ std::vector adjList; // properties std::unordered_map properties; - // a vertex can have multiple entites - std::unordered_set entities; + // entity-number for look-up + unsigned short int entity; public: @@ -69,6 +69,15 @@ namespace morphstore{ return id; } + // add entity to vertex + void setEntity(unsigned short int e){ + this->entity = e; + } + + unsigned short int getEntity(){ + return this->entity; + } + // calculate size of a vertex for memory usage in bytes size_t get_size_of_vertex() { size_t size = 0; @@ -83,10 +92,7 @@ namespace morphstore{ size += sizeof(char)*(property->first.length() + property->second.length()); } // entities: - size += sizeof( std::unordered_set ); - for(std::unordered_set::iterator iter = entities.begin(); iter != entities.end(); ++iter){ - size += sizeof(char)*(*iter).length(); - } + size += sizeof(unsigned short int); return size; } @@ -127,11 +133,6 @@ namespace morphstore{ this->adjList.push_back(e); } - // add entity to vertex - void add_entity(const std::string& e){ - this->entities.insert(e); - } - uint64_t get_number_of_edges(){ return adjList.size(); } diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h index a942733d..a0d9ad3c 100644 --- a/include/core/storage/graph/graph_abstract.h +++ b/include/core/storage/graph/graph_abstract.h @@ -25,12 +25,13 @@ #define MORPHSTORE_GRAPH_ABSTRACT_H #include +#include namespace morphstore{ class Graph{ public: - //virtual ~Graph(); + virtual ~Graph() { }; virtual std::string getStorageFormat() = 0; virtual size_t get_size_of_graph() = 0; @@ -39,8 +40,9 @@ namespace morphstore{ virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel) = 0; virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property) = 0; virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; - virtual void add_entity_to_vertex(const uint64_t id, const std::string& entity) = 0; + virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; + virtual void set_entity_dictionary(const std::map& entityList) = 0; }; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e645f093..17662424 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -29,6 +29,7 @@ #include #include #include +#include #include // hash function used to hash a pair of any kind using XOR (for verticesMap) @@ -50,7 +51,7 @@ namespace morphstore{ std::string directory; std::vector verticesPaths; std::vector relationsPaths; - std::vector entities; + std::map entitiesLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; @@ -110,6 +111,9 @@ namespace morphstore{ //std::cout << "(1/2) Generating LDBC-Vertices ..."; //std::cout.flush(); + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; + // iterate through vector of vertex-addresses for (const auto &address : verticesPaths) { @@ -182,8 +186,8 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity to vertex - graph.add_entity_to_vertex(systemID, entity); + // add entity number to vertex + graph.add_entity_to_vertex(systemID, entityNumber); // map entity and ldbc id to system generated id globalIdLookupMap.insert({{entity, ldbcID}, systemID}); //----------------------------------------------------- @@ -196,19 +200,25 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); - // insert entity into vector - entities.push_back(entity); + + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair( entityNumber, entity)); + ++entityNumber; } - //std::cout << " --> done" << std::endl; + // graph gets full entity-list here: + graph.set_entity_dictionary(entitiesLookup); } } // function which returns true, if parameter is a entity in ldbc-files bool isEntity(const std::string& entity){ // iterate through entities vector to look up for paramater - if (std::find(entities.begin(), entities.end(), entity) != entities.end()){ - return true; + for(auto const& entry : entitiesLookup){ + if(entry.second == entity){ + return true; + } } + return false; } diff --git a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp index b4e452f7..0ea749f4 100644 --- a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp @@ -25,6 +25,7 @@ #include #include #include // for high_resolution_clock +#include int main( void ){ @@ -37,7 +38,6 @@ int main( void ){ morphstore::Graph *graph; graph = &socialGraph; - // generate vertices & edges from LDBC files and insert into socialGraph ldbcImport.import(*graph); @@ -45,6 +45,10 @@ int main( void ){ auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time std::chrono::duration elapsed = finish - start; + // delete graph-pointer to adj_list socialGraph + graph = NULL; + delete graph; + socialGraph.statistics(); std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; From f0a29e1a8022ea10aea242eed8b83e8f5d724452 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 17 Jul 2019 14:04:33 +0200 Subject: [PATCH 042/216] fixed edge-relation-redundancy: now, relation-number to lookup --- include/core/storage/graph/adj_list/graph.h | 40 +++++++++++---- include/core/storage/graph/adj_list/vertex.h | 9 ++-- include/core/storage/graph/graph_abstract.h | 6 +-- include/core/storage/graph/ldbc_import.h | 50 ++++++++++++++++--- .../graph/generate_ldbc_graph_adj_list.cpp | 2 +- 5 files changed, 81 insertions(+), 26 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index d4267bed..78cfd742 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -44,8 +44,10 @@ namespace morphstore{ // main data structure: mapping global id -> vertex // unordered_map hast fast search time -> average = O(1); worst case = O(n): std::unordered_map vertices; - // lookup dictionary for entities of vertices, e.g. 0 -> Comment, ... + + // lookup dictionaries for entities of vertices / relation names of edges std::map entityDictionary; + std::map relationDictionary; public: @@ -70,7 +72,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices - void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel){ + void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int rel){ if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -81,7 +83,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices WITH property - void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property){ + void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int rel, const std::pair& property){ if(exist_id(sourceID) && exist_id(targetID)){ Vertex* sourceV = &vertices.at(sourceID); Vertex* targetV = &vertices.at(targetID); @@ -116,12 +118,11 @@ namespace morphstore{ } } - std::string get_entity_of_vertex_by_id(const uint64_t id){ - if(exist_id(id)){ - unsigned short int entity = vertices.at(id).getEntity(); - return entityDictionary.at(entity); + std::string get_entity_by_number(unsigned short int e){ + if(entityDictionary.find( e ) != entityDictionary.end()){ + return entityDictionary.at(e); }else{ - return "No Matching of Vertex-ID in the database!"; + return "No Matching of entity-number in the database!"; } } @@ -129,6 +130,17 @@ namespace morphstore{ this->entityDictionary = entityList; } + std::string get_relation_by_number(unsigned short int re){ + if(relationDictionary.find( re ) != relationDictionary.end()){ + return relationDictionary.at(re); + }else{ + return "No Matching of relation-number in the database!"; + } + } + + void set_relation_dictionary(const std::map& relationList){ + this->relationDictionary = relationList; + } // function to check if the ID is present or not bool exist_id(const uint64_t id){ @@ -155,24 +167,32 @@ namespace morphstore{ std::cout << "--------------------------------------------" << std::endl; } + // for debbuging void printEntities(){ for(auto const& entity : entityDictionary){ std::cout << entity.first << " -> " << entity.second << "\n"; } } + // for debbuging + void printRelations(){ + for(auto const& rel : relationDictionary){ + std::cout << rel.first << " -> " << rel.second << "\n"; + } + } + // for debugging void print_vertex_by_id(uint64_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; Vertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; - std::cout << "Entity: \t"<< get_entity_of_vertex_by_id(v->getId()) << std::endl; + std::cout << "Entity: \t"<< get_entity_by_number(v->getEntity()) << std::endl; std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; std::cout << "Adj_List: "; const std::vector& adjList = v->get_adjList(); for(const auto& e : adjList){ - std::cout << "(" << e.target->getId() << "," << e.relation << ") "; + std::cout << "(" << e.target->getId() << "," << get_relation_by_number(e.relation) << ") "; } std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index 0d7f45a6..27def7c5 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -37,11 +37,12 @@ namespace morphstore{ // this struct represents a relation to a target vertex; struct Edge{ Vertex* target; - std::string relation; + unsigned short int relation; + // make this optianl??: std::pair property; size_t size_in_bytes() const { - return sizeof(Vertex*) + sizeof(char)*relation.length() + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); + return sizeof(Vertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); }; }; @@ -117,7 +118,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices withouht properties - void add_edge(Vertex *target, const std::string& relation){ + void add_edge(Vertex *target, unsigned short int relation){ Edge e; e.target = target; e.relation = relation; @@ -125,7 +126,7 @@ namespace morphstore{ } // add edge with properties to vertex - void add_edge_with_property(Vertex *target, const std::string& relation, const std::pair& property){ + void add_edge_with_property(Vertex *target, unsigned short int relation, const std::pair& property){ Edge e; e.target = target; e.relation = relation; diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h index a0d9ad3c..e8003b54 100644 --- a/include/core/storage/graph/graph_abstract.h +++ b/include/core/storage/graph/graph_abstract.h @@ -37,13 +37,13 @@ namespace morphstore{ // AdjacecenyList functions for ldbc-importer: virtual void add_vertex() = 0; - virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, const std::string& rel) = 0; - virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, const std::string& rel, const std::pair& property) = 0; + virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation) = 0; + virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property) = 0; virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; virtual void set_entity_dictionary(const std::map& entityList) = 0; - + virtual void set_relation_dictionary(const std::map& relationList) = 0; }; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 17662424..23e8cbc6 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,7 +18,7 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph - * @todo process Multi-value attributes + * @todo */ #ifndef MORPHSTORE_LDBC_IMPORT_H @@ -52,6 +52,7 @@ namespace morphstore{ std::vector verticesPaths; std::vector relationsPaths; std::map entitiesLookup; + std::map relationsLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; @@ -211,8 +212,8 @@ namespace morphstore{ } // function which returns true, if parameter is a entity in ldbc-files - bool isEntity(const std::string& entity){ - // iterate through entities vector to look up for paramater + bool is_entity(const std::string &entity){ + // iterate through entities-map to look up for paramater for(auto const& entry : entitiesLookup){ if(entry.second == entity){ return true; @@ -222,6 +223,18 @@ namespace morphstore{ return false; } + // function which returns true, if the relation already exist + bool exist_relation_name(const std::string& relation){ + // iterate through relations-map to look up for paramater + for(auto const& entry : relationsLookup){ + if(entry.second == relation){ + return true; + } + } + + return false; + } + // this function reads the relation-files and generates edges in graph void generate_edges(morphstore::Graph &graph){ @@ -230,9 +243,15 @@ namespace morphstore{ //std::cout << "(2/2) Generating LDBC-Edges ..."; //std::cout.flush(); + //this variable is used for the relationLookup-keys, starting by 0 + unsigned short int relationNumber = 0; + bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) + // iterate through vector of vertex-addresses for (const auto &address : relationsPaths) { + isRelation = false; + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); std::string fromEntity = relation.substr(0, relation.find('_')); @@ -269,7 +288,7 @@ namespace morphstore{ std::string delimiter = "|"; // check from file name whether it's a relation file or multi value attribute file - if(!isEntity(toEntity)){ + if(!is_entity(toEntity)){ // Multi-value-attributes: just take the last recently one std::string propertyKey; std::unordered_map multiValueAttr; @@ -309,6 +328,8 @@ namespace morphstore{ // handling of relation-files ... else{ + isRelation = true; + bool hasProperties = false; std::string propertyKey; uint64_t fromID, toID; @@ -353,13 +374,13 @@ namespace morphstore{ toID = globalIdLookupMap.at({toEntity, row}); // Generate edge in graph - graph.add_edge(fromID, toID, relationName); + graph.add_edge(fromID, toID, relationNumber); }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - graph.add_edge_with_property(fromID, toID, relationName, {propertyKey, value}); + graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -368,9 +389,22 @@ namespace morphstore{ } delete[] buffer; // free memory relationFile.close(); + + //check if the relation name is a relation (no multi value file) + if(isRelation){ + // check if the name already exists + if(!exist_relation_name(relationName)){ + // insert relation-number with string into map + relationsLookup.insert(std::make_pair( relationNumber, relationName)); + ++relationNumber; + } + } + } - globalIdLookupMap.clear(); // we dont need the lookup anymore -> delete memory - //std::cout << " --> done" << std::endl; + // graph gets full relation-list here: + graph.set_relation_dictionary(relationsLookup); + + globalIdLookupMap.clear(); // we dont need the lookup anymore -> clear } } diff --git a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp index 0ea749f4..b77563cf 100644 --- a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp @@ -25,7 +25,6 @@ #include #include #include // for high_resolution_clock -#include int main( void ){ @@ -34,6 +33,7 @@ int main( void ){ morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::AdjacencyList socialGraph; + // create abstract pointer to adjc_list (ldbc importer just has to handle with one input class and not adjcancyList, CSR, ....) morphstore::Graph *graph; graph = &socialGraph; From 46adcd8c9a2a82ca135c6380c8e310fcd020dddf Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 17 Jul 2019 14:08:38 +0200 Subject: [PATCH 043/216] removed unused function in abstract graph class --- include/core/storage/graph/graph_abstract.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h index e8003b54..0c48b877 100644 --- a/include/core/storage/graph/graph_abstract.h +++ b/include/core/storage/graph/graph_abstract.h @@ -33,7 +33,6 @@ namespace morphstore{ public: virtual ~Graph() { }; virtual std::string getStorageFormat() = 0; - virtual size_t get_size_of_graph() = 0; // AdjacecenyList functions for ldbc-importer: virtual void add_vertex() = 0; From b5349356f3f579b80473ec442e5ff3541ce1821a Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Thu, 18 Jul 2019 16:59:56 +0200 Subject: [PATCH 044/216] started CSR-format --- include/core/storage/graph/adj_list/graph.h | 32 +++-- include/core/storage/graph/adj_list/vertex.h | 15 +- include/core/storage/graph/graph_abstract.h | 24 ++-- include/core/storage/graph/ldbc_import.h | 132 +++++++++++++++++- test/CMakeLists.txt | 3 +- .../graph/{ => adj_list}/CMakeLists.txt | 2 +- .../generate_ldbc_graph_adj_list.cpp | 0 7 files changed, 174 insertions(+), 34 deletions(-) rename test/core/storage/graph/{ => adj_list}/CMakeLists.txt (83%) rename test/core/storage/graph/{ => adj_list}/generate_ldbc_graph_adj_list.cpp (100%) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index 78cfd742..a19356bc 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -36,21 +36,25 @@ namespace morphstore{ - static const std::string storageFormat = "AdjacencyList"; - class AdjacencyList: public morphstore::Graph{ private: // main data structure: mapping global id -> vertex // unordered_map hast fast search time -> average = O(1); worst case = O(n): - std::unordered_map vertices; + std::unordered_map vertices; // lookup dictionaries for entities of vertices / relation names of edges std::map entityDictionary; std::map relationDictionary; + const std::string storageFormat = "AdjacencyList"; + public: + void init(){ + std::cout << "Nothing to do!!" << std::endl; + } + std::string getStorageFormat(){ return storageFormat; } @@ -58,8 +62,8 @@ namespace morphstore{ // calculate the graph size in bytes size_t get_size_of_graph(){ size_t size = 0; - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ size += it->second.get_size_of_vertex(); } return size; @@ -67,15 +71,15 @@ namespace morphstore{ // adds a vertex (without properties) void add_vertex(){ - Vertex v; + ADJLISTVertex v; vertices.insert(std::make_pair(v.getId(), v)); } // function that creates a new relation/edge between two (existing) vertices void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int rel){ if(exist_id(sourceID) && exist_id(targetID)){ - Vertex* sourceV = &vertices.at(sourceID); - Vertex* targetV = &vertices.at(targetID); + ADJLISTVertex* sourceV = &vertices.at(sourceID); + ADJLISTVertex* targetV = &vertices.at(targetID); sourceV->add_edge(targetV, rel); }else{ std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; @@ -85,8 +89,8 @@ namespace morphstore{ // function that creates a new relation/edge between two (existing) vertices WITH property void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int rel, const std::pair& property){ if(exist_id(sourceID) && exist_id(targetID)){ - Vertex* sourceV = &vertices.at(sourceID); - Vertex* targetV = &vertices.at(targetID); + ADJLISTVertex* sourceV = &vertices.at(sourceID); + ADJLISTVertex* targetV = &vertices.at(targetID); sourceV->add_edge_with_property(targetV, rel, property); }else{ std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; @@ -95,7 +99,7 @@ namespace morphstore{ // function to add a new (ldbc) vertex to the graph and returns system-ID uint64_t add_vertex_with_properties(const std::unordered_map& props ){ - Vertex v; + ADJLISTVertex v; v.add_properties(props); vertices.insert(std::make_pair(v.getId(), v)); return v.getId(); @@ -106,7 +110,7 @@ namespace morphstore{ if(exist_id(id)){ vertices.at(id).add_property(property); }else{ - std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; + std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; } } @@ -153,7 +157,7 @@ namespace morphstore{ // this function returns the total number of edges in the graph uint64_t get_total_number_of_edges(){ uint64_t totalNumberEdges = 0; - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ totalNumberEdges += it->second.get_number_of_edges(); } return totalNumberEdges; @@ -184,7 +188,7 @@ namespace morphstore{ // for debugging void print_vertex_by_id(uint64_t id){ std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; - Vertex* v = &vertices.at(id); + ADJLISTVertex* v = &vertices.at(id); std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; std::cout << "Entity: \t"<< get_entity_by_number(v->getEntity()) << std::endl; std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index 27def7c5..c43fd727 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -27,26 +27,25 @@ #include #include #include -#include namespace morphstore{ - class Vertex; + class ADJLISTVertex; // this struct represents a relation to a target vertex; struct Edge{ - Vertex* target; + ADJLISTVertex* target; unsigned short int relation; // make this optianl??: std::pair property; size_t size_in_bytes() const { - return sizeof(Vertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); + return sizeof(ADJLISTVertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); }; }; - class Vertex{ + class ADJLISTVertex{ private: // Vertex contains a (global) id; entity; vector adjList for the adjacency List @@ -60,7 +59,7 @@ namespace morphstore{ public: // constrcutor without the adjList (Vertex can contain no edges int the graph) - Vertex(){ + ADJLISTVertex(){ // unique ID generation static uint64_t startID = 0; id = startID++; @@ -118,7 +117,7 @@ namespace morphstore{ } // function that creates a new relation/edge between two (existing) vertices withouht properties - void add_edge(Vertex *target, unsigned short int relation){ + void add_edge(ADJLISTVertex *target, unsigned short int relation){ Edge e; e.target = target; e.relation = relation; @@ -126,7 +125,7 @@ namespace morphstore{ } // add edge with properties to vertex - void add_edge_with_property(Vertex *target, unsigned short int relation, const std::pair& property){ + void add_edge_with_property(ADJLISTVertex *target, unsigned short int relation, const std::pair& property){ Edge e; e.target = target; e.relation = relation; diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h index 0c48b877..1afd254b 100644 --- a/include/core/storage/graph/graph_abstract.h +++ b/include/core/storage/graph/graph_abstract.h @@ -33,16 +33,24 @@ namespace morphstore{ public: virtual ~Graph() { }; virtual std::string getStorageFormat() = 0; - - // AdjacecenyList functions for ldbc-importer: + + // init: + virtual void init() = 0; + + // Vertex-functions for ldbc-importer: virtual void add_vertex() = 0; - virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation) = 0; - virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property) = 0; - virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; - virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; + virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; // virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; - virtual void set_entity_dictionary(const std::map& entityList) = 0; - virtual void set_relation_dictionary(const std::map& relationList) = 0; + virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; // + + // edge functions: + virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation) = 0 ; + virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property) = 0; + + // dictionary functions: + virtual void set_entity_dictionary(const std::map& entityList) = 0; // + virtual void set_relation_dictionary(const std::map& relationList) = 0; // + }; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 23e8cbc6..c0086dc6 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -24,6 +24,9 @@ #ifndef MORPHSTORE_LDBC_IMPORT_H #define MORPHSTORE_LDBC_IMPORT_H +#include +#include + #include #include #include @@ -56,7 +59,6 @@ namespace morphstore{ // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; - public: LDBC_Import(const std::string& dir){ @@ -209,6 +211,12 @@ namespace morphstore{ // graph gets full entity-list here: graph.set_entity_dictionary(entitiesLookup); } + + // BE CAREFUL WITH THIS: if the graph structure is CSR, we do futher stuff (malloc node array,....) + if(graph.getStorageFormat() == "CSR"){ + graph.init(); + } + } // function which returns true, if parameter is a entity in ldbc-files @@ -235,10 +243,11 @@ namespace morphstore{ return false; } - // this function reads the relation-files and generates edges in graph void generate_edges(morphstore::Graph &graph){ + + if(!relationsPaths.empty()) { //std::cout << "(2/2) Generating LDBC-Edges ..."; //std::cout.flush(); @@ -421,6 +430,125 @@ namespace morphstore{ } + // function that returns number of vertices + uint64_t get_total_number_vertices(){ + + uint64_t result = 0; + + if(!verticesPaths.empty()) { + + // iterate through vector of vertex-addresses + for (const auto &address : verticesPaths) { + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + vertexFile.read(buffer, fileSize); // read data as one big block + bool firstLine = true; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // skip first line... + if(firstLine){ + firstLine = false; + }else{ + ++result; + } + } + } + + delete[] buffer; // free memory + vertexFile.close(); + } + } + return result; + } + + // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because entity lookup creation) + uint64_t get_total_number_edges(){ + + uint64_t result = 0 ; + + if(!relationsPaths.empty()) { + + // iterate through vector of relation-addresses + for (const auto &address : relationsPaths) { + + // TODO OPTIMIZE HERE: remove string operations + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string fromEntity = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string relationName = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string toEntity = relation; + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!relationFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + relationFile.read(buffer, fileSize); // read data as one big block + bool firstLine = true; + + // check from file name whether it's a relation file or multi value attribute file + if(is_entity(toEntity)){ + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // skip first line (attributes infos....) + if(firstLine){ + firstLine = false; + }else{ + ++result; + } + } + } + + } + + delete[] buffer; // free memory + relationFile.close(); + + } + } + return result; + } }; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 29695891..fc879669 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) -add_subdirectory( core/storage/graph ) +add_subdirectory(core/storage/graph/adj_list) +add_subdirectory( core/storage/graph/csr ) add_subdirectory(vector) \ No newline at end of file diff --git a/test/core/storage/graph/CMakeLists.txt b/test/core/storage/graph/adj_list/CMakeLists.txt similarity index 83% rename from test/core/storage/graph/CMakeLists.txt rename to test/core/storage/graph/adj_list/CMakeLists.txt index 5393e17c..93f16270 100644 --- a/test/core/storage/graph/CMakeLists.txt +++ b/test/core/storage/graph/adj_list/CMakeLists.txt @@ -1,5 +1,5 @@ if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc_graph_adj_list_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adj_list/ldbc_graph_adj_list_test_app ) add_executable( ldbc_graph_adj_list_test_app generate_ldbc_graph_adj_list.cpp) target_compile_options( ldbc_graph_adj_list_test_app PRIVATE diff --git a/test/core/storage/graph/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp similarity index 100% rename from test/core/storage/graph/generate_ldbc_graph_adj_list.cpp rename to test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp From 67b84d46e8da772920c4f89d9146f076a7e9b283 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Thu, 18 Jul 2019 17:00:45 +0200 Subject: [PATCH 045/216] directory csr --- include/core/storage/graph/csr/graph.h | 157 ++++++++++++++++++ include/core/storage/graph/csr/vertex.h | 100 +++++++++++ test/core/storage/graph/csr/CMakeLists.txt | 15 ++ .../graph/csr/generate_ldbc_graph_csr.cpp | 48 ++++++ 4 files changed, 320 insertions(+) create mode 100644 include/core/storage/graph/csr/graph.h create mode 100644 include/core/storage/graph/csr/vertex.h create mode 100644 test/core/storage/graph/csr/CMakeLists.txt create mode 100644 test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h new file mode 100644 index 00000000..33c05495 --- /dev/null +++ b/include/core/storage/graph/csr/graph.h @@ -0,0 +1,157 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph.h + * @brief CSR graph header file + * @todo +*/ + +#ifndef MORPHSTORE_GRAPH_CSR_H +#define MORPHSTORE_GRAPH_CSR_H + +#include +#include + +#include +#include +#include + +namespace morphstore{ + + class CSR: public morphstore::Graph{ + + private: + // main data structure: hash table (hash id to vertex) + // unordered_map has fast search time / look-up -> average = O(1); worst case = O(n): + std::unordered_map vertices; + + // graph-structure: 3 Arrays (row_array, col_array, val_array) + // row array('node array'): contains the offset in the col_array; vertex-system-id is index in the row_array + // col_array('edge array'): every cell represents an edge containing the vertex targets ID + // value_array: edge properties + uint64_t* node_array; + uint64_t* edge_array; + std::string* val_array; + + // lookup dictionaries for entities of vertices / relation names of edges + std::map entityDictionary; + std::map relationDictionary; + + const std::string storageFormat = "CSR"; + + + public: + + std::string getStorageFormat(){ + return storageFormat; + } + + // this functions allocates the memory for the graph structure arrays + void init(){ + // (1) get number of vertices from main data structure + uint64_t numberVertices = vertices.size(); + // (2) allocate node array memory + node_array = new uint64_t[numberVertices]; + } + + void add_vertex(){ + CSRVertex v; + vertices.insert(std::make_pair(v.getId(), v)); + } + + void add_property_to_vertex(uint64_t id, const std::pair& property){ + if(exist_id(id)){ + vertices.at(id).add_property(property); + }else{ + std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; + } + } + + void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation){ + // TODO + std::cout << sourceID << targetID << relation << std::endl; + } + + void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property){ + // TODO + std::cout << sourceID << targetID << relation << property.first << std::endl; + + } + + uint64_t add_vertex_with_properties(const std::unordered_map& props ){ + CSRVertex v; + v.add_properties(props); + vertices.insert(std::make_pair(v.getId(), v)); + return v.getId(); + } + + void add_entity_to_vertex(const uint64_t id, unsigned short int entity){ + if(exist_id(id)){ + vertices.at(id).setEntity(entity); + }else{ + std::cout << "Vertex with ID " << id << " does not exist in the database!"; + } + } + + // function to check if the ID is present or not + bool exist_id(const uint64_t id){ + if(vertices.find(id) == vertices.end()){ + return false; + } + return true; + } + std::string get_entity_by_number(unsigned short int e){ + if(entityDictionary.find( e ) != entityDictionary.end()){ + return entityDictionary.at(e); + }else{ + return "No Matching of entity-number in the database!"; + } + } + + void set_entity_dictionary(const std::map& entityList){ + this->entityDictionary = entityList; + } + + std::string get_relation_by_number(unsigned short int re){ + if(relationDictionary.find( re ) != relationDictionary.end()){ + return relationDictionary.at(re); + }else{ + return "No Matching of relation-number in the database!"; + } + } + + void set_relation_dictionary(const std::map& relationList){ + this->relationDictionary = relationList; + } + + + + // for debbuging + void statistics(){ + std::cout << "---------------- Statistics ----------------" << std::endl; + std::cout << "Number of vertices: " << vertices.size() << std::endl; + std::cout << "Number of relations/edges: " << std::endl; + std::cout << "--------------------------------------------" << std::endl; + } + + + }; + +} + +#endif //MORPHSTORE_GRAPH_CSR_H diff --git a/include/core/storage/graph/csr/vertex.h b/include/core/storage/graph/csr/vertex.h new file mode 100644 index 00000000..1e6017e9 --- /dev/null +++ b/include/core/storage/graph/csr/vertex.h @@ -0,0 +1,100 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertex.h + * @brief CSR vertex header file + * @todo +*/ + +#ifndef MORPHSTORE_VERTEX_CSR_H +#define MORPHSTORE_VERTEX_CSR_H + +#include +#include + +namespace morphstore{ + + class CSRVertex{ + + private: + // system-ID + uint64_t id; + // data 'properties' + std::unordered_map properties; + // entity-number for look-up + unsigned short int entity; + + public: + + CSRVertex(){ + // unique ID generation + static uint64_t startID = 0; + id = startID++; + } + + // add entity to vertex + void setEntity(unsigned short int e){ + this->entity = e; + } + + unsigned short int getEntity(){ + return this->entity; + } + + uint64_t getId() const{ + return id; + } + + // calculate size of a vertex for memory usage in bytes + size_t get_size_of_vertex() { + size_t size = 0; + size += sizeof(uint64_t); // id + // properties: + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ + size += sizeof(char)*(property->first.length() + property->second.length()); + } + // entities: + size += sizeof(unsigned short int); + + return size; + } + + // this function adds a whole property map to a vertex + void add_properties(const std::unordered_map &properties){ + if(!properties.empty()){ + this->properties = properties; + }else{ + std::cout << "The properties-list is empty!" << std::endl; + } + } + + // this adds one key-value pair to the vertex's property map + void add_property(const std::pair& property){ + this->properties[property.first] = std::move(property.second); + } + + void print_properties(){ + for(const auto& entry : properties){ + std::cout << "{" << entry.first << ": " << entry.second << "}"; + } + } + }; +} + +#endif //MORPHSTORE_VERTEX_CSR_H diff --git a/test/core/storage/graph/csr/CMakeLists.txt b/test/core/storage/graph/csr/CMakeLists.txt new file mode 100644 index 00000000..5189a2fa --- /dev/null +++ b/test/core/storage/graph/csr/CMakeLists.txt @@ -0,0 +1,15 @@ +if ( CTEST_ALL OR CTEST_STORAGE ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/ldbc_graph_csr_test_app ) + + add_executable( ldbc_graph_csr_test_app generate_ldbc_graph_csr.cpp) + target_compile_options( ldbc_graph_csr_test_app PRIVATE + -Werror + -Wall + -Wextra + -pedantic + -fstack-protector-all + $<$:-DDEBUG> ) + target_link_libraries( ldbc_graph_csr_test_app PRIVATE "-ldl" stdc++fs) + + add_test( ldbc_graph_csr_test ldbc_graph_csr_test_app ) +endif() \ No newline at end of file diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp new file mode 100644 index 00000000..a9b3ffbc --- /dev/null +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -0,0 +1,48 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file generate_ldbc_graph.cpp + * @brief Test for generating social network graph from LDBC files + * @todo + */ + +#include +#include +#include +#include // for high_resolution_clock + +int main( void ){ + + // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + + morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::CSR socialGraph; + + // create abstract pointer to adjc_list (ldbc importer just has to handle with one input class and not adjcancyList, CSR, ....) + morphstore::Graph *graph; + graph = &socialGraph; + + ldbcImport.generate_vertices(*graph); + + socialGraph.statistics(); + + std::cout << "Number of edges: " << ldbcImport.get_total_number_vertices() << std::endl; + std::cout << "Number of edges: " << ldbcImport.get_total_number_edges() << std::endl; + + return 0; +} \ No newline at end of file From 83f7b8e150e39e3a060d7cdcd32c7a87d3977092 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 19 Jul 2019 15:37:55 +0200 Subject: [PATCH 046/216] first woriking implementation of CSR storage format --- include/core/storage/graph/adj_list/graph.h | 5 +- .../core/storage/graph/adj_list/ldbc_import.h | 430 ++++++++++++++++++ include/core/storage/graph/csr/graph.h | 92 +++- .../storage/graph/{ => csr}/ldbc_import.h | 79 +++- include/core/storage/graph/graph_abstract.h | 57 --- .../adj_list/generate_ldbc_graph_adj_list.cpp | 20 +- .../graph/csr/generate_ldbc_graph_csr.cpp | 31 +- 7 files changed, 597 insertions(+), 117 deletions(-) create mode 100644 include/core/storage/graph/adj_list/ldbc_import.h rename include/core/storage/graph/{ => csr}/ldbc_import.h (88%) delete mode 100644 include/core/storage/graph/graph_abstract.h diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index a19356bc..6b865f2d 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -25,7 +25,6 @@ #define MORPHSTORE_GRAPH_ADJACENCY_LIST_H #include -#include #include #include @@ -36,7 +35,7 @@ namespace morphstore{ - class AdjacencyList: public morphstore::Graph{ + class AdjacencyList{ private: // main data structure: mapping global id -> vertex @@ -196,7 +195,7 @@ namespace morphstore{ const std::vector& adjList = v->get_adjList(); for(const auto& e : adjList){ - std::cout << "(" << e.target->getId() << "," << get_relation_by_number(e.relation) << ") "; + std::cout << "(" << e.target->getId() << "," << e.relation << "." << get_relation_by_number(e.relation) << ") "; } std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); diff --git a/include/core/storage/graph/adj_list/ldbc_import.h b/include/core/storage/graph/adj_list/ldbc_import.h new file mode 100644 index 00000000..c00d20e5 --- /dev/null +++ b/include/core/storage/graph/adj_list/ldbc_import.h @@ -0,0 +1,430 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file ldbc_import.h + * @brief this class reads the ldbc files and generates the graph in Adj-List format + * @todo +*/ + +#ifndef MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H +#define MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H + +#include + +#include +#include +#include +#include +#include +#include +#include + +// hash function used to hash a pair of any kind using XOR (for verticesMap) +struct hash_pair { + template + size_t operator()(const std::pair& p) const + { + auto hash1 = std::hash{}(p.first); + auto hash2 = std::hash{}(p.second); + return hash1 ^ hash2; + } +}; + +namespace morphstore{ + + class LDBCImportAdjList{ + + private: + std::string directory; + std::vector verticesPaths; + std::vector relationsPaths; + std::map entitiesLookup; + std::map relationsLookup; + // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id + std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; + + public: + + LDBCImportAdjList(const std::string& dir){ + directory = dir; + insert_file_names(directory); + } + + std::string getDirectory() const{ + return directory; + } + + // generate_vertices() + generate_edges() + void import(morphstore::AdjacencyList &graph){ + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); + + // (1) generate vertices + generate_vertices(graph); + // (2) generate edges + generate_edges(graph); + + std::cout << "--> done" << std::endl; + } + + // function which iterates through directory to receive file names (entire path) + void insert_file_names(std::string dir){ + for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ + // ignore files starting with a '.' + if(entry.path().string()[dir.size()] == '.'){ + continue; + }else{ + // insert file path to vertices or relations vector + differentiate(entry.path().string(), dir); + } + } + } + + // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector + void differentiate(std::string path, std::string dir){ + // if the string contains a '_' -> it's a relation file; otherwise a vertex file + // remove dir name to remain only the *.csv + if(path.substr(dir.size()).find('_') != std::string::npos ){ + relationsPaths.push_back(path); + }else{ + verticesPaths.push_back(path); + } + } + + // this function reads the vertices-files and creates vertices in a graph + void generate_vertices(morphstore::AdjacencyList &graph){ + + if(!verticesPaths.empty()) { + //std::cout << "(1/2) Generating LDBC-Vertices ..."; + //std::cout.flush(); + + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; + + // iterate through vector of vertex-addresses + for (const auto &address : verticesPaths) { + + // data structure for attributes of entity, e.g. taglass -> id, name, url + std::vector attributes; + + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + size_t last = 0; + size_t next = 0; + + // first line of *.csv contains the attributes -> write to attributes vector + if(start == 0){ + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos){ + attributes.push_back(row.substr(last, next-last)); + last = next + 1; + } + // last attribute + attributes.push_back(row.substr(last)); + }else{ + // actual data: + std::unordered_map properties; + size_t attrIndex = 0; + std::string ldbcID = row.substr(0, row.find(delimiter)); + while ((next = row.find(delimiter, last)) != std::string::npos){ + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); + last = next + 1; + ++attrIndex; + } + // last attribute + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + + //----------------------------------------------------- + // create vertex and insert into graph with properties + uint64_t systemID = graph.add_vertex_with_properties(properties); + // add entity number to vertex + graph.add_entity_to_vertex(systemID, entityNumber); + // map entity and ldbc id to system generated id + globalIdLookupMap.insert({{entity, ldbcID}, systemID}); + //----------------------------------------------------- + properties.clear(); // free memory + } + + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + vertexFile.close(); + + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair( entityNumber, entity)); + ++entityNumber; + } + // graph gets full entity-list here: + graph.set_entity_dictionary(entitiesLookup); + } + + } + + // function which returns true, if parameter is a entity in ldbc-files + bool is_entity(const std::string &entity){ + // iterate through entities-map to look up for paramater + for(auto const& entry : entitiesLookup){ + if(entry.second == entity){ + return true; + } + } + + return false; + } + + // function which returns true, if the relation already exist + bool exist_relation_name(const std::string& relation){ + // iterate through relations-map to look up for paramater + for(auto const& entry : relationsLookup){ + if(entry.second == relation){ + return true; + } + } + + return false; + } + + // this function reads the relation-files and generates edges in graph + void generate_edges(morphstore::AdjacencyList &graph){ + + + + if(!relationsPaths.empty()) { + //std::cout << "(2/2) Generating LDBC-Edges ..."; + //std::cout.flush(); + + //this variable is used for the relationLookup-keys, starting by 0 + unsigned short int relationNumber = 0; + bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) + + // iterate through vector of vertex-addresses + for (const auto &address : relationsPaths) { + + isRelation = false; + + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string fromEntity = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string relationName = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string toEntity = relation; + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!relationFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + relationFile.read(buffer, fileSize); // read data as one big block + + size_t start = 0; + std::string delimiter = "|"; + + // check from file name whether it's a relation file or multi value attribute file + if(!is_entity(toEntity)){ + // Multi-value-attributes: just take the last recently one + std::string propertyKey; + std::unordered_map multiValueAttr; + uint64_t systemID; + std::string value; + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' + if(start == 0){ + propertyKey = row.substr(row.find(delimiter) + 1); + }else{ + // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) + systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + value = row.substr(row.find(delimiter) + 1); + multiValueAttr[systemID] = std::move(value); + } + + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + // iterate through multiValue map and assign property to vertex + for(const auto &pair : multiValueAttr){ + const std::pair& keyValuePair = {propertyKey, pair.second}; + graph.add_property_to_vertex(pair.first, keyValuePair); + } + + } + // handling of relation-files ... + else{ + + isRelation = true; + + bool hasProperties = false; + std::string propertyKey; + uint64_t fromID, toID; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + size_t last = 0; + size_t next = 0; + size_t count = 0; + + // first line of *.csv: Differentiate whether it's + // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 + // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 + if(start == 0){ + // if there are 2 delimiter ('|') -> relation file with properties + while ((next = row.find(delimiter, last)) != std::string::npos){ + last = next + 1; + ++count; + } + if(count == 2){ + hasProperties = true; + propertyKey = row.substr(last); + } + }else{ + // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property + // get the system-(global) id's from local ids + fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + // remove from id from string + row.erase(0, row.find(delimiter) + delimiter.length()); + std::string value; + if(!hasProperties){ + // WITHOUT properties: just from the first delimiter on + toID = globalIdLookupMap.at({toEntity, row}); + + // Generate edge in graph + graph.add_edge(fromID, toID, relationNumber); + }else{ + // with properties means: toID is until the next delimiter, and then the value for the property + toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); + row.erase(0, row.find(delimiter) + delimiter.length()); + value = row; + graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); + } + } + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + } + delete[] buffer; // free memory + relationFile.close(); + + //check if the relation name is a relation (no multi value file) + if(isRelation){ + // check if the name already exists + if(!exist_relation_name(relationName)){ + // insert relation-number with string into map + relationsLookup.insert(std::make_pair( relationNumber, relationName)); + ++relationNumber; + } + } + + } + // graph gets full relation-list here: + graph.set_relation_dictionary(relationsLookup); + + globalIdLookupMap.clear(); // we dont need the lookup anymore -> clear + } + } + + // for debugging + void print_file_names(){ + std::cout << "Vertices-Files: " << std::endl; + for(const auto& v : verticesPaths){ + std::cout << "\t" << v << std::endl; + } + std::cout << "Relations-Files: " << std::endl; + for(const auto& rel : relationsPaths){ + std::cout << "\t" << rel << std::endl; + } + + } + + }; +} + +#endif //MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index 33c05495..fba0bd39 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -25,15 +25,15 @@ #define MORPHSTORE_GRAPH_CSR_H #include -#include #include #include +#include #include namespace morphstore{ - class CSR: public morphstore::Graph{ + class CSR{ private: // main data structure: hash table (hash id to vertex) @@ -43,10 +43,10 @@ namespace morphstore{ // graph-structure: 3 Arrays (row_array, col_array, val_array) // row array('node array'): contains the offset in the col_array; vertex-system-id is index in the row_array // col_array('edge array'): every cell represents an edge containing the vertex targets ID - // value_array: edge properties + // value_array: relation number uint64_t* node_array; uint64_t* edge_array; - std::string* val_array; + unsigned short int* val_array; // lookup dictionaries for entities of vertices / relation names of edges std::map entityDictionary; @@ -54,6 +54,8 @@ namespace morphstore{ const std::string storageFormat = "CSR"; + uint64_t numberEdges; + public: @@ -61,14 +63,29 @@ namespace morphstore{ return storageFormat; } + uint64_t getNumberEdges(){ + return numberEdges; + } + + void setNumberEdges(uint64_t edges){ + this->numberEdges = edges; + } + // this functions allocates the memory for the graph structure arrays - void init(){ - // (1) get number of vertices from main data structure - uint64_t numberVertices = vertices.size(); - // (2) allocate node array memory + void allocate_graph_structure(uint64_t numberVertices,uint64_t numberEdges){ + + // allocate node array: node_array = new uint64_t[numberVertices]; + + // allocate edge array: + edge_array = new uint64_t[numberEdges]; + setNumberEdges(numberEdges); + + // allocate val array: + val_array = new unsigned short int[numberEdges]; } + void add_vertex(){ CSRVertex v; vertices.insert(std::make_pair(v.getId(), v)); @@ -82,9 +99,14 @@ namespace morphstore{ } } - void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation){ - // TODO - std::cout << sourceID << targetID << relation << std::endl; + // this function adds the data in the CSR structure from LDBC-Importer + void add_edge_ldbc(uint64_t vertexID, uint64_t startOffset, const std::vector>& neighbors){ + node_array[vertexID] = startOffset; // offset in edge_array + for(auto const& pair : neighbors){ + edge_array[startOffset] = pair.first; // target id + val_array[startOffset] = pair.second; // relation number for lookup + ++startOffset; + } } void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property){ @@ -139,13 +161,57 @@ namespace morphstore{ this->relationDictionary = relationList; } + uint64_t getNumberVertices(){ + return vertices.size(); + } + + // calculate the graph size in bytes + size_t get_size_of_graph(){ + size_t size = 0; + // pointer to arrays: + size += sizeof(uint64_t*) * 2 + sizeof(unsigned short int*); + // vertices: + size += sizeof(uint64_t) * getNumberVertices(); + // edges: + size += sizeof(uint64_t) * getNumberEdges(); + // val array: + size += sizeof(unsigned short int) * getNumberEdges(); + + // vertex map wth actual data: + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size += it->second.get_size_of_vertex(); + } + + return size; + } + + + // for debugging + void print_vertex_by_id(uint64_t id){ + std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; + CSRVertex* v = &vertices.at(id); + uint64_t startOffset = node_array[id]; + uint64_t endOffset = node_array[id+1]; + std::cout << "Offset: " << startOffset << std::endl; + std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; + std::cout << "Entity: \t"<< get_entity_by_number(v->getEntity()) << std::endl; + std::cout << "#Edges: " << (endOffset-startOffset) << std::endl; + std::cout << "Relations: "; + for (uint64_t i = startOffset; i < endOffset; ++i) { + std::cout << "(" << edge_array[i] << "," << val_array[i] << "." << get_relation_by_number(val_array[i]) << ") "; + } + std::cout << "\n"; + std::cout << "Properties: "; v->print_properties(); + std::cout << "\n"; + std::cout << "-----------------------------------------------" << std::endl; + } // for debbuging void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << vertices.size() << std::endl; - std::cout << "Number of relations/edges: " << std::endl; + std::cout << "Number of vertices: " << getNumberVertices() << std::endl; + std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; std::cout << "--------------------------------------------" << std::endl; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/csr/ldbc_import.h similarity index 88% rename from include/core/storage/graph/ldbc_import.h rename to include/core/storage/graph/csr/ldbc_import.h index c0086dc6..b73928e5 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/csr/ldbc_import.h @@ -17,15 +17,14 @@ /** * @file ldbc_import.h - * @brief this class reads the ldbc files and generates the graph - * @todo + * @brief this class reads the ldbc files and generates the graph in CSR format + * @todo EDGE PROPERTIES ARE MISSING!!! */ -#ifndef MORPHSTORE_LDBC_IMPORT_H -#define MORPHSTORE_LDBC_IMPORT_H +#ifndef MORPHSTORE_LDBC_IMPORT_CSR_H +#define MORPHSTORE_LDBC_IMPORT_CSR_H #include -#include #include #include @@ -48,7 +47,7 @@ struct hash_pair { namespace morphstore{ - class LDBC_Import{ + class LDBCImportCSR{ private: std::string directory; @@ -58,10 +57,13 @@ namespace morphstore{ std::map relationsLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; + // main data structure + // map for lookup every system-id, the neigbors in the graph (for further processing, e.g. filling the edge_array in the right order) + std::unordered_map< uint64_t, std::vector>> vertexNeighborsLookup; public: - LDBC_Import(const std::string& dir){ + LDBCImportCSR(const std::string& dir){ directory = dir; insert_file_names(directory); } @@ -71,13 +73,15 @@ namespace morphstore{ } // generate_vertices() + generate_edges() - void import(morphstore::Graph &graph){ + void import(morphstore::CSR &graph){ std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); // (1) generate vertices generate_vertices(graph); - // (2) generate edges + // (2) allocate memory + allocate_graph_memory(graph); + // (3) generate edges generate_edges(graph); std::cout << "--> done" << std::endl; @@ -108,7 +112,7 @@ namespace morphstore{ } // this function reads the vertices-files and creates vertices in a graph - void generate_vertices(morphstore::Graph &graph){ + void generate_vertices(morphstore::CSR &graph){ if(!verticesPaths.empty()) { //std::cout << "(1/2) Generating LDBC-Vertices ..."; @@ -212,11 +216,6 @@ namespace morphstore{ graph.set_entity_dictionary(entitiesLookup); } - // BE CAREFUL WITH THIS: if the graph structure is CSR, we do futher stuff (malloc node array,....) - if(graph.getStorageFormat() == "CSR"){ - graph.init(); - } - } // function which returns true, if parameter is a entity in ldbc-files @@ -244,7 +243,7 @@ namespace morphstore{ } // this function reads the relation-files and generates edges in graph - void generate_edges(morphstore::Graph &graph){ + void generate_edges(morphstore::CSR &graph){ @@ -383,13 +382,18 @@ namespace morphstore{ toID = globalIdLookupMap.at({toEntity, row}); // Generate edge in graph - graph.add_edge(fromID, toID, relationNumber); + //graph.add_edge(fromID, toID, relationNumber); + + // insert relation into vertexNeighborsLookup + vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); + // add to graph + //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); + vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -414,6 +418,35 @@ namespace morphstore{ graph.set_relation_dictionary(relationsLookup); globalIdLookupMap.clear(); // we dont need the lookup anymore -> clear + + // do actual edge generation here: + write_vertexNeighborsLookup_into_graph(graph); + } + } + + void write_vertexNeighborsLookup_into_graph(morphstore::CSR &graph){ + // Write CSR arrays with data (offsets, number of relation,....): + uint64_t lastVertexID = graph.getNumberVertices() - 1; + uint64_t startOffset = 0; + + for(uint64_t vertexID = 0; vertexID < lastVertexID; ++vertexID){ + // get the list of target vertices + std::vector> neighbors; + neighbors = vertexNeighborsLookup[vertexID]; + //store the number for the offset in edge array + uint64_t endOffset = neighbors.size() + startOffset -1 ; + // VERTICES WITHOUT ANY EDGES -< TODO ? how to handle? + graph.add_edge_ldbc(vertexID, startOffset, neighbors); + + startOffset = endOffset + 1 ; + } + } + + void print_vertexNeighborsLookup_by_id(uint64_t id){ + std::cout << "Vertex-ID: " << id << std::endl; + std::cout << "#Neighbors: " << vertexNeighborsLookup[id].size() << std::endl; + for(auto const& entry : vertexNeighborsLookup[id]){ + std::cout << "( " << entry.first << ", " << entry.second << " ) "; } } @@ -549,7 +582,15 @@ namespace morphstore{ } return result; } + + // this function allocates the memory used for the graph structure in CSR (arrays) + void allocate_graph_memory(morphstore::CSR &graph){ + // get number of vertices and number of edges + uint64_t numberVertices = graph.getNumberVertices(); + uint64_t numberEdges = get_total_number_edges(); + graph.allocate_graph_structure(numberVertices, numberEdges); + } }; } -#endif //MORPHSTORE_LDBC_IMPORT_H +#endif //MORPHSTORE_LDBC_IMPORT_CSR_H diff --git a/include/core/storage/graph/graph_abstract.h b/include/core/storage/graph/graph_abstract.h deleted file mode 100644 index 1afd254b..00000000 --- a/include/core/storage/graph/graph_abstract.h +++ /dev/null @@ -1,57 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file graph_abstract.h - * @brief this abstract class is for the ldbc importer for polymorphism of different storage formats (pointer to derived classes) - * @todo add all used functions of graphs in ldbc importer class -*/ - -#ifndef MORPHSTORE_GRAPH_ABSTRACT_H -#define MORPHSTORE_GRAPH_ABSTRACT_H - -#include -#include - -namespace morphstore{ - - class Graph{ - public: - virtual ~Graph() { }; - virtual std::string getStorageFormat() = 0; - - // init: - virtual void init() = 0; - - // Vertex-functions for ldbc-importer: - virtual void add_vertex() = 0; - virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; // - virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; - virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; // - - // edge functions: - virtual void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int relation) = 0 ; - virtual void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property) = 0; - - // dictionary functions: - virtual void set_entity_dictionary(const std::map& entityList) = 0; // - virtual void set_relation_dictionary(const std::map& relationList) = 0; // - - }; -} - -#endif //MORPHSTORE_GRAPH_ABSTRACT_H diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp index b77563cf..0475511b 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp @@ -17,13 +17,12 @@ /** * @file generate_ldbc_graph.cpp - * @brief Test for generating social network graph from LDBC files + * @brief Test for generating social network graph as Adj-List from LDBC files * @todo */ +#include #include -#include -#include #include // for high_resolution_clock int main( void ){ @@ -31,24 +30,16 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImportAdjList ldbcImportAdjList("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::AdjacencyList socialGraph; - // create abstract pointer to adjc_list (ldbc importer just has to handle with one input class and not adjcancyList, CSR, ....) - morphstore::Graph *graph; - graph = &socialGraph; - // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.import(*graph); + ldbcImportAdjList.import(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time std::chrono::duration elapsed = finish - start; - // delete graph-pointer to adj_list socialGraph - graph = NULL; - delete graph; - socialGraph.statistics(); std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; @@ -57,8 +48,9 @@ int main( void ){ socialGraph.print_vertex_by_id(100454); socialGraph.print_vertex_by_id(100450); socialGraph.print_vertex_by_id(100168); + socialGraph.print_vertex_by_id(2000100); */ - + // calculate size of social graph std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp index a9b3ffbc..28df5ca2 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -17,32 +17,41 @@ /** * @file generate_ldbc_graph.cpp - * @brief Test for generating social network graph from LDBC files + * @brief Test for generating social network graph in CSR format from LDBC files * @todo */ +#include #include -#include -#include #include // for high_resolution_clock int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBC_Import ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImportCSR ldbcImportCsr("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::CSR socialGraph; - // create abstract pointer to adjc_list (ldbc importer just has to handle with one input class and not adjcancyList, CSR, ....) - morphstore::Graph *graph; - graph = &socialGraph; + ldbcImportCsr.import(socialGraph); - ldbcImport.generate_vertices(*graph); + // measuring time... + auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time + std::chrono::duration elapsed = finish - start; socialGraph.statistics(); - - std::cout << "Number of edges: " << ldbcImport.get_total_number_vertices() << std::endl; - std::cout << "Number of edges: " << ldbcImport.get_total_number_edges() << std::endl; + std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; + + /* + // test vertices: + socialGraph.print_vertex_by_id(100454); + socialGraph.print_vertex_by_id(100450); + socialGraph.print_vertex_by_id(100168); + socialGraph.print_vertex_by_id(2000100); + */ + + // calculate size of social graph + std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; return 0; } \ No newline at end of file From af388bf3a7fb5f8bd3ee0cd0d9468c4dd4a9b84b Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 10:33:39 +0200 Subject: [PATCH 047/216] csr: now sorting neighbors of vertex ASC in the intermediates --- .../core/storage/graph/adj_list/ldbc_import.h | 13 +++++- include/core/storage/graph/csr/graph.h | 2 +- include/core/storage/graph/csr/ldbc_import.h | 41 +++++++++++++------ .../graph/csr/generate_ldbc_graph_csr.cpp | 2 + 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/include/core/storage/graph/adj_list/ldbc_import.h b/include/core/storage/graph/adj_list/ldbc_import.h index c00d20e5..6aa62583 100644 --- a/include/core/storage/graph/adj_list/ldbc_import.h +++ b/include/core/storage/graph/adj_list/ldbc_import.h @@ -79,6 +79,9 @@ namespace morphstore{ // (2) generate edges generate_edges(graph); + // (3) clear intermediates + clear_intermediates(); + std::cout << "--> done" << std::endl; } @@ -406,8 +409,6 @@ namespace morphstore{ } // graph gets full relation-list here: graph.set_relation_dictionary(relationsLookup); - - globalIdLookupMap.clear(); // we dont need the lookup anymore -> clear } } @@ -424,6 +425,14 @@ namespace morphstore{ } + void clear_intermediates(){ + globalIdLookupMap.clear(); + relationsLookup.clear(); + entitiesLookup.clear(); + relationsPaths.clear(); + verticesPaths.clear(); + } + }; } diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index fba0bd39..ce381c1c 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -72,7 +72,7 @@ namespace morphstore{ } // this functions allocates the memory for the graph structure arrays - void allocate_graph_structure(uint64_t numberVertices,uint64_t numberEdges){ + void allocate_graph_structure_memory(uint64_t numberVertices,uint64_t numberEdges){ // allocate node array: node_array = new uint64_t[numberVertices]; diff --git a/include/core/storage/graph/csr/ldbc_import.h b/include/core/storage/graph/csr/ldbc_import.h index b73928e5..a3891b6e 100644 --- a/include/core/storage/graph/csr/ldbc_import.h +++ b/include/core/storage/graph/csr/ldbc_import.h @@ -80,10 +80,13 @@ namespace morphstore{ // (1) generate vertices generate_vertices(graph); // (2) allocate memory - allocate_graph_memory(graph); + allocate_graph_structure_memory(graph); // (3) generate edges generate_edges(graph); + // (4) remove intermediates + clear_intermediates(); + std::cout << "--> done" << std::endl; } @@ -417,14 +420,16 @@ namespace morphstore{ // graph gets full relation-list here: graph.set_relation_dictionary(relationsLookup); - globalIdLookupMap.clear(); // we dont need the lookup anymore -> clear - // do actual edge generation here: write_vertexNeighborsLookup_into_graph(graph); } } + // this function writes the actual data from the intermediate vertexNeighborsLookup int to the arrays in the csr format void write_vertexNeighborsLookup_into_graph(morphstore::CSR &graph){ + // firstly, sorting the intermediates with their target IDs ASC + sort_VertexNeighborsLookup(); + // Write CSR arrays with data (offsets, number of relation,....): uint64_t lastVertexID = graph.getNumberVertices() - 1; uint64_t startOffset = 0; @@ -442,14 +447,6 @@ namespace morphstore{ } } - void print_vertexNeighborsLookup_by_id(uint64_t id){ - std::cout << "Vertex-ID: " << id << std::endl; - std::cout << "#Neighbors: " << vertexNeighborsLookup[id].size() << std::endl; - for(auto const& entry : vertexNeighborsLookup[id]){ - std::cout << "( " << entry.first << ", " << entry.second << " ) "; - } - } - // for debugging void print_file_names(){ std::cout << "Vertices-Files: " << std::endl; @@ -584,11 +581,29 @@ namespace morphstore{ } // this function allocates the memory used for the graph structure in CSR (arrays) - void allocate_graph_memory(morphstore::CSR &graph){ + void allocate_graph_structure_memory(morphstore::CSR &graph){ // get number of vertices and number of edges uint64_t numberVertices = graph.getNumberVertices(); uint64_t numberEdges = get_total_number_edges(); - graph.allocate_graph_structure(numberVertices, numberEdges); + graph.allocate_graph_structure_memory(numberVertices, numberEdges); + } + + // this function clears all intermediates + void clear_intermediates(){ + globalIdLookupMap.clear(); + vertexNeighborsLookup.clear(); + relationsLookup.clear(); + entitiesLookup.clear(); + relationsPaths.clear(); + verticesPaths.clear(); + } + + // function for sorting the vertexNeighborsLookup ASC + void sort_VertexNeighborsLookup(){ + // sorting the first element of the pair (target-id) + for(auto &it: vertexNeighborsLookup){ + std::sort(it.second.begin(), it.second.end()); + } } }; } diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp index 28df5ca2..3943d98a 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -50,6 +50,8 @@ int main( void ){ socialGraph.print_vertex_by_id(2000100); */ + socialGraph.print_vertex_by_id(100168); + // calculate size of social graph std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; From bf0992f5c40f746c89b4d7d252fb48ce30129b17 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 10:41:47 +0200 Subject: [PATCH 048/216] remove test-print vertex --- test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp index 3943d98a..28df5ca2 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -50,8 +50,6 @@ int main( void ){ socialGraph.print_vertex_by_id(2000100); */ - socialGraph.print_vertex_by_id(100168); - // calculate size of social graph std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; From 1c9da54514d32722bf5e250c4c214638b79d2927 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 11:13:01 +0200 Subject: [PATCH 049/216] remove specific ldbc-importer header for CSR/ADJ; now one importer for both --- .../core/storage/graph/adj_list/ldbc_import.h | 439 --------------- .../storage/graph/{csr => }/ldbc_import.h | 506 +++++++++++++----- .../adj_list/generate_ldbc_graph_adj_list.cpp | 6 +- .../graph/csr/generate_ldbc_graph_csr.cpp | 6 +- 4 files changed, 387 insertions(+), 570 deletions(-) delete mode 100644 include/core/storage/graph/adj_list/ldbc_import.h rename include/core/storage/graph/{csr => }/ldbc_import.h (64%) diff --git a/include/core/storage/graph/adj_list/ldbc_import.h b/include/core/storage/graph/adj_list/ldbc_import.h deleted file mode 100644 index 6aa62583..00000000 --- a/include/core/storage/graph/adj_list/ldbc_import.h +++ /dev/null @@ -1,439 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file ldbc_import.h - * @brief this class reads the ldbc files and generates the graph in Adj-List format - * @todo -*/ - -#ifndef MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H -#define MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H - -#include - -#include -#include -#include -#include -#include -#include -#include - -// hash function used to hash a pair of any kind using XOR (for verticesMap) -struct hash_pair { - template - size_t operator()(const std::pair& p) const - { - auto hash1 = std::hash{}(p.first); - auto hash2 = std::hash{}(p.second); - return hash1 ^ hash2; - } -}; - -namespace morphstore{ - - class LDBCImportAdjList{ - - private: - std::string directory; - std::vector verticesPaths; - std::vector relationsPaths; - std::map entitiesLookup; - std::map relationsLookup; - // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id - std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; - - public: - - LDBCImportAdjList(const std::string& dir){ - directory = dir; - insert_file_names(directory); - } - - std::string getDirectory() const{ - return directory; - } - - // generate_vertices() + generate_edges() - void import(morphstore::AdjacencyList &graph){ - std::cout << "Importing LDBC-files into graph ... "; - std::cout.flush(); - - // (1) generate vertices - generate_vertices(graph); - // (2) generate edges - generate_edges(graph); - - // (3) clear intermediates - clear_intermediates(); - - std::cout << "--> done" << std::endl; - } - - // function which iterates through directory to receive file names (entire path) - void insert_file_names(std::string dir){ - for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ - // ignore files starting with a '.' - if(entry.path().string()[dir.size()] == '.'){ - continue; - }else{ - // insert file path to vertices or relations vector - differentiate(entry.path().string(), dir); - } - } - } - - // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector - void differentiate(std::string path, std::string dir){ - // if the string contains a '_' -> it's a relation file; otherwise a vertex file - // remove dir name to remain only the *.csv - if(path.substr(dir.size()).find('_') != std::string::npos ){ - relationsPaths.push_back(path); - }else{ - verticesPaths.push_back(path); - } - } - - // this function reads the vertices-files and creates vertices in a graph - void generate_vertices(morphstore::AdjacencyList &graph){ - - if(!verticesPaths.empty()) { - //std::cout << "(1/2) Generating LDBC-Vertices ..."; - //std::cout.flush(); - - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; - - // iterate through vector of vertex-addresses - for (const auto &address : verticesPaths) { - - // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; - - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - - char* buffer; - - uint64_t fileSize = 0; - - std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - - if (!vertexFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - vertexFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; - - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - size_t last = 0; - size_t next = 0; - - // first line of *.csv contains the attributes -> write to attributes vector - if(start == 0){ - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes.push_back(row.substr(last, next-last)); - last = next + 1; - } - // last attribute - attributes.push_back(row.substr(last)); - }else{ - // actual data: - std::unordered_map properties; - size_t attrIndex = 0; - std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos){ - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); - last = next + 1; - ++attrIndex; - } - // last attribute - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); - - //----------------------------------------------------- - // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); - // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, systemID}); - //----------------------------------------------------- - properties.clear(); // free memory - } - - start = i; // set new starting point for buffer (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - vertexFile.close(); - - // insert entity-number with string into map - entitiesLookup.insert(std::make_pair( entityNumber, entity)); - ++entityNumber; - } - // graph gets full entity-list here: - graph.set_entity_dictionary(entitiesLookup); - } - - } - - // function which returns true, if parameter is a entity in ldbc-files - bool is_entity(const std::string &entity){ - // iterate through entities-map to look up for paramater - for(auto const& entry : entitiesLookup){ - if(entry.second == entity){ - return true; - } - } - - return false; - } - - // function which returns true, if the relation already exist - bool exist_relation_name(const std::string& relation){ - // iterate through relations-map to look up for paramater - for(auto const& entry : relationsLookup){ - if(entry.second == relation){ - return true; - } - } - - return false; - } - - // this function reads the relation-files and generates edges in graph - void generate_edges(morphstore::AdjacencyList &graph){ - - - - if(!relationsPaths.empty()) { - //std::cout << "(2/2) Generating LDBC-Edges ..."; - //std::cout.flush(); - - //this variable is used for the relationLookup-keys, starting by 0 - unsigned short int relationNumber = 0; - bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) - - // iterate through vector of vertex-addresses - for (const auto &address : relationsPaths) { - - isRelation = false; - - // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - std::string fromEntity = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); - - std::string relationName = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); - - std::string toEntity = relation; - - char* buffer; - - uint64_t fileSize = 0; - - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - - if (!relationFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - relationFile.read(buffer, fileSize); // read data as one big block - - size_t start = 0; - std::string delimiter = "|"; - - // check from file name whether it's a relation file or multi value attribute file - if(!is_entity(toEntity)){ - // Multi-value-attributes: just take the last recently one - std::string propertyKey; - std::unordered_map multiValueAttr; - uint64_t systemID; - std::string value; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' - if(start == 0){ - propertyKey = row.substr(row.find(delimiter) + 1); - }else{ - // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) - systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); - value = row.substr(row.find(delimiter) + 1); - multiValueAttr[systemID] = std::move(value); - } - - start = i; // set new starting point for buffer (otherwise it's concatenated) - } - } - // iterate through multiValue map and assign property to vertex - for(const auto &pair : multiValueAttr){ - const std::pair& keyValuePair = {propertyKey, pair.second}; - graph.add_property_to_vertex(pair.first, keyValuePair); - } - - } - // handling of relation-files ... - else{ - - isRelation = true; - - bool hasProperties = false; - std::string propertyKey; - uint64_t fromID, toID; - - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - size_t last = 0; - size_t next = 0; - size_t count = 0; - - // first line of *.csv: Differentiate whether it's - // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 - // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 - if(start == 0){ - // if there are 2 delimiter ('|') -> relation file with properties - while ((next = row.find(delimiter, last)) != std::string::npos){ - last = next + 1; - ++count; - } - if(count == 2){ - hasProperties = true; - propertyKey = row.substr(last); - } - }else{ - // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property - // get the system-(global) id's from local ids - fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); - // remove from id from string - row.erase(0, row.find(delimiter) + delimiter.length()); - std::string value; - if(!hasProperties){ - // WITHOUT properties: just from the first delimiter on - toID = globalIdLookupMap.at({toEntity, row}); - - // Generate edge in graph - graph.add_edge(fromID, toID, relationNumber); - }else{ - // with properties means: toID is until the next delimiter, and then the value for the property - toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); - row.erase(0, row.find(delimiter) + delimiter.length()); - value = row; - graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); - } - } - start = i; // set new starting point for buffer (otherwise it's concatenated) - } - } - } - delete[] buffer; // free memory - relationFile.close(); - - //check if the relation name is a relation (no multi value file) - if(isRelation){ - // check if the name already exists - if(!exist_relation_name(relationName)){ - // insert relation-number with string into map - relationsLookup.insert(std::make_pair( relationNumber, relationName)); - ++relationNumber; - } - } - - } - // graph gets full relation-list here: - graph.set_relation_dictionary(relationsLookup); - } - } - - // for debugging - void print_file_names(){ - std::cout << "Vertices-Files: " << std::endl; - for(const auto& v : verticesPaths){ - std::cout << "\t" << v << std::endl; - } - std::cout << "Relations-Files: " << std::endl; - for(const auto& rel : relationsPaths){ - std::cout << "\t" << rel << std::endl; - } - - } - - void clear_intermediates(){ - globalIdLookupMap.clear(); - relationsLookup.clear(); - entitiesLookup.clear(); - relationsPaths.clear(); - verticesPaths.clear(); - } - - }; -} - -#endif //MORPHSTORE_LDBC_IMPORT_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/csr/ldbc_import.h b/include/core/storage/graph/ldbc_import.h similarity index 64% rename from include/core/storage/graph/csr/ldbc_import.h rename to include/core/storage/graph/ldbc_import.h index a3891b6e..fa2d21da 100644 --- a/include/core/storage/graph/csr/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -17,13 +17,14 @@ /** * @file ldbc_import.h - * @brief this class reads the ldbc files and generates the graph in CSR format - * @todo EDGE PROPERTIES ARE MISSING!!! + * @brief this class reads the ldbc files and generates the graph in CSR or AdjList + * @todo CSR-EDGE PROPERTIES ARE MISSING!!! */ -#ifndef MORPHSTORE_LDBC_IMPORT_CSR_H -#define MORPHSTORE_LDBC_IMPORT_CSR_H +#ifndef MORPHSTORE_LDBC_IMPORT_H +#define MORPHSTORE_LDBC_IMPORT_H +#include #include #include @@ -47,7 +48,7 @@ struct hash_pair { namespace morphstore{ - class LDBCImportCSR{ + class LDBCImport{ private: std::string directory; @@ -57,13 +58,14 @@ namespace morphstore{ std::map relationsLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; - // main data structure + + // for CSR data structure // map for lookup every system-id, the neigbors in the graph (for further processing, e.g. filling the edge_array in the right order) std::unordered_map< uint64_t, std::vector>> vertexNeighborsLookup; public: - LDBCImportCSR(const std::string& dir){ + LDBCImport(const std::string& dir){ directory = dir; insert_file_names(directory); } @@ -72,24 +74,6 @@ namespace morphstore{ return directory; } - // generate_vertices() + generate_edges() - void import(morphstore::CSR &graph){ - std::cout << "Importing LDBC-files into graph ... "; - std::cout.flush(); - - // (1) generate vertices - generate_vertices(graph); - // (2) allocate memory - allocate_graph_structure_memory(graph); - // (3) generate edges - generate_edges(graph); - - // (4) remove intermediates - clear_intermediates(); - - std::cout << "--> done" << std::endl; - } - // function which iterates through directory to receive file names (entire path) void insert_file_names(std::string dir){ for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ @@ -114,8 +98,143 @@ namespace morphstore{ } } + // function which returns true, if parameter is a entity in ldbc-files + bool is_entity(const std::string &entity){ + // iterate through entities-map to look up for paramater + for(auto const& entry : entitiesLookup){ + if(entry.second == entity){ + return true; + } + } + + return false; + } + + // function which returns true, if the relation already exist + bool exist_relation_name(const std::string& relation){ + // iterate through relations-map to look up for paramater + for(auto const& entry : relationsLookup){ + if(entry.second == relation){ + return true; + } + } + + return false; + } + + // for debugging + void print_file_names(){ + std::cout << "Vertices-Files: " << std::endl; + for(const auto& v : verticesPaths){ + std::cout << "\t" << v << std::endl; + } + std::cout << "Relations-Files: " << std::endl; + for(const auto& rel : relationsPaths){ + std::cout << "\t" << rel << std::endl; + } + + } + + // function which clears all intermediates after import + void clear_intermediates(){ + globalIdLookupMap.clear(); + relationsLookup.clear(); + entitiesLookup.clear(); + relationsPaths.clear(); + verticesPaths.clear(); + } + + // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the entity lookup creation) + uint64_t get_total_number_edges(){ + + uint64_t result = 0 ; + + if(!relationsPaths.empty()) { + + // iterate through vector of relation-addresses + for (const auto &address : relationsPaths) { + + // TODO OPTIMIZE HERE: remove string operations + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string fromEntity = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string relationName = relation.substr(0, relation.find('_')); + relation.erase(0, relation.find('_') + 1); + + std::string toEntity = relation; + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!relationFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (relationFile.is_open()) { + fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + relationFile.clear(); + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + relationFile.read(buffer, fileSize); // read data as one big block + bool firstLine = true; + + // check from file name whether it's a relation file or multi value attribute file + if(is_entity(toEntity)){ + + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // skip first line (attributes infos....) + if(firstLine){ + firstLine = false; + }else{ + ++result; + } + } + } + + } + + delete[] buffer; // free memory + relationFile.close(); + + } + } + return result; + } + + + + // -------------------------------- Adj-List-specific functions -------------------------------- + + // Import into Adj-List-Format: + // generate_vertices() + generate_edges() + void import_adj_list(morphstore::AdjacencyList &graph){ + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); + + // (1) generate vertices + generate_vertices_adj_list(graph); + // (2) generate edges + generate_edges_adj_list(graph); + + // (3) clear intermediates + clear_intermediates(); + + std::cout << "--> done" << std::endl; + } + // this function reads the vertices-files and creates vertices in a graph - void generate_vertices(morphstore::CSR &graph){ + void generate_vertices_adj_list(morphstore::AdjacencyList &graph){ if(!verticesPaths.empty()) { //std::cout << "(1/2) Generating LDBC-Vertices ..."; @@ -221,32 +340,8 @@ namespace morphstore{ } - // function which returns true, if parameter is a entity in ldbc-files - bool is_entity(const std::string &entity){ - // iterate through entities-map to look up for paramater - for(auto const& entry : entitiesLookup){ - if(entry.second == entity){ - return true; - } - } - - return false; - } - - // function which returns true, if the relation already exist - bool exist_relation_name(const std::string& relation){ - // iterate through relations-map to look up for paramater - for(auto const& entry : relationsLookup){ - if(entry.second == relation){ - return true; - } - } - - return false; - } - // this function reads the relation-files and generates edges in graph - void generate_edges(morphstore::CSR &graph){ + void generate_edges_adj_list(morphstore::AdjacencyList &graph){ @@ -385,18 +480,13 @@ namespace morphstore{ toID = globalIdLookupMap.at({toEntity, row}); // Generate edge in graph - //graph.add_edge(fromID, toID, relationNumber); - - // insert relation into vertexNeighborsLookup - vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + graph.add_edge(fromID, toID, relationNumber); }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // add to graph - //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); - vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -419,57 +509,51 @@ namespace morphstore{ } // graph gets full relation-list here: graph.set_relation_dictionary(relationsLookup); - - // do actual edge generation here: - write_vertexNeighborsLookup_into_graph(graph); } } - // this function writes the actual data from the intermediate vertexNeighborsLookup int to the arrays in the csr format - void write_vertexNeighborsLookup_into_graph(morphstore::CSR &graph){ - // firstly, sorting the intermediates with their target IDs ASC - sort_VertexNeighborsLookup(); - // Write CSR arrays with data (offsets, number of relation,....): - uint64_t lastVertexID = graph.getNumberVertices() - 1; - uint64_t startOffset = 0; - for(uint64_t vertexID = 0; vertexID < lastVertexID; ++vertexID){ - // get the list of target vertices - std::vector> neighbors; - neighbors = vertexNeighborsLookup[vertexID]; - //store the number for the offset in edge array - uint64_t endOffset = neighbors.size() + startOffset -1 ; - // VERTICES WITHOUT ANY EDGES -< TODO ? how to handle? - graph.add_edge_ldbc(vertexID, startOffset, neighbors); + // -------------------------------- CSR-specific functions -------------------------------- - startOffset = endOffset + 1 ; - } - } + // Import into CSR-Format: + // generate_vertices() + generate_edges() + void import_csr(morphstore::CSR &graph){ + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); - // for debugging - void print_file_names(){ - std::cout << "Vertices-Files: " << std::endl; - for(const auto& v : verticesPaths){ - std::cout << "\t" << v << std::endl; - } - std::cout << "Relations-Files: " << std::endl; - for(const auto& rel : relationsPaths){ - std::cout << "\t" << rel << std::endl; - } + // (1) generate vertices + generate_vertices_csr(graph); + // (2) allocate memory + allocate_graph_structure_memory_csr(graph); + // (3) generate edges + generate_edges_csr(graph); - } + // (4) remove intermediates + clear_intermediates(); - // function that returns number of vertices - uint64_t get_total_number_vertices(){ + std::cout << "--> done" << std::endl; + } - uint64_t result = 0; + // this function reads the vertices-files and creates vertices in a graph + void generate_vertices_csr(morphstore::CSR &graph){ if(!verticesPaths.empty()) { + //std::cout << "(1/2) Generating LDBC-Vertices ..."; + //std::cout.flush(); + + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; // iterate through vector of vertex-addresses for (const auto &address : verticesPaths) { + // data structure for attributes of entity, e.g. taglass -> id, name, url + std::vector attributes; + + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + char* buffer; uint64_t fileSize = 0; @@ -491,38 +575,91 @@ namespace morphstore{ // allocate memory buffer = (char*) malloc( fileSize * sizeof( char ) ); vertexFile.read(buffer, fileSize); // read data as one big block - bool firstLine = true; + size_t start = 0; + std::string delimiter = "|"; // read buffer and do the magic ... for(size_t i = 0; i < fileSize; ++i){ if(buffer[i] == '\n'){ - // skip first line... - if(firstLine){ - firstLine = false; + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + size_t last = 0; + size_t next = 0; + + // first line of *.csv contains the attributes -> write to attributes vector + if(start == 0){ + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos){ + attributes.push_back(row.substr(last, next-last)); + last = next + 1; + } + // last attribute + attributes.push_back(row.substr(last)); }else{ - ++result; + // actual data: + std::unordered_map properties; + size_t attrIndex = 0; + std::string ldbcID = row.substr(0, row.find(delimiter)); + while ((next = row.find(delimiter, last)) != std::string::npos){ + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); + last = next + 1; + ++attrIndex; + } + // last attribute + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + + //----------------------------------------------------- + // create vertex and insert into graph with properties + uint64_t systemID = graph.add_vertex_with_properties(properties); + // add entity number to vertex + graph.add_entity_to_vertex(systemID, entityNumber); + // map entity and ldbc id to system generated id + globalIdLookupMap.insert({{entity, ldbcID}, systemID}); + //----------------------------------------------------- + properties.clear(); // free memory } + + start = i; // set new starting point for buffer (otherwise it's concatenated) } } delete[] buffer; // free memory vertexFile.close(); + + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair( entityNumber, entity)); + ++entityNumber; } + // graph gets full entity-list here: + graph.set_entity_dictionary(entitiesLookup); } - return result; + } - // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because entity lookup creation) - uint64_t get_total_number_edges(){ + // this function reads the relation-files and generates edges in graph + void generate_edges_csr(morphstore::CSR &graph){ + - uint64_t result = 0 ; if(!relationsPaths.empty()) { + //std::cout << "(2/2) Generating LDBC-Edges ..."; + //std::cout.flush(); - // iterate through vector of relation-addresses + //this variable is used for the relationLookup-keys, starting by 0 + unsigned short int relationNumber = 0; + bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) + + // iterate through vector of vertex-addresses for (const auto &address : relationsPaths) { - // TODO OPTIMIZE HERE: remove string operations + isRelation = false; + // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); std::string fromEntity = relation.substr(0, relation.find('_')); @@ -554,58 +691,177 @@ namespace morphstore{ // allocate memory buffer = (char*) malloc( fileSize * sizeof( char ) ); relationFile.read(buffer, fileSize); // read data as one big block - bool firstLine = true; + + size_t start = 0; + std::string delimiter = "|"; // check from file name whether it's a relation file or multi value attribute file - if(is_entity(toEntity)){ + if(!is_entity(toEntity)){ + // Multi-value-attributes: just take the last recently one + std::string propertyKey; + std::unordered_map multiValueAttr; + uint64_t systemID; + std::string value; for(size_t i = 0; i < fileSize; ++i){ if(buffer[i] == '\n'){ - // skip first line (attributes infos....) - if(firstLine){ - firstLine = false; + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' + if(start == 0){ + propertyKey = row.substr(row.find(delimiter) + 1); }else{ - ++result; + // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) + systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + value = row.substr(row.find(delimiter) + 1); + multiValueAttr[systemID] = std::move(value); } + + start = i; // set new starting point for buffer (otherwise it's concatenated) } } + // iterate through multiValue map and assign property to vertex + for(const auto &pair : multiValueAttr){ + const std::pair& keyValuePair = {propertyKey, pair.second}; + graph.add_property_to_vertex(pair.first, keyValuePair); + } } + // handling of relation-files ... + else{ + + isRelation = true; + + bool hasProperties = false; + std::string propertyKey; + uint64_t fromID, toID; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + size_t last = 0; + size_t next = 0; + size_t count = 0; + // first line of *.csv: Differentiate whether it's + // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 + // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 + if(start == 0){ + // if there are 2 delimiter ('|') -> relation file with properties + while ((next = row.find(delimiter, last)) != std::string::npos){ + last = next + 1; + ++count; + } + if(count == 2){ + hasProperties = true; + propertyKey = row.substr(last); + } + }else{ + // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property + // get the system-(global) id's from local ids + fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + // remove from id from string + row.erase(0, row.find(delimiter) + delimiter.length()); + std::string value; + if(!hasProperties){ + // WITHOUT properties: just from the first delimiter on + toID = globalIdLookupMap.at({toEntity, row}); + + // Generate edge in graph + //graph.add_edge(fromID, toID, relationNumber); + + // insert relation into vertexNeighborsLookup + vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + }else{ + // with properties means: toID is until the next delimiter, and then the value for the property + toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); + row.erase(0, row.find(delimiter) + delimiter.length()); + value = row; + // add to graph + //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); + vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + } + } + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + } delete[] buffer; // free memory relationFile.close(); + //check if the relation name is a relation (no multi value file) + if(isRelation){ + // check if the name already exists + if(!exist_relation_name(relationName)){ + // insert relation-number with string into map + relationsLookup.insert(std::make_pair( relationNumber, relationName)); + ++relationNumber; + } + } + } + // graph gets full relation-list here: + graph.set_relation_dictionary(relationsLookup); + + // do actual edge generation here: + write_vertexNeighborsLookup_into_graph_csr(graph); } - return result; } // this function allocates the memory used for the graph structure in CSR (arrays) - void allocate_graph_structure_memory(morphstore::CSR &graph){ + void allocate_graph_structure_memory_csr(morphstore::CSR &graph){ // get number of vertices and number of edges uint64_t numberVertices = graph.getNumberVertices(); uint64_t numberEdges = get_total_number_edges(); graph.allocate_graph_structure_memory(numberVertices, numberEdges); } - // this function clears all intermediates - void clear_intermediates(){ - globalIdLookupMap.clear(); - vertexNeighborsLookup.clear(); - relationsLookup.clear(); - entitiesLookup.clear(); - relationsPaths.clear(); - verticesPaths.clear(); - } - - // function for sorting the vertexNeighborsLookup ASC - void sort_VertexNeighborsLookup(){ + // function for sorting the vertexNeighborsLookup ASC in CSR + void sort_VertexNeighborsLookup_csr(){ // sorting the first element of the pair (target-id) for(auto &it: vertexNeighborsLookup){ std::sort(it.second.begin(), it.second.end()); } } + + // this function writes the actual data from the intermediate vertexNeighborsLookup int to the arrays in the csr format + void write_vertexNeighborsLookup_into_graph_csr(morphstore::CSR &graph){ + // firstly, sorting the intermediates with their target IDs ASC + sort_VertexNeighborsLookup_csr(); + + // Write CSR arrays with data (offsets, number of relation,....): + uint64_t lastVertexID = graph.getNumberVertices() - 1; + uint64_t startOffset = 0; + + for(uint64_t vertexID = 0; vertexID < lastVertexID; ++vertexID){ + // get the list of target vertices + std::vector> neighbors; + neighbors = vertexNeighborsLookup[vertexID]; + //store the number for the offset in edge array + uint64_t endOffset = neighbors.size() + startOffset -1 ; + // VERTICES WITHOUT ANY EDGES -< TODO ? how to handle? + graph.add_edge_ldbc(vertexID, startOffset, neighbors); + + startOffset = endOffset + 1 ; + } + } + + }; } -#endif //MORPHSTORE_LDBC_IMPORT_CSR_H +#endif //MORPHSTORE_LDBC_IMPORT_H diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp index 0475511b..07884cba 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp @@ -21,7 +21,7 @@ * @todo */ -#include +#include #include #include // for high_resolution_clock @@ -30,11 +30,11 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBCImportAdjList ldbcImportAdjList("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::AdjacencyList socialGraph; // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImportAdjList.import(socialGraph); + ldbcImport.import_adj_list(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp index 28df5ca2..dfe2637c 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -21,7 +21,7 @@ * @todo */ -#include +#include #include #include // for high_resolution_clock @@ -30,10 +30,10 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBCImportCSR ldbcImportCsr("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::CSR socialGraph; - ldbcImportCsr.import(socialGraph); + ldbcImport.import_csr(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time From 3505a6c5e3f8a70f9785fddff6c5de1f03f06332 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 12:02:43 +0200 Subject: [PATCH 050/216] clean-up --- include/core/storage/graph/adj_list/graph.h | 6 +- include/core/storage/graph/csr/graph.h | 6 +- include/core/storage/graph/ldbc_import.h | 336 ++++++------------ .../adj_list/generate_ldbc_graph_adj_list.cpp | 4 +- .../graph/csr/generate_ldbc_graph_csr.cpp | 4 +- 5 files changed, 122 insertions(+), 234 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index 6b865f2d..05b42971 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -46,16 +46,14 @@ namespace morphstore{ std::map entityDictionary; std::map relationDictionary; - const std::string storageFormat = "AdjacencyList"; - public: void init(){ std::cout << "Nothing to do!!" << std::endl; } - std::string getStorageFormat(){ - return storageFormat; + std::string getStorageFormat() const{ + return "AdjacencyList"; } // calculate the graph size in bytes diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index ce381c1c..b420a391 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -52,15 +52,13 @@ namespace morphstore{ std::map entityDictionary; std::map relationDictionary; - const std::string storageFormat = "CSR"; - uint64_t numberEdges; public: - std::string getStorageFormat(){ - return storageFormat; + std::string getStorageFormat() const{ + return "CSR"; } uint64_t getNumberEdges(){ diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index fa2d21da..284fdbe1 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -98,6 +98,114 @@ namespace morphstore{ } } + // this function reads the vertices-files and creates vertices in a graph + template + void generate_vertices(T &graph){ + + if(!verticesPaths.empty()) { + //std::cout << "(1/2) Generating LDBC-Vertices ..."; + //std::cout.flush(); + + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; + + // iterate through vector of vertex-addresses + for (const auto &address : verticesPaths) { + + // data structure for attributes of entity, e.g. taglass -> id, name, url + std::vector attributes; + + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + + char* buffer; + + uint64_t fileSize = 0; + + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } + + // calculate file size + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } + + // allocate memory + buffer = (char*) malloc( fileSize * sizeof( char ) ); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; + + // read buffer and do the magic ... + for(size_t i = 0; i < fileSize; ++i){ + if(buffer[i] == '\n'){ + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if(row.find('\n') != std::string::npos){ + row.erase(0,1); + } + + size_t last = 0; + size_t next = 0; + + // first line of *.csv contains the attributes -> write to attributes vector + if(start == 0){ + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos){ + attributes.push_back(row.substr(last, next-last)); + last = next + 1; + } + // last attribute + attributes.push_back(row.substr(last)); + }else{ + // actual data: + std::unordered_map properties; + size_t attrIndex = 0; + std::string ldbcID = row.substr(0, row.find(delimiter)); + while ((next = row.find(delimiter, last)) != std::string::npos){ + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); + last = next + 1; + ++attrIndex; + } + // last attribute + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + + //----------------------------------------------------- + // create vertex and insert into graph with properties + uint64_t systemID = graph.add_vertex_with_properties(properties); + // add entity number to vertex + graph.add_entity_to_vertex(systemID, entityNumber); + // map entity and ldbc id to system generated id + globalIdLookupMap.insert({{entity, ldbcID}, systemID}); + //----------------------------------------------------- + properties.clear(); // free memory + } + + start = i; // set new starting point for buffer (otherwise it's concatenated) + } + } + + delete[] buffer; // free memory + vertexFile.close(); + + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair( entityNumber, entity)); + ++entityNumber; + } + // graph gets full entity-list here: + graph.set_entity_dictionary(entitiesLookup); + } + + } + // function which returns true, if parameter is a entity in ldbc-files bool is_entity(const std::string &entity){ // iterate through entities-map to look up for paramater @@ -218,12 +326,12 @@ namespace morphstore{ // Import into Adj-List-Format: // generate_vertices() + generate_edges() - void import_adj_list(morphstore::AdjacencyList &graph){ + void import(morphstore::AdjacencyList &graph){ std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); // (1) generate vertices - generate_vertices_adj_list(graph); + generate_vertices(graph); // (2) generate edges generate_edges_adj_list(graph); @@ -233,113 +341,6 @@ namespace morphstore{ std::cout << "--> done" << std::endl; } - // this function reads the vertices-files and creates vertices in a graph - void generate_vertices_adj_list(morphstore::AdjacencyList &graph){ - - if(!verticesPaths.empty()) { - //std::cout << "(1/2) Generating LDBC-Vertices ..."; - //std::cout.flush(); - - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; - - // iterate through vector of vertex-addresses - for (const auto &address : verticesPaths) { - - // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; - - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - - char* buffer; - - uint64_t fileSize = 0; - - std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - - if (!vertexFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - vertexFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; - - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - size_t last = 0; - size_t next = 0; - - // first line of *.csv contains the attributes -> write to attributes vector - if(start == 0){ - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes.push_back(row.substr(last, next-last)); - last = next + 1; - } - // last attribute - attributes.push_back(row.substr(last)); - }else{ - // actual data: - std::unordered_map properties; - size_t attrIndex = 0; - std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos){ - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); - last = next + 1; - ++attrIndex; - } - // last attribute - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); - - //----------------------------------------------------- - // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); - // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, systemID}); - //----------------------------------------------------- - properties.clear(); // free memory - } - - start = i; // set new starting point for buffer (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - vertexFile.close(); - - // insert entity-number with string into map - entitiesLookup.insert(std::make_pair( entityNumber, entity)); - ++entityNumber; - } - // graph gets full entity-list here: - graph.set_entity_dictionary(entitiesLookup); - } - - } - // this function reads the relation-files and generates edges in graph void generate_edges_adj_list(morphstore::AdjacencyList &graph){ @@ -518,12 +519,12 @@ namespace morphstore{ // Import into CSR-Format: // generate_vertices() + generate_edges() - void import_csr(morphstore::CSR &graph){ + void import(morphstore::CSR &graph){ std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); // (1) generate vertices - generate_vertices_csr(graph); + generate_vertices(graph); // (2) allocate memory allocate_graph_structure_memory_csr(graph); // (3) generate edges @@ -535,113 +536,6 @@ namespace morphstore{ std::cout << "--> done" << std::endl; } - // this function reads the vertices-files and creates vertices in a graph - void generate_vertices_csr(morphstore::CSR &graph){ - - if(!verticesPaths.empty()) { - //std::cout << "(1/2) Generating LDBC-Vertices ..."; - //std::cout.flush(); - - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; - - // iterate through vector of vertex-addresses - for (const auto &address : verticesPaths) { - - // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; - - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - - char* buffer; - - uint64_t fileSize = 0; - - std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - - if (!vertexFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - vertexFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; - - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - size_t last = 0; - size_t next = 0; - - // first line of *.csv contains the attributes -> write to attributes vector - if(start == 0){ - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes.push_back(row.substr(last, next-last)); - last = next + 1; - } - // last attribute - attributes.push_back(row.substr(last)); - }else{ - // actual data: - std::unordered_map properties; - size_t attrIndex = 0; - std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos){ - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); - last = next + 1; - ++attrIndex; - } - // last attribute - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); - - //----------------------------------------------------- - // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); - // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, systemID}); - //----------------------------------------------------- - properties.clear(); // free memory - } - - start = i; // set new starting point for buffer (otherwise it's concatenated) - } - } - - delete[] buffer; // free memory - vertexFile.close(); - - // insert entity-number with string into map - entitiesLookup.insert(std::make_pair( entityNumber, entity)); - ++entityNumber; - } - // graph gets full entity-list here: - graph.set_entity_dictionary(entitiesLookup); - } - - } - // this function reads the relation-files and generates edges in graph void generate_edges_csr(morphstore::CSR &graph){ @@ -818,7 +712,7 @@ namespace morphstore{ graph.set_relation_dictionary(relationsLookup); // do actual edge generation here: - write_vertexNeighborsLookup_into_graph_csr(graph); + write_intermediates_into_graph_csr(graph); } } @@ -839,7 +733,7 @@ namespace morphstore{ } // this function writes the actual data from the intermediate vertexNeighborsLookup int to the arrays in the csr format - void write_vertexNeighborsLookup_into_graph_csr(morphstore::CSR &graph){ + void write_intermediates_into_graph_csr(morphstore::CSR &graph){ // firstly, sorting the intermediates with their target IDs ASC sort_VertexNeighborsLookup_csr(); @@ -859,8 +753,6 @@ namespace morphstore{ startOffset = endOffset + 1 ; } } - - }; } diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp index 07884cba..c8f884e3 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp @@ -34,7 +34,7 @@ int main( void ){ morphstore::AdjacencyList socialGraph; // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.import_adj_list(socialGraph); + ldbcImport.import(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -52,7 +52,7 @@ int main( void ){ */ // calculate size of social graph - std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; + std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; return 0; } \ No newline at end of file diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp index dfe2637c..6b39fcf3 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp @@ -33,7 +33,7 @@ int main( void ){ morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); morphstore::CSR socialGraph; - ldbcImport.import_csr(socialGraph); + ldbcImport.import(socialGraph); // measuring time... auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -51,7 +51,7 @@ int main( void ){ */ // calculate size of social graph - std::cout << "Size of socialGraph: " << socialGraph.get_size_of_graph() << " Bytes\n"; + std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; return 0; } \ No newline at end of file From 0f22a214f2ae36ad320f2c943974b499cc105dc6 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 14:35:08 +0200 Subject: [PATCH 051/216] some little changes --- include/core/storage/graph/adj_list/graph.h | 6 +----- include/core/storage/graph/adj_list/vertex.h | 2 +- include/core/storage/graph/csr/graph.h | 5 ++--- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index 05b42971..dfdc66d2 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -47,11 +47,7 @@ namespace morphstore{ std::map relationDictionary; public: - - void init(){ - std::cout << "Nothing to do!!" << std::endl; - } - + std::string getStorageFormat() const{ return "AdjacencyList"; } diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index c43fd727..c08c582b 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -37,7 +37,7 @@ namespace morphstore{ struct Edge{ ADJLISTVertex* target; unsigned short int relation; - // make this optianl??: + // make this optional??: std::pair property; size_t size_in_bytes() const { diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index b420a391..9d949a4f 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -54,7 +54,6 @@ namespace morphstore{ uint64_t numberEdges; - public: std::string getStorageFormat() const{ @@ -70,7 +69,7 @@ namespace morphstore{ } // this functions allocates the memory for the graph structure arrays - void allocate_graph_structure_memory(uint64_t numberVertices,uint64_t numberEdges){ + void allocate_graph_structure_memory(uint64_t numberVertices, uint64_t numberEdges){ // allocate node array: node_array = new uint64_t[numberVertices]; @@ -108,7 +107,7 @@ namespace morphstore{ } void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property){ - // TODO + // TODO IMPLEMENT std::cout << sourceID << targetID << relation << property.first << std::endl; } From 52c8daefb87aa66571a2513868fe8b98876980bd Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 17:38:15 +0200 Subject: [PATCH 052/216] code structure --- include/core/storage/graph/csr/graph.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index 9d949a4f..6bcf9b2b 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -56,16 +56,10 @@ namespace morphstore{ public: - std::string getStorageFormat() const{ - return "CSR"; - } - - uint64_t getNumberEdges(){ - return numberEdges; - } - - void setNumberEdges(uint64_t edges){ - this->numberEdges = edges; + ~CSR(){ + delete [] node_array; + delete [] edge_array; + delete [] val_array; } // this functions allocates the memory for the graph structure arrays @@ -82,6 +76,18 @@ namespace morphstore{ val_array = new unsigned short int[numberEdges]; } + std::string getStorageFormat() const{ + return "CSR"; + } + + uint64_t getNumberEdges(){ + return numberEdges; + } + + void setNumberEdges(uint64_t edges){ + this->numberEdges = edges; + } + void add_vertex(){ CSRVertex v; From 0d9af4800cc288f2e5335296efaa4e1c9168b7af Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 23 Jul 2019 17:49:23 +0200 Subject: [PATCH 053/216] initialize array pointer as nullptr --- include/core/storage/graph/csr/graph.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index 6bcf9b2b..ff5ae044 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -44,9 +44,9 @@ namespace morphstore{ // row array('node array'): contains the offset in the col_array; vertex-system-id is index in the row_array // col_array('edge array'): every cell represents an edge containing the vertex targets ID // value_array: relation number - uint64_t* node_array; - uint64_t* edge_array; - unsigned short int* val_array; + uint64_t* node_array = nullptr; + uint64_t* edge_array = nullptr; + unsigned short int* val_array = nullptr; // lookup dictionaries for entities of vertices / relation names of edges std::map entityDictionary; From f9e7b074b288071259218bf819a2fe50dfaf9f6e Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 24 Jul 2019 10:20:32 +0200 Subject: [PATCH 054/216] introducing std::experimental::optional to make edge-property optional -> saves a lot of memory --- include/core/storage/graph/adj_list/graph.h | 16 ++++++++++++++++ include/core/storage/graph/adj_list/vertex.h | 14 ++++++++++---- include/core/storage/graph/csr/graph.h | 2 -- .../adj_list/generate_ldbc_graph_adj_list.cpp | 1 + 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index dfdc66d2..fc1d115d 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -196,6 +196,22 @@ namespace morphstore{ std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } + + // test if the std::experimental::optional in the vertex class works: + void print_edges_with_property(){ + for(auto& it : vertices){ + // check if the vertex has neighbors + if(!it.second.get_adjList().empty()){ + // check if there is an edge with property + for(auto const& edge : it.second.get_adjList()){ + // std::optional property true? -> exist! + if(edge.property){ + std::cout << "Edge has Property: ( " << it.second.getId() << " -[ " << get_relation_by_number(edge.relation) << " ]->" << edge.target->getId() << ", " << edge.property->first << ": " << edge.property->second << ")" << std::endl; + } + } + } + } + } }; } diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index c08c582b..0963aa7b 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -27,6 +27,7 @@ #include #include #include +#include namespace morphstore{ @@ -37,11 +38,16 @@ namespace morphstore{ struct Edge{ ADJLISTVertex* target; unsigned short int relation; - // make this optional??: - std::pair property; + // make this optional: + std::experimental::optional> property; size_t size_in_bytes() const { - return sizeof(ADJLISTVertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); + size_t result = 0; + result = sizeof(ADJLISTVertex*) + sizeof(unsigned short int); + if(property){ + result = sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property->first.length() + property->second.length()); + } + return result; }; }; @@ -57,7 +63,7 @@ namespace morphstore{ unsigned short int entity; public: - + // constrcutor without the adjList (Vertex can contain no edges int the graph) ADJLISTVertex(){ // unique ID generation diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index ff5ae044..6ce135f7 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -217,8 +217,6 @@ namespace morphstore{ std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; std::cout << "--------------------------------------------" << std::endl; } - - }; } diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp index c8f884e3..8961b318 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp @@ -50,6 +50,7 @@ int main( void ){ socialGraph.print_vertex_by_id(100168); socialGraph.print_vertex_by_id(2000100); */ + //socialGraph.print_edges_with_property(); // calculate size of social graph std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; From fdfe1ccf99fd045424809c8b64f7b3080ec78f99 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 24 Jul 2019 11:56:02 +0200 Subject: [PATCH 055/216] remove std::optional -> bad decision, needs moore memory as expected --- include/core/storage/graph/adj_list/graph.h | 16 ---------------- include/core/storage/graph/adj_list/vertex.h | 13 +++---------- include/core/storage/graph/ldbc_import.h | 2 +- .../adj_list/generate_ldbc_graph_adj_list.cpp | 1 - 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h index fc1d115d..dfdc66d2 100644 --- a/include/core/storage/graph/adj_list/graph.h +++ b/include/core/storage/graph/adj_list/graph.h @@ -196,22 +196,6 @@ namespace morphstore{ std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } - - // test if the std::experimental::optional in the vertex class works: - void print_edges_with_property(){ - for(auto& it : vertices){ - // check if the vertex has neighbors - if(!it.second.get_adjList().empty()){ - // check if there is an edge with property - for(auto const& edge : it.second.get_adjList()){ - // std::optional property true? -> exist! - if(edge.property){ - std::cout << "Edge has Property: ( " << it.second.getId() << " -[ " << get_relation_by_number(edge.relation) << " ]->" << edge.target->getId() << ", " << edge.property->first << ": " << edge.property->second << ")" << std::endl; - } - } - } - } - } }; } diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h index 0963aa7b..844b50d5 100644 --- a/include/core/storage/graph/adj_list/vertex.h +++ b/include/core/storage/graph/adj_list/vertex.h @@ -27,7 +27,6 @@ #include #include #include -#include namespace morphstore{ @@ -38,16 +37,10 @@ namespace morphstore{ struct Edge{ ADJLISTVertex* target; unsigned short int relation; - // make this optional: - std::experimental::optional> property; + std::pair property; size_t size_in_bytes() const { - size_t result = 0; - result = sizeof(ADJLISTVertex*) + sizeof(unsigned short int); - if(property){ - result = sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property->first.length() + property->second.length()); - } - return result; + return sizeof(ADJLISTVertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); }; }; @@ -63,7 +56,7 @@ namespace morphstore{ unsigned short int entity; public: - + // constrcutor without the adjList (Vertex can contain no edges int the graph) ADJLISTVertex(){ // unique ID generation diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 284fdbe1..bce2e930 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -60,7 +60,7 @@ namespace morphstore{ std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; // for CSR data structure - // map for lookup every system-id, the neigbors in the graph (for further processing, e.g. filling the edge_array in the right order) + // map for lookup every system-id, the neighbors in the graph (for further processing, e.g. filling the edge_array in the right order) std::unordered_map< uint64_t, std::vector>> vertexNeighborsLookup; public: diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp index 8961b318..c8f884e3 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp @@ -50,7 +50,6 @@ int main( void ){ socialGraph.print_vertex_by_id(100168); socialGraph.print_vertex_by_id(2000100); */ - //socialGraph.print_edges_with_property(); // calculate size of social graph std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; From 3b1cb55a6134cad1ad2cb5c2ffd682856f0ea67d Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 24 Jul 2019 13:00:05 +0200 Subject: [PATCH 056/216] changed array allocation: new [...] to malloc(...) --- include/core/storage/graph/csr/graph.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h index 6ce135f7..8048633d 100644 --- a/include/core/storage/graph/csr/graph.h +++ b/include/core/storage/graph/csr/graph.h @@ -66,14 +66,14 @@ namespace morphstore{ void allocate_graph_structure_memory(uint64_t numberVertices, uint64_t numberEdges){ // allocate node array: - node_array = new uint64_t[numberVertices]; + node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); // allocate edge array: - edge_array = new uint64_t[numberEdges]; + edge_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); setNumberEdges(numberEdges); // allocate val array: - val_array = new unsigned short int[numberEdges]; + val_array = (unsigned short int *) malloc(numberEdges * sizeof(unsigned short int)); } std::string getStorageFormat() const{ From cb94809d4146fc02f46950228b7a8498df8c1189 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 6 Aug 2019 12:09:52 +0200 Subject: [PATCH 057/216] Adding Graph-Polymorphism; --- include/core/storage/graph/adj_list/graph.h | 203 --------- include/core/storage/graph/adj_list/vertex.h | 147 ------ include/core/storage/graph/csr/graph.h | 224 --------- include/core/storage/graph/edge/edge.h | 84 ++++ .../storage/graph/formats/adjacencylist.h | 121 +++++ include/core/storage/graph/formats/csr.h | 150 ++++++ include/core/storage/graph/graph.h | 151 ++++++ include/core/storage/graph/ldbc_import.h | 429 +++++++----------- .../graph/{csr/vertex.h => vertex/avertex.h} | 73 ++- include/core/storage/graph/vertex/cvertex.h | 64 +++ include/core/storage/graph/vertex/vertex.h | 98 ++++ test/CMakeLists.txt | 2 +- .../storage/graph/adj_list/CMakeLists.txt | 15 - .../graph/adjacencylist/CMakeLists.txt | 15 + .../ldbc_graph_adjacencylist.cpp} | 34 +- test/core/storage/graph/csr/CMakeLists.txt | 2 +- .../ldbc_graph_csr.cpp} | 35 +- 17 files changed, 909 insertions(+), 938 deletions(-) delete mode 100644 include/core/storage/graph/adj_list/graph.h delete mode 100644 include/core/storage/graph/adj_list/vertex.h delete mode 100644 include/core/storage/graph/csr/graph.h create mode 100644 include/core/storage/graph/edge/edge.h create mode 100644 include/core/storage/graph/formats/adjacencylist.h create mode 100644 include/core/storage/graph/formats/csr.h create mode 100644 include/core/storage/graph/graph.h rename include/core/storage/graph/{csr/vertex.h => vertex/avertex.h} (60%) create mode 100644 include/core/storage/graph/vertex/cvertex.h create mode 100644 include/core/storage/graph/vertex/vertex.h delete mode 100644 test/core/storage/graph/adj_list/CMakeLists.txt create mode 100644 test/core/storage/graph/adjacencylist/CMakeLists.txt rename test/core/storage/graph/{csr/generate_ldbc_graph_csr.cpp => adjacencylist/ldbc_graph_adjacencylist.cpp} (70%) rename test/core/storage/graph/{adj_list/generate_ldbc_graph_adj_list.cpp => csr/ldbc_graph_csr.cpp} (73%) diff --git a/include/core/storage/graph/adj_list/graph.h b/include/core/storage/graph/adj_list/graph.h deleted file mode 100644 index dfdc66d2..00000000 --- a/include/core/storage/graph/adj_list/graph.h +++ /dev/null @@ -1,203 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file graph.h - * @brief Graph storage format -> adjacency Lists - * @todo - */ - -#ifndef MORPHSTORE_GRAPH_ADJACENCY_LIST_H -#define MORPHSTORE_GRAPH_ADJACENCY_LIST_H - -#include - -#include -#include -#include -#include -#include - - -namespace morphstore{ - - class AdjacencyList{ - - private: - // main data structure: mapping global id -> vertex - // unordered_map hast fast search time -> average = O(1); worst case = O(n): - std::unordered_map vertices; - - // lookup dictionaries for entities of vertices / relation names of edges - std::map entityDictionary; - std::map relationDictionary; - - public: - - std::string getStorageFormat() const{ - return "AdjacencyList"; - } - - // calculate the graph size in bytes - size_t get_size_of_graph(){ - size_t size = 0; - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - size += it->second.get_size_of_vertex(); - } - return size; - } - - // adds a vertex (without properties) - void add_vertex(){ - ADJLISTVertex v; - vertices.insert(std::make_pair(v.getId(), v)); - } - - // function that creates a new relation/edge between two (existing) vertices - void add_edge(const uint64_t sourceID, const uint64_t targetID, unsigned short int rel){ - if(exist_id(sourceID) && exist_id(targetID)){ - ADJLISTVertex* sourceV = &vertices.at(sourceID); - ADJLISTVertex* targetV = &vertices.at(targetID); - sourceV->add_edge(targetV, rel); - }else{ - std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; - } - } - - // function that creates a new relation/edge between two (existing) vertices WITH property - void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int rel, const std::pair& property){ - if(exist_id(sourceID) && exist_id(targetID)){ - ADJLISTVertex* sourceV = &vertices.at(sourceID); - ADJLISTVertex* targetV = &vertices.at(targetID); - sourceV->add_edge_with_property(targetV, rel, property); - }else{ - std::cout << "Source-/Target-Vertex-ID does not exist in the database!"; - } - } - - // function to add a new (ldbc) vertex to the graph and returns system-ID - uint64_t add_vertex_with_properties(const std::unordered_map& props ){ - ADJLISTVertex v; - v.add_properties(props); - vertices.insert(std::make_pair(v.getId(), v)); - return v.getId(); - } - - // this adds a specific key-value pair (property) to a vertex given by its id - void add_property_to_vertex(uint64_t id, const std::pair& property){ - if(exist_id(id)){ - vertices.at(id).add_property(property); - }else{ - std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; - } - } - - void add_entity_to_vertex(const uint64_t id, unsigned short int entity){ - if(exist_id(id)){ - vertices.at(id).setEntity(entity); - }else{ - std::cout << "Vertex with ID " << id << " does not exist in the database!"; - } - } - - std::string get_entity_by_number(unsigned short int e){ - if(entityDictionary.find( e ) != entityDictionary.end()){ - return entityDictionary.at(e); - }else{ - return "No Matching of entity-number in the database!"; - } - } - - void set_entity_dictionary(const std::map& entityList){ - this->entityDictionary = entityList; - } - - std::string get_relation_by_number(unsigned short int re){ - if(relationDictionary.find( re ) != relationDictionary.end()){ - return relationDictionary.at(re); - }else{ - return "No Matching of relation-number in the database!"; - } - } - - void set_relation_dictionary(const std::map& relationList){ - this->relationDictionary = relationList; - } - - // function to check if the ID is present or not - bool exist_id(const uint64_t id){ - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } - - // this function returns the total number of edges in the graph - uint64_t get_total_number_of_edges(){ - uint64_t totalNumberEdges = 0; - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - totalNumberEdges += it->second.get_number_of_edges(); - } - return totalNumberEdges; - } - - // for debbuging - void statistics(){ - std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << vertices.size() << std::endl; - std::cout << "Number of relations/edges: " << get_total_number_of_edges() << std::endl; - std::cout << "--------------------------------------------" << std::endl; - } - - // for debbuging - void printEntities(){ - for(auto const& entity : entityDictionary){ - std::cout << entity.first << " -> " << entity.second << "\n"; - } - } - - // for debbuging - void printRelations(){ - for(auto const& rel : relationDictionary){ - std::cout << rel.first << " -> " << rel.second << "\n"; - } - } - - // for debugging - void print_vertex_by_id(uint64_t id){ - std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; - ADJLISTVertex* v = &vertices.at(id); - std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; - std::cout << "Entity: \t"<< get_entity_by_number(v->getEntity()) << std::endl; - std::cout << "#Edges: \t" << v->get_adjList().size() << std::endl; - std::cout << "Adj_List: "; - - const std::vector& adjList = v->get_adjList(); - for(const auto& e : adjList){ - std::cout << "(" << e.target->getId() << "," << e.relation << "." << get_relation_by_number(e.relation) << ") "; - } - std::cout << "\n"; - std::cout << "Properties: "; v->print_properties(); - std::cout << "\n"; - std::cout << "-----------------------------------------------" << std::endl; - } - }; - -} - -#endif //MORPHSTORE_GRAPH_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/adj_list/vertex.h b/include/core/storage/graph/adj_list/vertex.h deleted file mode 100644 index 844b50d5..00000000 --- a/include/core/storage/graph/adj_list/vertex.h +++ /dev/null @@ -1,147 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file vertex.h - * @brief vertex class and its functions + Edge struct - * @todo -*/ - -#ifndef MORPHSTORE_VERTEX_ADJACENCY_LIST_H -#define MORPHSTORE_VERTEX_ADJACENCY_LIST_H - -#include -#include -#include - - -namespace morphstore{ - - class ADJLISTVertex; - - // this struct represents a relation to a target vertex; - struct Edge{ - ADJLISTVertex* target; - unsigned short int relation; - std::pair property; - - size_t size_in_bytes() const { - return sizeof(ADJLISTVertex*) + sizeof(unsigned short int) + sizeof(std::pair< std::string, std::string >) + sizeof(char)*(property.first.length() + property.second.length()); - }; - }; - - class ADJLISTVertex{ - - private: - // Vertex contains a (global) id; entity; vector adjList for the adjacency List - uint64_t id; - std::vector adjList; - // properties - std::unordered_map properties; - // entity-number for look-up - unsigned short int entity; - - public: - - // constrcutor without the adjList (Vertex can contain no edges int the graph) - ADJLISTVertex(){ - // unique ID generation - static uint64_t startID = 0; - id = startID++; - } - - uint64_t getId() const{ - return id; - } - - // add entity to vertex - void setEntity(unsigned short int e){ - this->entity = e; - } - - unsigned short int getEntity(){ - return this->entity; - } - - // calculate size of a vertex for memory usage in bytes - size_t get_size_of_vertex() { - size_t size = 0; - size += sizeof(uint64_t); // id - // Adj.List: - for(const auto& e : adjList){ - size += e.size_in_bytes(); - } - // properties: - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ - size += sizeof(char)*(property->first.length() + property->second.length()); - } - // entities: - size += sizeof(unsigned short int); - - return size; - } - - // returns a reference (read-only) of the adjacency list - const std::vector& get_adjList() const{ - return adjList; - } - - // this function adds a whole property map to a vertex - void add_properties(const std::unordered_map &properties){ - if(!properties.empty()){ - this->properties = properties; - }else{ - std::cout << "The properties-list is empty!" << std::endl; - } - } - - // this adds one key-value pair to the vertex's property map - void add_property(const std::pair& property){ - this->properties[property.first] = std::move(property.second); - } - - // function that creates a new relation/edge between two (existing) vertices withouht properties - void add_edge(ADJLISTVertex *target, unsigned short int relation){ - Edge e; - e.target = target; - e.relation = relation; - this->adjList.push_back(e); - } - - // add edge with properties to vertex - void add_edge_with_property(ADJLISTVertex *target, unsigned short int relation, const std::pair& property){ - Edge e; - e.target = target; - e.relation = relation; - e.property = property; - this->adjList.push_back(e); - } - - uint64_t get_number_of_edges(){ - return adjList.size(); - } - - void print_properties(){ - for(const auto& entry : properties){ - std::cout << "{" << entry.first << ": " << entry.second << "}"; - } - } - }; -} - -#endif //MORPHSTORE_VERTEX_ADJACENCY_LIST_H diff --git a/include/core/storage/graph/csr/graph.h b/include/core/storage/graph/csr/graph.h deleted file mode 100644 index 8048633d..00000000 --- a/include/core/storage/graph/csr/graph.h +++ /dev/null @@ -1,224 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file graph.h - * @brief CSR graph header file - * @todo -*/ - -#ifndef MORPHSTORE_GRAPH_CSR_H -#define MORPHSTORE_GRAPH_CSR_H - -#include - -#include -#include -#include -#include - -namespace morphstore{ - - class CSR{ - - private: - // main data structure: hash table (hash id to vertex) - // unordered_map has fast search time / look-up -> average = O(1); worst case = O(n): - std::unordered_map vertices; - - // graph-structure: 3 Arrays (row_array, col_array, val_array) - // row array('node array'): contains the offset in the col_array; vertex-system-id is index in the row_array - // col_array('edge array'): every cell represents an edge containing the vertex targets ID - // value_array: relation number - uint64_t* node_array = nullptr; - uint64_t* edge_array = nullptr; - unsigned short int* val_array = nullptr; - - // lookup dictionaries for entities of vertices / relation names of edges - std::map entityDictionary; - std::map relationDictionary; - - uint64_t numberEdges; - - public: - - ~CSR(){ - delete [] node_array; - delete [] edge_array; - delete [] val_array; - } - - // this functions allocates the memory for the graph structure arrays - void allocate_graph_structure_memory(uint64_t numberVertices, uint64_t numberEdges){ - - // allocate node array: - node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); - - // allocate edge array: - edge_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); - setNumberEdges(numberEdges); - - // allocate val array: - val_array = (unsigned short int *) malloc(numberEdges * sizeof(unsigned short int)); - } - - std::string getStorageFormat() const{ - return "CSR"; - } - - uint64_t getNumberEdges(){ - return numberEdges; - } - - void setNumberEdges(uint64_t edges){ - this->numberEdges = edges; - } - - - void add_vertex(){ - CSRVertex v; - vertices.insert(std::make_pair(v.getId(), v)); - } - - void add_property_to_vertex(uint64_t id, const std::pair& property){ - if(exist_id(id)){ - vertices.at(id).add_property(property); - }else{ - std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; - } - } - - // this function adds the data in the CSR structure from LDBC-Importer - void add_edge_ldbc(uint64_t vertexID, uint64_t startOffset, const std::vector>& neighbors){ - node_array[vertexID] = startOffset; // offset in edge_array - for(auto const& pair : neighbors){ - edge_array[startOffset] = pair.first; // target id - val_array[startOffset] = pair.second; // relation number for lookup - ++startOffset; - } - } - - void add_edge_with_property(uint64_t sourceID, uint64_t targetID, unsigned short int relation, const std::pair& property){ - // TODO IMPLEMENT - std::cout << sourceID << targetID << relation << property.first << std::endl; - - } - - uint64_t add_vertex_with_properties(const std::unordered_map& props ){ - CSRVertex v; - v.add_properties(props); - vertices.insert(std::make_pair(v.getId(), v)); - return v.getId(); - } - - void add_entity_to_vertex(const uint64_t id, unsigned short int entity){ - if(exist_id(id)){ - vertices.at(id).setEntity(entity); - }else{ - std::cout << "Vertex with ID " << id << " does not exist in the database!"; - } - } - - // function to check if the ID is present or not - bool exist_id(const uint64_t id){ - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } - std::string get_entity_by_number(unsigned short int e){ - if(entityDictionary.find( e ) != entityDictionary.end()){ - return entityDictionary.at(e); - }else{ - return "No Matching of entity-number in the database!"; - } - } - - void set_entity_dictionary(const std::map& entityList){ - this->entityDictionary = entityList; - } - - std::string get_relation_by_number(unsigned short int re){ - if(relationDictionary.find( re ) != relationDictionary.end()){ - return relationDictionary.at(re); - }else{ - return "No Matching of relation-number in the database!"; - } - } - - void set_relation_dictionary(const std::map& relationList){ - this->relationDictionary = relationList; - } - - uint64_t getNumberVertices(){ - return vertices.size(); - } - - // calculate the graph size in bytes - size_t get_size_of_graph(){ - size_t size = 0; - // pointer to arrays: - size += sizeof(uint64_t*) * 2 + sizeof(unsigned short int*); - // vertices: - size += sizeof(uint64_t) * getNumberVertices(); - // edges: - size += sizeof(uint64_t) * getNumberEdges(); - // val array: - size += sizeof(unsigned short int) * getNumberEdges(); - - // vertex map wth actual data: - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - size += it->second.get_size_of_vertex(); - } - - return size; - } - - - // for debugging - void print_vertex_by_id(uint64_t id){ - std::cout << "-------------- Vertex ID: " << id <<" --------------" << std::endl; - CSRVertex* v = &vertices.at(id); - uint64_t startOffset = node_array[id]; - uint64_t endOffset = node_array[id+1]; - std::cout << "Offset: " << startOffset << std::endl; - std::cout << "Vertex-ID: \t"<< v->getId() << std::endl; - std::cout << "Entity: \t"<< get_entity_by_number(v->getEntity()) << std::endl; - std::cout << "#Edges: " << (endOffset-startOffset) << std::endl; - std::cout << "Relations: "; - for (uint64_t i = startOffset; i < endOffset; ++i) { - std::cout << "(" << edge_array[i] << "," << val_array[i] << "." << get_relation_by_number(val_array[i]) << ") "; - } - std::cout << "\n"; - std::cout << "Properties: "; v->print_properties(); - std::cout << "\n"; - std::cout << "-----------------------------------------------" << std::endl; - } - - - // for debbuging - void statistics(){ - std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << getNumberVertices() << std::endl; - std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; - std::cout << "--------------------------------------------" << std::endl; - } - }; - -} - -#endif //MORPHSTORE_GRAPH_CSR_H diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h new file mode 100644 index 00000000..c84f86cf --- /dev/null +++ b/include/core/storage/graph/edge/edge.h @@ -0,0 +1,84 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file edge.h + * @brief Edge class which represents a relationship + * @todo +*/ + +#ifndef MORPHSTORE_EDGE_H +#define MORPHSTORE_EDGE_H + +namespace morphstore{ + + class Edge{ + + protected: + // Edge characteristics + uint64_t sourceID, targetID; + unsigned short int relation; + std::pair property; + + public: + + Edge(){}; + + // Constructor with parameters + Edge(uint64_t from, uint64_t to, unsigned short int rel){ + setSourceId(from); + setTargetId(to); + setRelation(rel); + } + + // --------------- Getter and Setter --------------- + + uint64_t getSourceId() const { + return sourceID; + } + + void setSourceId(uint64_t sourceId) { + sourceID = sourceId; + } + + uint64_t getTargetId() const { + return targetID; + } + + void setTargetId(uint64_t targetId) { + targetID = targetId; + } + + unsigned short getRelation() const { + return relation; + } + + void setRelation(unsigned short relation) { + Edge::relation = relation; + } + + const std::pair &getProperty() const { + return property; + } + + void setProperty(const std::pair &property) { + Edge::property = property; + } + }; +} + +#endif //MORPHSTORE_EDGE_H diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h new file mode 100644 index 00000000..59aaec8b --- /dev/null +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -0,0 +1,121 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file adjacencylist.h + * @brief Derived ADJ-List storage format class. Base: graph.h + * @todo +*/ + +#ifndef MORPHSTORE_ADJACENCYLIST_H +#define MORPHSTORE_ADJACENCYLIST_H + +#include "../graph.h" +#include "../vertex/avertex.h" + +namespace morphstore{ + + class AdjacencyList: public Graph { + + public: + + + storageFormat getStorageFormat() const override { + return adjacencylist; + } + + // function: to set graph allocations + void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { + vertices.reserve(numberVertices); + setNumberEdges(numberEdges); + setNumberVertices(numberVertices); + } + + // adding a single vertex + void add_vertex() override { + std::shared_ptr v = std::make_shared(); + vertices[v->getID()] = v; + } + + // adding a vertex with its properties + int add_vertex_with_properties(const std::unordered_map &props) override { + std::shared_ptr v = std::make_shared(); + v->setProperties(props); + vertices[v->getID()] = v; + return v->getID(); + } + + // function to add a single property to vertex + void add_property_to_vertex(uint64_t id, const std::pair &property) override { + if (exist_id(id)) { + vertices[id]->add_property(property); + } else { + std::cout << "Vertex with ID " << id << " not found." << std::endl; + } + } + + // adding entity to vertex + void add_entity_to_vertex(const uint64_t id, unsigned short int entity) override { + if (exist_id(id)) { + vertices[id]->setEntity(entity); + } else { + std::cout << "Vertex with ID " << id << " not found." << std::endl; + } + } + + // adding a single edge to vertex: + void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { + if (exist_id(from) && exist_id(to)) { + vertices[from]->add_edge(from, to, rel); + } else { + std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; + } + } + + // function that adds multiple edges (list of neighbors) at once to vertex + void add_edges(const uint64_t source, std::vector> &listOfNeighbors) override { + if (exist_id(source)) { + if (listOfNeighbors.size() != 0) { + for (auto &pair : listOfNeighbors) { + vertices[source]->add_edge(source, pair.first, pair.second); + } + } + } else { + std::cout << "Vertex with ID " << source << " not found." << std::endl; + } + } + + // get number of neighbors of vertex with id + uint64_t get_number_edges(uint64_t id) override { + return vertices[id]->get_number_edges(); + } + + /* old-calculation of the graph size in bytes + size_t get_size_of_graph(){ + size_t size = 0; + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size += it->second.get_size_of_vertex(); + } + return size; + } + */ + + }; +} + +#endif //MORPHSTORE_ADJACENCYLIST_H diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h new file mode 100644 index 00000000..fbe24000 --- /dev/null +++ b/include/core/storage/graph/formats/csr.h @@ -0,0 +1,150 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file csr.h + * @brief Derived CSR storage format class. Base: graph.h + * @todo add_edge() functionality is missing -> needs a realloc()-strategy +*/ + +#ifndef MORPHSTORE_CSR_H +#define MORPHSTORE_CSR_H + +#include "../graph.h" +#include "../vertex/cvertex.h" + +namespace morphstore{ + + class CSR: public Graph{ + + private: + /* graph topology: hybrid approach + * node array: index is vertex-id; array cell contains offset in edge_array + * edge array: every cell contains pointer to edge object of vertex + */ + // TODO: construct a graph-topology struct ? + // TODO: free memory in destrcutor + uint64_t* node_array = nullptr; + Edge** edge_array = nullptr; + + public: + + storageFormat getStorageFormat() const override { + return csr; + } + + // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays + void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { + setNumberVertices(numberVertices); + setNumberEdges(numberEdges); + + vertices.reserve(numberVertices); + + node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); + edge_array = (Edge**) malloc(numberEdges * sizeof(Edge*)); + } + + // adding a single vertex (without any properties, etc...) + void add_vertex() override { + std::shared_ptr v = std::make_shared(); + vertices[v->getID()] = v; + } + + // adding a vertex with its properties + int add_vertex_with_properties(const std::unordered_map& props ) override { + std::shared_ptr v = std::make_shared(); + v->setProperties(props); + vertices[v->getID()] = v; + return v->getID(); + } + + // TODO: add a single edge in graph arrays -> needs a memory reallocating stragety + void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { + if(exist_id(from) && exist_id(to)){ + std::cout << rel << std::endl; + } + } + + // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC + // every vertex id contains a list of neighbors + void add_edges(uint64_t source, std::vector>& listOfNeighbors) override { + if(source == 0){ + node_array[source] = 0; + } + uint64_t offset = node_array[source]; + uint64_t nextOffset = offset + listOfNeighbors.size(); + + for(auto const& pair : listOfNeighbors){ + Edge* e = new Edge(source, pair.first, pair.second); + edge_array[offset] = e; + ++offset; + } + node_array[source+1] = nextOffset; + } + + // function to add a single property to vertex + void add_property_to_vertex(uint64_t id, const std::pair& property) override { + if(exist_id(id)){ + vertices[id]->add_property(property); + }else{ + std::cout << "Vertex with ID " << id << " not found./property_to_vertex" << std::endl; + } + } + + // adding entity to vertex + void add_entity_to_vertex(const uint64_t id, unsigned short int entity) override { + if(exist_id(id)){ + vertices[id]->setEntity(entity); + }else{ + std::cout << "Vertex with ID " << id << " not found./entity_to_vertex." << std::endl; + } + } + + // get number of edges of vertex with id + uint64_t get_number_edges(uint64_t id) override { + uint64_t offset = node_array[id]; + uint64_t nextOffset = node_array[id+1]; + uint64_t numberEdges = nextOffset - offset; + return numberEdges; + } + + /* old-calculation of the graph size in bytes + size_t get_size_of_graph(){ + size_t size = 0; + // pointer to arrays: + size += sizeof(uint64_t*) * 2 + sizeof(unsigned short int*); + // vertices: + size += sizeof(uint64_t) * getNumberVertices(); + // edges: + size += sizeof(uint64_t) * getNumberEdges(); + // val array: + size += sizeof(unsigned short int) * getNumberEdges(); + + // vertex map wth actual data: + for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ + size += it->second.get_size_of_vertex(); + } + + return size; + } + */ + + }; + +} + +#endif //MORPHSTORE_CSR_H diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h new file mode 100644 index 00000000..de23f94d --- /dev/null +++ b/include/core/storage/graph/graph.h @@ -0,0 +1,151 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph.h + * @brief abstract graph class for any storage format --> CSR,ADJ + * @todo graph-size calculation is missing +*/ + +#ifndef MORPHSTORE_GRAPH_H +#define MORPHSTORE_GRAPH_H + +#include "vertex/vertex.h" + +#include +#include +#include +#include +#include + +namespace morphstore{ + + class Graph{ + + protected: + uint64_t numberVertices; + uint64_t numberEdges; + + // Data-structure for Vertex-Properties + std::unordered_map> vertices; + + // Lookup for entities and relations: number to string + std::map entityDictionary; + std::map relationDictionary; + + public: + + enum storageFormat {csr, adjacencylist }; + + // -------------------- Setters & Getters -------------------- + + const std::map &getEntityDictionary() const { + return entityDictionary; + } + + void setEntityDictionary(const std::map &ent) { + this->entityDictionary = ent; + } + + const std::map &getRelationDictionary() const { + return relationDictionary; + } + + void setRelationDictionary(const std::map &rel) { + this->relationDictionary = rel; + } + + uint64_t getNumberVertices() const { + return numberVertices; + } + + void setNumberVertices(uint64_t numV) { + Graph::numberVertices = numV; + } + + uint64_t getNumberEdges() const { + return numberEdges; + } + + void setNumberEdges(uint64_t numE) { + Graph::numberEdges = numE; + } + + std::string get_entity_by_number(unsigned short int e){ + if(entityDictionary.find( e ) != entityDictionary.end()){ + return entityDictionary.at(e); + }else{ + return "No Matching of entity-number in the database!"; + } + } + + std::string get_relation_by_number(unsigned short int re){ + if(relationDictionary.find( re ) != relationDictionary.end()){ + return relationDictionary.at(re); + }else{ + return "No Matching of relation-number in the database!"; + } + } + + // function to check if the vertex-ID is present or not (exists) + bool exist_id(const uint64_t id){ + if(vertices.find(id) == vertices.end()){ + return false; + } + return true; + } + + // -------------------- pure virtual functions -------------------- + + virtual storageFormat getStorageFormat() const = 0; + virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; + virtual void add_vertex() = 0; + virtual int add_vertex_with_properties(const std::unordered_map& props ) = 0; + virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; + virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; + virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; + virtual void add_edges(uint64_t source, std::vector>& listOfNeighbors) = 0; + virtual uint64_t get_number_edges(uint64_t id) = 0; + + // -------------------- debugging functions -------------------- + + void statistics(){ + std::cout << "---------------- Statistics ----------------" << std::endl; + std::cout << "Number of vertices: " << getNumberVertices() << std::endl; + std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; + std::cout << "--------------------------------------------" << std::endl; + } + + void print_vertex_by_id(uint64_t id) { + std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; + std::shared_ptr v = vertices[id]; + std::cout << "Vertex-ID: \t" << v->getID() << std::endl; + std::cout << "Entity: \t" << get_entity_by_number(v->getEntity()) << std::endl; + std::cout << "\n"; + std::cout << "Properties: "; + v->print_properties(); + std::cout << "#Edges: " << this->get_number_edges(v->getID()); + std::cout << "\n"; + std::cout << "-----------------------------------------------" << std::endl; + } + + }; + +} + + +#endif //MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index bce2e930..dc5103bf 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,14 +18,14 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph in CSR or AdjList - * @todo CSR-EDGE PROPERTIES ARE MISSING!!! + * @todo EDGE PROPERTIES ARE MISSING in fillVertexLookup(); OPTIMIZATIONS */ #ifndef MORPHSTORE_LDBC_IMPORT_H #define MORPHSTORE_LDBC_IMPORT_H -#include -#include +#include +#include #include #include @@ -48,7 +48,7 @@ struct hash_pair { namespace morphstore{ - class LDBCImport{ + class LDBCImport { private: std::string directory; @@ -57,30 +57,30 @@ namespace morphstore{ std::map entitiesLookup; std::map relationsLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id - std::unordered_map< std::pair , uint64_t , hash_pair> globalIdLookupMap; + std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; // for CSR data structure // map for lookup every system-id, the neighbors in the graph (for further processing, e.g. filling the edge_array in the right order) - std::unordered_map< uint64_t, std::vector>> vertexNeighborsLookup; + std::unordered_map>> vertexNeighborsLookup; public: - LDBCImport(const std::string& dir){ + LDBCImport(const std::string &dir) { directory = dir; insert_file_names(directory); } - std::string getDirectory() const{ + std::string getDirectory() const { return directory; } // function which iterates through directory to receive file names (entire path) - void insert_file_names(std::string dir){ - for (const auto & entry : std::experimental::filesystem::directory_iterator(dir)){ + void insert_file_names(std::string dir) { + for (const auto &entry : std::experimental::filesystem::directory_iterator(dir)) { // ignore files starting with a '.' - if(entry.path().string()[dir.size()] == '.'){ + if (entry.path().string()[dir.size()] == '.') { continue; - }else{ + } else { // insert file path to vertices or relations vector differentiate(entry.path().string(), dir); } @@ -88,21 +88,20 @@ namespace morphstore{ } // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector - void differentiate(std::string path, std::string dir){ + void differentiate(std::string path, std::string dir) { // if the string contains a '_' -> it's a relation file; otherwise a vertex file // remove dir name to remain only the *.csv - if(path.substr(dir.size()).find('_') != std::string::npos ){ + if (path.substr(dir.size()).find('_') != std::string::npos) { relationsPaths.push_back(path); - }else{ + } else { verticesPaths.push_back(path); } } // this function reads the vertices-files and creates vertices in a graph - template - void generate_vertices(T &graph){ + void generate_vertices(std::unique_ptr& graph) { - if(!verticesPaths.empty()) { + if (!verticesPaths.empty()) { //std::cout << "(1/2) Generating LDBC-Vertices ..."; //std::cout.flush(); @@ -116,13 +115,15 @@ namespace morphstore{ std::vector attributes; // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string entity = address.substr(getDirectory().size(), + address.size() - getDirectory().size() - 4); - char* buffer; + char *buffer; uint64_t fileSize = 0; - std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream vertexFile(address, std::ios::binary | + std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!vertexFile) { std::cerr << "Error, opening file. "; @@ -133,45 +134,47 @@ namespace morphstore{ if (vertexFile.is_open()) { fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. vertexFile.clear(); - vertexFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + vertexFile.seekg(0, + std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); + buffer = (char *) malloc(fileSize * sizeof(char)); vertexFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); + if (row.find('\n') != std::string::npos) { + row.erase(0, 1); } size_t last = 0; size_t next = 0; // first line of *.csv contains the attributes -> write to attributes vector - if(start == 0){ + if (start == 0) { // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos){ - attributes.push_back(row.substr(last, next-last)); + while ((next = row.find(delimiter, last)) != std::string::npos) { + attributes.push_back(row.substr(last, next - last)); last = next + 1; } // last attribute attributes.push_back(row.substr(last)); - }else{ + } else { // actual data: std::unordered_map properties; size_t attrIndex = 0; std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos){ - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next-last))); + while ((next = row.find(delimiter, last)) != std::string::npos) { + properties.insert( + std::make_pair(attributes[attrIndex], row.substr(last, next - last))); last = next + 1; ++attrIndex; } @@ -180,9 +183,9 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); + uint64_t systemID = graph->add_vertex_with_properties(properties); // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); + graph->add_entity_to_vertex(systemID, entityNumber); // map entity and ldbc id to system generated id globalIdLookupMap.insert({{entity, ldbcID}, systemID}); //----------------------------------------------------- @@ -197,20 +200,20 @@ namespace morphstore{ vertexFile.close(); // insert entity-number with string into map - entitiesLookup.insert(std::make_pair( entityNumber, entity)); + entitiesLookup.insert(std::make_pair(entityNumber, entity)); ++entityNumber; } // graph gets full entity-list here: - graph.set_entity_dictionary(entitiesLookup); + graph->setEntityDictionary(entitiesLookup); } } // function which returns true, if parameter is a entity in ldbc-files - bool is_entity(const std::string &entity){ + bool is_entity(const std::string &entity) { // iterate through entities-map to look up for paramater - for(auto const& entry : entitiesLookup){ - if(entry.second == entity){ + for (auto const &entry : entitiesLookup) { + if (entry.second == entity) { return true; } } @@ -219,10 +222,10 @@ namespace morphstore{ } // function which returns true, if the relation already exist - bool exist_relation_name(const std::string& relation){ + bool exist_relation_name(const std::string &relation) { // iterate through relations-map to look up for paramater - for(auto const& entry : relationsLookup){ - if(entry.second == relation){ + for (auto const &entry : relationsLookup) { + if (entry.second == relation) { return true; } } @@ -231,40 +234,42 @@ namespace morphstore{ } // for debugging - void print_file_names(){ + void print_file_names() { std::cout << "Vertices-Files: " << std::endl; - for(const auto& v : verticesPaths){ + for (const auto &v : verticesPaths) { std::cout << "\t" << v << std::endl; } std::cout << "Relations-Files: " << std::endl; - for(const auto& rel : relationsPaths){ + for (const auto &rel : relationsPaths) { std::cout << "\t" << rel << std::endl; } } // function which clears all intermediates after import - void clear_intermediates(){ + void clear_intermediates() { globalIdLookupMap.clear(); relationsLookup.clear(); entitiesLookup.clear(); relationsPaths.clear(); verticesPaths.clear(); + vertexNeighborsLookup.clear(); } // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the entity lookup creation) - uint64_t get_total_number_edges(){ + uint64_t get_total_number_edges() { - uint64_t result = 0 ; + uint64_t result = 0; - if(!relationsPaths.empty()) { + if (!relationsPaths.empty()) { // iterate through vector of relation-addresses for (const auto &address : relationsPaths) { // TODO OPTIMIZE HERE: remove string operations // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string relation = address.substr(getDirectory().size(), + address.size() - getDirectory().size() - 4); std::string fromEntity = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); @@ -273,11 +278,12 @@ namespace morphstore{ std::string toEntity = relation; - char* buffer; + char *buffer; uint64_t fileSize = 0; - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream relationFile(address, std::ios::binary | + std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!relationFile) { std::cerr << "Error, opening file. "; @@ -288,23 +294,24 @@ namespace morphstore{ if (relationFile.is_open()) { fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + relationFile.seekg(0, + std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); + buffer = (char *) malloc(fileSize * sizeof(char)); relationFile.read(buffer, fileSize); // read data as one big block bool firstLine = true; // check from file name whether it's a relation file or multi value attribute file - if(is_entity(toEntity)){ + if (is_entity(toEntity)) { - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { // skip first line (attributes infos....) - if(firstLine){ + if (firstLine) { firstLine = false; - }else{ + } else { ++result; } } @@ -320,226 +327,82 @@ namespace morphstore{ return result; } + // get number of vertices from files + uint64_t get_total_number_vertices() { + uint64_t result = 0; - // -------------------------------- Adj-List-specific functions -------------------------------- - - // Import into Adj-List-Format: - // generate_vertices() + generate_edges() - void import(morphstore::AdjacencyList &graph){ - std::cout << "Importing LDBC-files into graph ... "; - std::cout.flush(); - - // (1) generate vertices - generate_vertices(graph); - // (2) generate edges - generate_edges_adj_list(graph); - - // (3) clear intermediates - clear_intermediates(); - - std::cout << "--> done" << std::endl; - } - - // this function reads the relation-files and generates edges in graph - void generate_edges_adj_list(morphstore::AdjacencyList &graph){ - - + if (!verticesPaths.empty()) { - if(!relationsPaths.empty()) { - //std::cout << "(2/2) Generating LDBC-Edges ..."; - //std::cout.flush(); - - //this variable is used for the relationLookup-keys, starting by 0 - unsigned short int relationNumber = 0; - bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; // iterate through vector of vertex-addresses - for (const auto &address : relationsPaths) { - - isRelation = false; - - // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - std::string fromEntity = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); - - std::string relationName = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); + for (const auto &address : verticesPaths) { - std::string toEntity = relation; + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); - char* buffer; + char *buffer; uint64_t fileSize = 0; - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream vertexFile(address, std::ios::binary | + std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!relationFile) { + if (!vertexFile) { std::cerr << "Error, opening file. "; exit(EXIT_FAILURE); } // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, + std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); - relationFile.read(buffer, fileSize); // read data as one big block - + buffer = (char *) malloc(fileSize * sizeof(char)); + vertexFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; - // check from file name whether it's a relation file or multi value attribute file - if(!is_entity(toEntity)){ - // Multi-value-attributes: just take the last recently one - std::string propertyKey; - std::unordered_map multiValueAttr; - uint64_t systemID; - std::string value; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' - if(start == 0){ - propertyKey = row.substr(row.find(delimiter) + 1); - }else{ - // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) - systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); - value = row.substr(row.find(delimiter) + 1); - multiValueAttr[systemID] = std::move(value); - } + // read buffer and do the magic ... + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); - start = i; // set new starting point for buffer (otherwise it's concatenated) + // remove unnecessary '\n' at the beginning of a string + if (row.find('\n') != std::string::npos) { + row.erase(0, 1); } - } - // iterate through multiValue map and assign property to vertex - for(const auto &pair : multiValueAttr){ - const std::pair& keyValuePair = {propertyKey, pair.second}; - graph.add_property_to_vertex(pair.first, keyValuePair); - } - } - // handling of relation-files ... - else{ - - isRelation = true; - - bool hasProperties = false; - std::string propertyKey; - uint64_t fromID, toID; - - // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); - - // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); - } - - size_t last = 0; - size_t next = 0; - size_t count = 0; - - // first line of *.csv: Differentiate whether it's - // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 - // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 - if(start == 0){ - // if there are 2 delimiter ('|') -> relation file with properties - while ((next = row.find(delimiter, last)) != std::string::npos){ - last = next + 1; - ++count; - } - if(count == 2){ - hasProperties = true; - propertyKey = row.substr(last); - } - }else{ - // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property - // get the system-(global) id's from local ids - fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); - // remove from id from string - row.erase(0, row.find(delimiter) + delimiter.length()); - std::string value; - if(!hasProperties){ - // WITHOUT properties: just from the first delimiter on - toID = globalIdLookupMap.at({toEntity, row}); - - // Generate edge in graph - graph.add_edge(fromID, toID, relationNumber); - }else{ - // with properties means: toID is until the next delimiter, and then the value for the property - toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); - row.erase(0, row.find(delimiter) + delimiter.length()); - value = row; - graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); - } - } - start = i; // set new starting point for buffer (otherwise it's concatenated) + // first line of *.csv contains the attributes -> write to attributes vector + if (start != 0) { + ++result; } + + start = i; // set new starting point for buffer (otherwise it's concatenated) } } + delete[] buffer; // free memory - relationFile.close(); + vertexFile.close(); - //check if the relation name is a relation (no multi value file) - if(isRelation){ - // check if the name already exists - if(!exist_relation_name(relationName)){ - // insert relation-number with string into map - relationsLookup.insert(std::make_pair( relationNumber, relationName)); - ++relationNumber; - } - } + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair( entityNumber, entity)); + ++entityNumber; } - // graph gets full relation-list here: - graph.set_relation_dictionary(relationsLookup); } - } - - - - // -------------------------------- CSR-specific functions -------------------------------- - - // Import into CSR-Format: - // generate_vertices() + generate_edges() - void import(morphstore::CSR &graph){ - std::cout << "Importing LDBC-files into graph ... "; - std::cout.flush(); - - // (1) generate vertices - generate_vertices(graph); - // (2) allocate memory - allocate_graph_structure_memory_csr(graph); - // (3) generate edges - generate_edges_csr(graph); - - // (4) remove intermediates - clear_intermediates(); - - std::cout << "--> done" << std::endl; + return result; } // this function reads the relation-files and generates edges in graph - void generate_edges_csr(morphstore::CSR &graph){ - - + void fill_vertexNeighborsLookup(std::unique_ptr& graph){ if(!relationsPaths.empty()) { //std::cout << "(2/2) Generating LDBC-Edges ..."; @@ -623,7 +486,7 @@ namespace morphstore{ // iterate through multiValue map and assign property to vertex for(const auto &pair : multiValueAttr){ const std::pair& keyValuePair = {propertyKey, pair.second}; - graph.add_property_to_vertex(pair.first, keyValuePair); + graph->add_property_to_vertex(pair.first, keyValuePair); } } @@ -675,17 +538,16 @@ namespace morphstore{ // WITHOUT properties: just from the first delimiter on toID = globalIdLookupMap.at({toEntity, row}); - // Generate edge in graph - //graph.add_edge(fromID, toID, relationNumber); - // insert relation into vertexNeighborsLookup vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // add to graph + // add to + // TODO: DONT FORGET TO HANDLE PROPERTIES //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); } @@ -709,50 +571,67 @@ namespace morphstore{ } // graph gets full relation-list here: - graph.set_relation_dictionary(relationsLookup); - - // do actual edge generation here: - write_intermediates_into_graph_csr(graph); + graph->setRelationDictionary(relationsLookup); } } - // this function allocates the memory used for the graph structure in CSR (arrays) - void allocate_graph_structure_memory_csr(morphstore::CSR &graph){ - // get number of vertices and number of edges - uint64_t numberVertices = graph.getNumberVertices(); - uint64_t numberEdges = get_total_number_edges(); - graph.allocate_graph_structure_memory(numberVertices, numberEdges); - } - // function for sorting the vertexNeighborsLookup ASC in CSR - void sort_VertexNeighborsLookup_csr(){ + void sort_VertexNeighborsLookup(){ // sorting the first element of the pair (target-id) for(auto &it: vertexNeighborsLookup){ std::sort(it.second.begin(), it.second.end()); } } - // this function writes the actual data from the intermediate vertexNeighborsLookup int to the arrays in the csr format - void write_intermediates_into_graph_csr(morphstore::CSR &graph){ + /* TODO: write function, that additionally fills the VertexNeighborsLookup with ids, that have no neighbors + * After that, we can iterate through the lookup and can sequently fill the CSR or AdjacencyList + * */ + + // this function writes the actual data from the intermediate vertexNeighborsLookup + void generate_edges(std::unique_ptr& graph){ // firstly, sorting the intermediates with their target IDs ASC - sort_VertexNeighborsLookup_csr(); + sort_VertexNeighborsLookup(); - // Write CSR arrays with data (offsets, number of relation,....): - uint64_t lastVertexID = graph.getNumberVertices() - 1; - uint64_t startOffset = 0; + uint64_t graphSize = graph->getNumberVertices(); - for(uint64_t vertexID = 0; vertexID < lastVertexID; ++vertexID){ + for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ + // Problem: there are some vertexIDs which have no entry in the lookup -> segmentation fault // get the list of target vertices std::vector> neighbors; - neighbors = vertexNeighborsLookup[vertexID]; - //store the number for the offset in edge array - uint64_t endOffset = neighbors.size() + startOffset -1 ; + if(vertexNeighborsLookup[vertexID].size() != 0){ + neighbors = vertexNeighborsLookup.at(vertexID); + } // VERTICES WITHOUT ANY EDGES -< TODO ? how to handle? - graph.add_edge_ldbc(vertexID, startOffset, neighbors); - - startOffset = endOffset + 1 ; + graph->add_edges(vertexID, neighbors); } } + + // main import function: see steps in comments + void import(std::unique_ptr& graph) { + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); + + // (1) get number vertices and number edges: + uint64_t numberVertices = get_total_number_vertices(); + uint64_t numberEdges = get_total_number_edges(); + + // (2) allocate graph memory + graph->allocate_graph_structure(numberVertices, numberEdges); + + // (3) generate vertices + generate_vertices(graph); + + // (4) read relations and write to intermediate results + fill_vertexNeighborsLookup(graph); + + // (5) read intermediates and write edges + generate_edges(graph); + + // (6) clear intermediates + clear_intermediates(); + + std::cout << "--> done" << std::endl; + } }; } diff --git a/include/core/storage/graph/csr/vertex.h b/include/core/storage/graph/vertex/avertex.h similarity index 60% rename from include/core/storage/graph/csr/vertex.h rename to include/core/storage/graph/vertex/avertex.h index 1e6017e9..4152802c 100644 --- a/include/core/storage/graph/csr/vertex.h +++ b/include/core/storage/graph/vertex/avertex.h @@ -16,54 +16,55 @@ **********************************************************************************************/ /** - * @file vertex.h - * @brief CSR vertex header file - * @todo + * @file avertex.h + * @brief Derived vertex calss for ADJ_LIST storage format + * @todo change adjlist (vector of Edges) to vector of Edge* ????? */ -#ifndef MORPHSTORE_VERTEX_CSR_H -#define MORPHSTORE_VERTEX_CSR_H +#ifndef MORPHSTORE_AVERTEX_H +#define MORPHSTORE_AVERTEX_H -#include -#include +#include "../edge/edge.h" namespace morphstore{ - class CSRVertex{ + class AVertex: public Vertex{ - private: - // system-ID - uint64_t id; - // data 'properties' - std::unordered_map properties; - // entity-number for look-up - unsigned short int entity; + protected: + std::vector adjList; public: - - CSRVertex(){ + // constructor with unique id generation + AVertex(){ // unique ID generation static uint64_t startID = 0; id = startID++; } - // add entity to vertex - void setEntity(unsigned short int e){ - this->entity = e; + // returns a reference (read-only) of the adjacency list + const std::vector& get_adjList() const{ + return adjList; } - unsigned short int getEntity(){ - return this->entity; + // add edge to vertexs' adjacencylist + void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { + Edge e(from, to, rel); + this->adjList.push_back(e); } - uint64_t getId() const{ - return id; + // function which returns the number of edges + uint64_t get_number_edges() override { + return adjList.size(); } - // calculate size of a vertex for memory usage in bytes + /* old-calculation of vertex size size_t get_size_of_vertex() { size_t size = 0; size += sizeof(uint64_t); // id + // Adj.List: + for(const auto& e : adjList){ + size += e.size_in_bytes(); + } // properties: size += sizeof(std::unordered_map); for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ @@ -74,27 +75,9 @@ namespace morphstore{ return size; } + */ - // this function adds a whole property map to a vertex - void add_properties(const std::unordered_map &properties){ - if(!properties.empty()){ - this->properties = properties; - }else{ - std::cout << "The properties-list is empty!" << std::endl; - } - } - - // this adds one key-value pair to the vertex's property map - void add_property(const std::pair& property){ - this->properties[property.first] = std::move(property.second); - } - - void print_properties(){ - for(const auto& entry : properties){ - std::cout << "{" << entry.first << ": " << entry.second << "}"; - } - } }; } -#endif //MORPHSTORE_VERTEX_CSR_H +#endif //MORPHSTORE_AVERTEX_H diff --git a/include/core/storage/graph/vertex/cvertex.h b/include/core/storage/graph/vertex/cvertex.h new file mode 100644 index 00000000..710d68ed --- /dev/null +++ b/include/core/storage/graph/vertex/cvertex.h @@ -0,0 +1,64 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file cvertex.h + * @brief Derived vertex calss for CSR storage format + * @todo +*/ + +#ifndef MORPHSTORE_CVERTEX_H +#define MORPHSTORE_CVERTEX_H + +namespace morphstore{ + + class CVertex: public Vertex{ + + public: + // constructor with unique id generation + CVertex(){ + // unique ID generation + static uint64_t startID = 0; + id = startID++; + } + + // this function has no usage here: the adding of edges happens in the graph file -> csr.h + // it's just here because its a pure function in Vertex.h + void add_edge(uint64_t from, uint64_t to,unsigned short int rel) override { + std::cout << " virtual add_edge - no usage: " << from << ", " << to << ", " << rel << std::endl; + } + + /* old-calculation of size of a vertex in bytes + size_t get_size_of_vertex() { + size_t size = 0; + size += sizeof(uint64_t); // id + // properties: + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ + size += sizeof(char)*(property->first.length() + property->second.length()); + } + // entities: + size += sizeof(unsigned short int); + + return size; + } + */ + + }; +} + +#endif //MORPHSTORE_CVERTEX_H diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h new file mode 100644 index 00000000..51048c87 --- /dev/null +++ b/include/core/storage/graph/vertex/vertex.h @@ -0,0 +1,98 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertex.h + * @brief abstract vertex class: CSR has cVertex; ADJ-List has aVertex + * @todo add vertex size calculation +*/ + +#ifndef MORPHSTORE_VERTEX_H +#define MORPHSTORE_VERTEX_H + +#include +#include +#include + +namespace morphstore{ + + class Vertex{ + + protected: + // vertex: id, + // optional: entity, properties + uint64_t id; + unsigned short int entity; + std::unordered_map properties; + + + public: + + // ----------------- Setter & Getter ----------------- + + uint64_t getID(){ + return id; + } + + unsigned short getEntity() const { + return entity; + } + + void setEntity(unsigned short e) { + Vertex::entity = e; + } + + const std::unordered_map &getProperties() const { + return properties; + } + + void setProperties(const std::unordered_map &props) { + Vertex::properties = props; + } + + // function that adds a single property key-value pair to vertex + void add_property(const std::pair& property){ + /* + auto it = properties.find(property.first); + if(it != properties.end()){ + it->second = property.second; + } + */ + this->properties[property.first] = std::move(property.second); + } + + + // ----------------- (pure) virtual functions ----------------- + virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; + + virtual uint64_t get_number_edges(){ + return 0; + }; + + + // ----------------- DEBUGGING ----------------- + void print_properties() { + for (const auto &entry : properties) { + std::cout << "{" << entry.first << ": " << entry.second << "}"; + } + std::cout << "\n"; + } + }; + +} + +#endif //MORPHSTORE_VERTEX_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fc879669..c879fdbb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) -add_subdirectory(core/storage/graph/adj_list) +add_subdirectory(core/storage/graph/adjacencylist) add_subdirectory( core/storage/graph/csr ) add_subdirectory(vector) \ No newline at end of file diff --git a/test/core/storage/graph/adj_list/CMakeLists.txt b/test/core/storage/graph/adj_list/CMakeLists.txt deleted file mode 100644 index 93f16270..00000000 --- a/test/core/storage/graph/adj_list/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adj_list/ldbc_graph_adj_list_test_app ) - - add_executable( ldbc_graph_adj_list_test_app generate_ldbc_graph_adj_list.cpp) - target_compile_options( ldbc_graph_adj_list_test_app PRIVATE - -Werror - -Wall - -Wextra - -pedantic - -fstack-protector-all - $<$:-DDEBUG> ) - target_link_libraries( ldbc_graph_adj_list_test_app PRIVATE "-ldl" stdc++fs) - - add_test( ldbc_graph_adj_list_test ldbc_graph_adj_list_test_app ) -endif() \ No newline at end of file diff --git a/test/core/storage/graph/adjacencylist/CMakeLists.txt b/test/core/storage/graph/adjacencylist/CMakeLists.txt new file mode 100644 index 00000000..bb2e4d94 --- /dev/null +++ b/test/core/storage/graph/adjacencylist/CMakeLists.txt @@ -0,0 +1,15 @@ +if ( CTEST_ALL OR CTEST_STORAGE ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist_test_app ) + + add_executable( ldbc_graph_adjacencylist_test_app ldbc_graph_adjacencylist.cpp) + target_compile_options( ldbc_graph_adjacencylist_test_app PRIVATE + -Werror + -Wall + -Wextra + -pedantic + -fstack-protector-all + $<$:-DDEBUG> ) + target_link_libraries( ldbc_graph_adjacencylist_test_app PRIVATE "-ldl" stdc++fs) + + add_test( ldbc_graph_adjacency_test ldbc_graph_adjacencylist_test_app ) +endif() \ No newline at end of file diff --git a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp similarity index 70% rename from test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp rename to test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 6b39fcf3..08b784b4 100644 --- a/test/core/storage/graph/csr/generate_ldbc_graph_csr.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -16,42 +16,50 @@ **********************************************************************************************/ /** - * @file generate_ldbc_graph.cpp - * @brief Test for generating social network graph in CSR format from LDBC files + * @file ldbc_graph_adjacency.cpp + * @brief Test for generating social network graph in ADJ_LIST format * @todo */ #include -#include +#include #include // for high_resolution_clock int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + std::cout << "*********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: AdjacencyList-storage-format *" << std::endl; + std::cout << "*********************************************************" << std::endl; + std::cout << "\n"; + auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::CSR socialGraph; - ldbcImport.import(socialGraph); + // Graph init: + std::unique_ptr g1 = std::make_unique(); + + // generate vertices & edges from LDBC files and insert into graph + ldbcImport.import(g1); - // measuring time... + // measuring time: auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time std::chrono::duration elapsed = finish - start; - socialGraph.statistics(); + g1->statistics(); std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; /* // test vertices: - socialGraph.print_vertex_by_id(100454); - socialGraph.print_vertex_by_id(100450); - socialGraph.print_vertex_by_id(100168); - socialGraph.print_vertex_by_id(2000100); - */ + g1->print_vertex_by_id(100454); + g1->print_vertex_by_id(100450); + g1->print_vertex_by_id(100168); + g1->print_vertex_by_id(2000100); + */ // calculate size of social graph - std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; return 0; } \ No newline at end of file diff --git a/test/core/storage/graph/csr/CMakeLists.txt b/test/core/storage/graph/csr/CMakeLists.txt index 5189a2fa..209120d9 100644 --- a/test/core/storage/graph/csr/CMakeLists.txt +++ b/test/core/storage/graph/csr/CMakeLists.txt @@ -1,7 +1,7 @@ if ( CTEST_ALL OR CTEST_STORAGE ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/ldbc_graph_csr_test_app ) - add_executable( ldbc_graph_csr_test_app generate_ldbc_graph_csr.cpp) + add_executable( ldbc_graph_csr_test_app ldbc_graph_csr.cpp) target_compile_options( ldbc_graph_csr_test_app PRIVATE -Werror -Wall diff --git a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp similarity index 73% rename from test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp rename to test/core/storage/graph/csr/ldbc_graph_csr.cpp index c8f884e3..b2ad2002 100644 --- a/test/core/storage/graph/adj_list/generate_ldbc_graph_adj_list.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -16,43 +16,50 @@ **********************************************************************************************/ /** - * @file generate_ldbc_graph.cpp - * @brief Test for generating social network graph as Adj-List from LDBC files + * @file ldbc_graph_csr.cpp + * @brief Test for generating social network graph in CSR format * @todo */ #include -#include +#include #include // for high_resolution_clock int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + std::cout << "***********************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: CSR-storage-format *" << std::endl; + std::cout << "***********************************************" << std::endl; + std::cout << "\n"; + auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::AdjacencyList socialGraph; - // generate vertices & edges from LDBC files and insert into socialGraph - ldbcImport.import(socialGraph); + // Graph init: + std::unique_ptr g1 = std::make_unique(); + + // generate vertices & edges from LDBC files and insert into graph + ldbcImport.import(g1); - // measuring time... + // measuring time: auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time std::chrono::duration elapsed = finish - start; - socialGraph.statistics(); + g1->statistics(); std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; /* // test vertices: - socialGraph.print_vertex_by_id(100454); - socialGraph.print_vertex_by_id(100450); - socialGraph.print_vertex_by_id(100168); - socialGraph.print_vertex_by_id(2000100); - */ + g1->print_vertex_by_id(100454); + g1->print_vertex_by_id(100450); + g1->print_vertex_by_id(100168); + g1->print_vertex_by_id(2000100); + */ // calculate size of social graph - std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; return 0; } \ No newline at end of file From b46ef6b7759eda6c0ecc5e02446546160b44e406 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 6 Aug 2019 13:31:24 +0200 Subject: [PATCH 058/216] little changes; --- include/core/storage/graph/ldbc_import.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index dc5103bf..41935c4e 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,7 +18,7 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph in CSR or AdjList - * @todo EDGE PROPERTIES ARE MISSING in fillVertexLookup(); OPTIMIZATIONS + * @todo EDGE-properties still missing!!! -> OPTIMIZATIONS */ #ifndef MORPHSTORE_LDBC_IMPORT_H @@ -59,7 +59,6 @@ namespace morphstore{ // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; - // for CSR data structure // map for lookup every system-id, the neighbors in the graph (for further processing, e.g. filling the edge_array in the right order) std::unordered_map>> vertexNeighborsLookup; @@ -538,7 +537,7 @@ namespace morphstore{ // WITHOUT properties: just from the first delimiter on toID = globalIdLookupMap.at({toEntity, row}); - // insert relation into vertexNeighborsLookup + // insert relation into vertexNeighborsLookup with has EdgeProperty = false vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); }else{ @@ -546,9 +545,9 @@ namespace morphstore{ toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // add to // TODO: DONT FORGET TO HANDLE PROPERTIES //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); + // insert relation into vertexNeighborsLookup with has EdgeProperty = TRUE vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); } } From 8cc735de682df71ddbe19d14fe453b02c7bd277c Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 7 Aug 2019 10:56:59 +0200 Subject: [PATCH 059/216] changing vertex class names --- include/core/storage/graph/formats/adjacencylist.h | 6 +++--- include/core/storage/graph/formats/csr.h | 8 ++++---- .../vertex/{avertex.h => adjacencylist_vertex.h} | 12 ++++++------ .../storage/graph/vertex/{cvertex.h => csr_vertex.h} | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) rename include/core/storage/graph/vertex/{avertex.h => adjacencylist_vertex.h} (93%) rename include/core/storage/graph/vertex/{cvertex.h => csr_vertex.h} (98%) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 59aaec8b..8fddda7c 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -25,7 +25,7 @@ #define MORPHSTORE_ADJACENCYLIST_H #include "../graph.h" -#include "../vertex/avertex.h" +#include "../vertex/adjacencylist_vertex.h" namespace morphstore{ @@ -47,13 +47,13 @@ namespace morphstore{ // adding a single vertex void add_vertex() override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(); vertices[v->getID()] = v; } // adding a vertex with its properties int add_vertex_with_properties(const std::unordered_map &props) override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; return v->getID(); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index fbe24000..5ac7c436 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -25,7 +25,7 @@ #define MORPHSTORE_CSR_H #include "../graph.h" -#include "../vertex/cvertex.h" +#include "../vertex/csr_vertex.h" namespace morphstore{ @@ -37,7 +37,7 @@ namespace morphstore{ * edge array: every cell contains pointer to edge object of vertex */ // TODO: construct a graph-topology struct ? - // TODO: free memory in destrcutor + // TODO: free memory in destructor uint64_t* node_array = nullptr; Edge** edge_array = nullptr; @@ -60,13 +60,13 @@ namespace morphstore{ // adding a single vertex (without any properties, etc...) void add_vertex() override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(); vertices[v->getID()] = v; } // adding a vertex with its properties int add_vertex_with_properties(const std::unordered_map& props ) override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; return v->getID(); diff --git a/include/core/storage/graph/vertex/avertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h similarity index 93% rename from include/core/storage/graph/vertex/avertex.h rename to include/core/storage/graph/vertex/adjacencylist_vertex.h index 4152802c..efd62fd2 100644 --- a/include/core/storage/graph/vertex/avertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -28,14 +28,14 @@ namespace morphstore{ - class AVertex: public Vertex{ + class AdjacencyListVertex: public Vertex{ protected: - std::vector adjList; + std::vector adjacencylist; public: // constructor with unique id generation - AVertex(){ + AdjacencyListVertex(){ // unique ID generation static uint64_t startID = 0; id = startID++; @@ -43,18 +43,18 @@ namespace morphstore{ // returns a reference (read-only) of the adjacency list const std::vector& get_adjList() const{ - return adjList; + return adjacencylist; } // add edge to vertexs' adjacencylist void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { Edge e(from, to, rel); - this->adjList.push_back(e); + this->adjacencylist.push_back(e); } // function which returns the number of edges uint64_t get_number_edges() override { - return adjList.size(); + return adjacencylist.size(); } /* old-calculation of vertex size diff --git a/include/core/storage/graph/vertex/cvertex.h b/include/core/storage/graph/vertex/csr_vertex.h similarity index 98% rename from include/core/storage/graph/vertex/cvertex.h rename to include/core/storage/graph/vertex/csr_vertex.h index 710d68ed..9b3024dc 100644 --- a/include/core/storage/graph/vertex/cvertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -26,11 +26,11 @@ namespace morphstore{ - class CVertex: public Vertex{ + class CSRVertex: public Vertex{ public: // constructor with unique id generation - CVertex(){ + CSRVertex(){ // unique ID generation static uint64_t startID = 0; id = startID++; From 4024db43124f4666a660cad635374f653838f1ad Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 7 Aug 2019 16:56:18 +0200 Subject: [PATCH 060/216] additional function; deleted stuff... --- include/core/storage/graph/graph.h | 5 +++++ include/core/storage/graph/ldbc_import.h | 10 ++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index de23f94d..0834c870 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -109,6 +109,11 @@ namespace morphstore{ return true; } + // function which returns a pointer to vertex by id + std::shared_ptr get_vertex_by_id(uint64_t id){ + return vertices[id]; + } + // -------------------- pure virtual functions -------------------- virtual storageFormat getStorageFormat() const = 0; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 41935c4e..e726b34a 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -594,14 +594,8 @@ namespace morphstore{ uint64_t graphSize = graph->getNumberVertices(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ - // Problem: there are some vertexIDs which have no entry in the lookup -> segmentation fault - // get the list of target vertices - std::vector> neighbors; - if(vertexNeighborsLookup[vertexID].size() != 0){ - neighbors = vertexNeighborsLookup.at(vertexID); - } - // VERTICES WITHOUT ANY EDGES -< TODO ? how to handle? - graph->add_edges(vertexID, neighbors); + // add edge data: + graph->add_edges(vertexID, vertexNeighborsLookup[vertexID]); } } From aaab4d5d59ed89ba10bdc8462b25ef66d5e53887 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 19 Aug 2019 15:50:53 +0200 Subject: [PATCH 061/216] Importing edge-property now working... --- include/core/storage/graph/edge/edge.h | 20 +++++++++- .../storage/graph/formats/adjacencylist.h | 12 +++--- include/core/storage/graph/formats/csr.h | 16 ++++---- include/core/storage/graph/graph.h | 3 +- include/core/storage/graph/ldbc_import.h | 39 ++++++++----------- .../graph/vertex/adjacencylist_vertex.h | 6 ++- .../core/storage/graph/vertex/csr_vertex.h | 5 +++ include/core/storage/graph/vertex/vertex.h | 3 ++ .../ldbc_graph_adjacencylist.cpp | 3 +- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 1 + 10 files changed, 66 insertions(+), 42 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index c84f86cf..8318ba25 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -24,6 +24,10 @@ #ifndef MORPHSTORE_EDGE_H #define MORPHSTORE_EDGE_H +#include +#include +#include + namespace morphstore{ class Edge{ @@ -36,8 +40,6 @@ namespace morphstore{ public: - Edge(){}; - // Constructor with parameters Edge(uint64_t from, uint64_t to, unsigned short int rel){ setSourceId(from); @@ -45,6 +47,13 @@ namespace morphstore{ setRelation(rel); } + Edge(uint64_t from, uint64_t to, unsigned short int rel, std::pair prop){ + setSourceId(from); + setTargetId(to); + setRelation(rel); + setProperty(prop); + } + // --------------- Getter and Setter --------------- uint64_t getSourceId() const { @@ -78,6 +87,13 @@ namespace morphstore{ void setProperty(const std::pair &property) { Edge::property = property; } + + // function for sorting algorithms in the importer: + // compare the target ids + bool operator<(const Edge& e) const + { + return getTargetId() < e.getTargetId(); + } }; } diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 8fddda7c..7b56b264 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -87,15 +87,13 @@ namespace morphstore{ } // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(const uint64_t source, std::vector> &listOfNeighbors) override { - if (exist_id(source)) { - if (listOfNeighbors.size() != 0) { - for (auto &pair : listOfNeighbors) { - vertices[source]->add_edge(source, pair.first, pair.second); - } + void add_edges(uint64_t sourceID, std::vector& relations) override { + if (exist_id(sourceID)) { + if (relations.size() != 0) { + vertices[sourceID]->add_edges(relations); } } else { - std::cout << "Vertex with ID " << source << " not found." << std::endl; + std::cout << "Vertex with ID " << sourceID << " not found." << std::endl; } } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 5ac7c436..8bbaf778 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -81,19 +81,19 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of neighbors - void add_edges(uint64_t source, std::vector>& listOfNeighbors) override { - if(source == 0){ - node_array[source] = 0; + void add_edges(uint64_t sourceID, std::vector& relations) override { + if(sourceID == 0){ + node_array[sourceID] = 0; } - uint64_t offset = node_array[source]; - uint64_t nextOffset = offset + listOfNeighbors.size(); + uint64_t offset = node_array[sourceID]; + uint64_t nextOffset = offset + relations.size(); - for(auto const& pair : listOfNeighbors){ - Edge* e = new Edge(source, pair.first, pair.second); + for(auto & edge : relations){ + Edge* e = &edge; edge_array[offset] = e; ++offset; } - node_array[source+1] = nextOffset; + node_array[sourceID+1] = nextOffset; } // function to add a single property to vertex diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 0834c870..1f2eab96 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -25,6 +25,7 @@ #define MORPHSTORE_GRAPH_H #include "vertex/vertex.h" +#include "edge/edge.h" #include #include @@ -123,7 +124,7 @@ namespace morphstore{ virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t source, std::vector>& listOfNeighbors) = 0; + virtual void add_edges(uint64_t sourceID, std::vector& relations) = 0; virtual uint64_t get_number_edges(uint64_t id) = 0; // -------------------- debugging functions -------------------- diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e726b34a..3e786de9 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -59,8 +59,8 @@ namespace morphstore{ // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; - // map for lookup every system-id, the neighbors in the graph (for further processing, e.g. filling the edge_array in the right order) - std::unordered_map>> vertexNeighborsLookup; + // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) + std::unordered_map> vertexRelationsLookup; public: @@ -252,7 +252,7 @@ namespace morphstore{ entitiesLookup.clear(); relationsPaths.clear(); verticesPaths.clear(); - vertexNeighborsLookup.clear(); + vertexRelationsLookup.clear(); } // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the entity lookup creation) @@ -401,7 +401,7 @@ namespace morphstore{ } // this function reads the relation-files and generates edges in graph - void fill_vertexNeighborsLookup(std::unique_ptr& graph){ + void fill_vertexRelationsLookup(std::unique_ptr& graph){ if(!relationsPaths.empty()) { //std::cout << "(2/2) Generating LDBC-Edges ..."; @@ -537,18 +537,17 @@ namespace morphstore{ // WITHOUT properties: just from the first delimiter on toID = globalIdLookupMap.at({toEntity, row}); - // insert relation into vertexNeighborsLookup with has EdgeProperty = false - vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + // insert relation into vertexRealtionsLookup: + vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // TODO: DONT FORGET TO HANDLE PROPERTIES - //graph.add_edge_with_property(fromID, toID, relationNumber, {propertyKey, value}); - // insert relation into vertexNeighborsLookup with has EdgeProperty = TRUE - vertexNeighborsLookup[fromID].push_back({toID, relationNumber}); + + // insert relation into vertexRealtionsLookup with its edge-property: + vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber, {propertyKey, value})); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -574,28 +573,24 @@ namespace morphstore{ } } - // function for sorting the vertexNeighborsLookup ASC in CSR - void sort_VertexNeighborsLookup(){ + // function for sorting the vertexRelationsLookup ASC (needed in CSR) + void sort_VertexRelationsLookup(){ // sorting the first element of the pair (target-id) - for(auto &it: vertexNeighborsLookup){ - std::sort(it.second.begin(), it.second.end()); + for(auto &rel: vertexRelationsLookup){ + std::sort(rel.second.begin(), rel.second.end()); } } - /* TODO: write function, that additionally fills the VertexNeighborsLookup with ids, that have no neighbors - * After that, we can iterate through the lookup and can sequently fill the CSR or AdjacencyList - * */ - - // this function writes the actual data from the intermediate vertexNeighborsLookup + // this function writes the actual data from the intermediate vertexRelationsLookup void generate_edges(std::unique_ptr& graph){ // firstly, sorting the intermediates with their target IDs ASC - sort_VertexNeighborsLookup(); + sort_VertexRelationsLookup(); uint64_t graphSize = graph->getNumberVertices(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ // add edge data: - graph->add_edges(vertexID, vertexNeighborsLookup[vertexID]); + graph->add_edges(vertexID, vertexRelationsLookup[vertexID]); } } @@ -615,7 +610,7 @@ namespace morphstore{ generate_vertices(graph); // (4) read relations and write to intermediate results - fill_vertexNeighborsLookup(graph); + fill_vertexRelationsLookup(graph); // (5) read intermediates and write edges generate_edges(graph); diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index efd62fd2..ebcfbcda 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -46,12 +46,16 @@ namespace morphstore{ return adjacencylist; } - // add edge to vertexs' adjacencylist void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { Edge e(from, to, rel); this->adjacencylist.push_back(e); } + // add edges to vertexs' adjacencylist + void add_edges(std::vector& edges) override { + this->adjacencylist = edges; + } + // function which returns the number of edges uint64_t get_number_edges() override { return adjacencylist.size(); diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index 9b3024dc..06aa7769 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -42,6 +42,11 @@ namespace morphstore{ std::cout << " virtual add_edge - no usage: " << from << ", " << to << ", " << rel << std::endl; } + // pure function -> no functionality + void add_edges(std::vector& edges) override { + std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; + } + /* old-calculation of size of a vertex in bytes size_t get_size_of_vertex() { size_t size = 0; diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 51048c87..79fe67ee 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -24,6 +24,8 @@ #ifndef MORPHSTORE_VERTEX_H #define MORPHSTORE_VERTEX_H +#include "../edge/edge.h" + #include #include #include @@ -77,6 +79,7 @@ namespace morphstore{ // ----------------- (pure) virtual functions ----------------- + virtual void add_edges(std::vector& edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual uint64_t get_number_edges(){ diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 08b784b4..300abe15 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -27,7 +27,8 @@ int main( void ){ - // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- + std::cout << "\n"; std::cout << "*********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: AdjacencyList-storage-format *" << std::endl; std::cout << "*********************************************************" << std::endl; diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index b2ad2002..cfd64ff1 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -28,6 +28,7 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + std::cout << "\n"; std::cout << "***********************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: CSR-storage-format *" << std::endl; std::cout << "***********************************************" << std::endl; From f40cc028ddfa2a31cdd75881dbcddc633fb444f7 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 20 Aug 2019 12:03:03 +0200 Subject: [PATCH 062/216] tidy-up some comments... --- include/core/storage/graph/edge/edge.h | 8 ++++---- include/core/storage/graph/graph.h | 2 +- include/core/storage/graph/ldbc_import.h | 12 ++++++++---- .../core/storage/graph/vertex/adjacencylist_vertex.h | 5 +++-- include/core/storage/graph/vertex/vertex.h | 2 +- .../graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 6 +++--- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 6 +++--- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 8318ba25..a3f9a03c 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -17,7 +17,7 @@ /** * @file edge.h - * @brief Edge class which represents a relationship + * @brief Edge class which represents a relationship between 2 Vertices * @todo */ @@ -40,7 +40,7 @@ namespace morphstore{ public: - // Constructor with parameters + // Constructors with parameters Edge(uint64_t from, uint64_t to, unsigned short int rel){ setSourceId(from); setTargetId(to); @@ -88,8 +88,8 @@ namespace morphstore{ Edge::property = property; } - // function for sorting algorithms in the importer: - // compare the target ids + // function for sorting algorithms in the ldbc-importer: + // compare target-ids and return if it's "lower" (we need the sorting for the CSR) bool operator<(const Edge& e) const { return getTargetId() < e.getTargetId(); diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 1f2eab96..25f096d5 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -18,7 +18,7 @@ /** * @file graph.h * @brief abstract graph class for any storage format --> CSR,ADJ - * @todo graph-size calculation is missing + * @todo graph-size calculation!! */ #ifndef MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 3e786de9..54aeb851 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,7 +18,7 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph in CSR or AdjList - * @todo EDGE-properties still missing!!! -> OPTIMIZATIONS + * @todo */ #ifndef MORPHSTORE_LDBC_IMPORT_H @@ -64,6 +64,7 @@ namespace morphstore{ public: + // Constructor: needs the address of the csv files LDBCImport(const std::string &dir) { directory = dir; insert_file_names(directory); @@ -98,6 +99,7 @@ namespace morphstore{ } // this function reads the vertices-files and creates vertices in a graph + // + creates the entityLookup (number to string) for the graph void generate_vertices(std::unique_ptr& graph) { if (!verticesPaths.empty()) { @@ -400,7 +402,8 @@ namespace morphstore{ return result; } - // this function reads the relation-files and generates edges in graph + // this function reads the relation-files and fills the intermediate: vertexRelationLookup + // + creates the relationLookup (number to string) for the graph void fill_vertexRelationsLookup(std::unique_ptr& graph){ if(!relationsPaths.empty()) { @@ -574,6 +577,7 @@ namespace morphstore{ } // function for sorting the vertexRelationsLookup ASC (needed in CSR) + // sorting for every vertex its vector list with target-ids ASC void sort_VertexRelationsLookup(){ // sorting the first element of the pair (target-id) for(auto &rel: vertexRelationsLookup){ @@ -581,7 +585,7 @@ namespace morphstore{ } } - // this function writes the actual data from the intermediate vertexRelationsLookup + // this function writes the actual data from the intermediate vertexRelationsLookup into the graph void generate_edges(std::unique_ptr& graph){ // firstly, sorting the intermediates with their target IDs ASC sort_VertexRelationsLookup(); @@ -594,7 +598,7 @@ namespace morphstore{ } } - // main import function: see steps in comments + // MAIN import function: see steps in comments void import(std::unique_ptr& graph) { std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index ebcfbcda..6e55ff53 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -17,8 +17,8 @@ /** * @file avertex.h - * @brief Derived vertex calss for ADJ_LIST storage format - * @todo change adjlist (vector of Edges) to vector of Edge* ????? + * @brief Derived vertex calss for ADJ_LIST storage format: base-class: vertex + * @todo */ #ifndef MORPHSTORE_AVERTEX_H @@ -46,6 +46,7 @@ namespace morphstore{ return adjacencylist; } + // function to add a single edge to vertexs adjlist void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { Edge e(from, to, rel); this->adjacencylist.push_back(e); diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 79fe67ee..06638dee 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -17,7 +17,7 @@ /** * @file vertex.h - * @brief abstract vertex class: CSR has cVertex; ADJ-List has aVertex + * @brief abstract vertex class for storage formats * @todo add vertex size calculation */ diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 300abe15..715df15d 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -29,9 +29,9 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- std::cout << "\n"; - std::cout << "*********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: AdjacencyList-storage-format *" << std::endl; - std::cout << "*********************************************************" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; std::cout << "\n"; auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index cfd64ff1..e194ed7a 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -29,9 +29,9 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ std::cout << "\n"; - std::cout << "***********************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: CSR-storage-format *" << std::endl; - std::cout << "***********************************************" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: Compressed Row Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; std::cout << "\n"; auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time From 73efd01afca51834bb936f2d6e0e3821464060af Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 21 Aug 2019 18:05:33 +0200 Subject: [PATCH 063/216] fixed SegFaults of Edge-Properties; Finally working but slowly... --- include/core/storage/graph/edge/edge.h | 31 +++++++++++++++-- .../storage/graph/formats/adjacencylist.h | 6 +++- include/core/storage/graph/formats/csr.h | 34 +++++++++++++------ include/core/storage/graph/graph.h | 4 ++- .../graph/vertex/adjacencylist_vertex.h | 10 +++++- .../core/storage/graph/vertex/csr_vertex.h | 8 +++-- include/core/storage/graph/vertex/vertex.h | 3 +- 7 files changed, 77 insertions(+), 19 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index a3f9a03c..371810ab 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -54,6 +54,30 @@ namespace morphstore{ setProperty(prop); } + // Copy constructor + Edge(const Edge& edge){ + setSourceId(edge.sourceID); + setTargetId(edge.targetID); + setRelation(edge.relation); + setProperty(edge.property); + } + + // this is needed for csr when doing edge_array[offset] = edge... + Edge& operator= (const Edge &edge){ + // self-assignment guard + if (this == &edge) + return *this; + + // do the copy + setSourceId(edge.sourceID); + setTargetId(edge.targetID); + setRelation(edge.relation); + setProperty(edge.property); + + // return the existing object so we can chain this operator + return *this; + } + // --------------- Getter and Setter --------------- uint64_t getSourceId() const { @@ -84,8 +108,11 @@ namespace morphstore{ return property; } - void setProperty(const std::pair &property) { - Edge::property = property; + void setProperty(const std::pair &prop) { + // first check if there is any key value data, otherwise problems with segfaults + if(prop.first != "" && prop.second != ""){ + Edge::property = prop; + } } // function for sorting algorithms in the ldbc-importer: diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 7b56b264..4804936b 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -87,7 +87,7 @@ namespace morphstore{ } // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(uint64_t sourceID, std::vector& relations) override { + void add_edges(uint64_t sourceID, const std::vector& relations) override { if (exist_id(sourceID)) { if (relations.size() != 0) { vertices[sourceID]->add_edges(relations); @@ -97,6 +97,10 @@ namespace morphstore{ } } + void print_neighbors_of_vertex(uint64_t id) override{ + vertices[id]->print_neighbors(); + } + // get number of neighbors of vertex with id uint64_t get_number_edges(uint64_t id) override { return vertices[id]->get_number_edges(); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 8bbaf778..610d62db 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -39,7 +39,7 @@ namespace morphstore{ // TODO: construct a graph-topology struct ? // TODO: free memory in destructor uint64_t* node_array = nullptr; - Edge** edge_array = nullptr; + Edge* edge_array = nullptr; public: @@ -55,7 +55,10 @@ namespace morphstore{ vertices.reserve(numberVertices); node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); - edge_array = (Edge**) malloc(numberEdges * sizeof(Edge*)); + edge_array = (Edge*) malloc(numberEdges * sizeof(Edge)); + + // init node array: + node_array[0] = 0; } // adding a single vertex (without any properties, etc...) @@ -81,19 +84,19 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of neighbors - void add_edges(uint64_t sourceID, std::vector& relations) override { - if(sourceID == 0){ - node_array[sourceID] = 0; - } + void add_edges(uint64_t sourceID, const std::vector& relations) override { uint64_t offset = node_array[sourceID]; uint64_t nextOffset = offset + relations.size(); - for(auto & edge : relations){ - Edge* e = &edge; - edge_array[offset] = e; + for(const auto & edge : relations){ + edge_array[offset] = edge; ++offset; } - node_array[sourceID+1] = nextOffset; + + + if(sourceID < getNumberVertices()-1){ + node_array[sourceID+1] = nextOffset; + } } // function to add a single property to vertex @@ -122,8 +125,17 @@ namespace morphstore{ return numberEdges; } + void print_neighbors_of_vertex(uint64_t id) override{ + uint64_t offset = node_array[id]; + uint64_t numberEdges = get_number_edges(id); + + for(uint64_t i = offset; i < offset+numberEdges; ++i){ + std::cout << "Source-ID: " << edge_array[i].getSourceId() << " - Target-ID: " << edge_array[i].getTargetId() << " - Property: { " << edge_array[i].getProperty().first << ": " << edge_array[i].getProperty().second << " }" << " || "; + } + } + /* old-calculation of the graph size in bytes - size_t get_size_of_graph(){ + * size_t get_size_of_graph(){ size_t size = 0; // pointer to arrays: size += sizeof(uint64_t*) * 2 + sizeof(unsigned short int*); diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 25f096d5..c6df8b9e 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -124,8 +124,10 @@ namespace morphstore{ virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, std::vector& relations) = 0; + virtual void add_edges(uint64_t sourceID, const std::vector& relations) = 0; virtual uint64_t get_number_edges(uint64_t id) = 0; + // for debugging + virtual void print_neighbors_of_vertex(uint64_t id) = 0; // -------------------- debugging functions -------------------- diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 6e55ff53..dc7f0f27 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -53,7 +53,7 @@ namespace morphstore{ } // add edges to vertexs' adjacencylist - void add_edges(std::vector& edges) override { + void add_edges(const std::vector& edges) override { this->adjacencylist = edges; } @@ -62,6 +62,14 @@ namespace morphstore{ return adjacencylist.size(); } + void print_neighbors() override { + for(const auto& edge : adjacencylist){ + std::cout << "Source-ID: " << edge.getSourceId() << " - Target-ID: " << edge.getTargetId() << + " - Property: { " << edge.getProperty().first << ": " << edge.getProperty().second << " }" << " || "; + } + } + + /* old-calculation of vertex size size_t get_size_of_vertex() { size_t size = 0; diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index 06aa7769..b5b7aab8 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -37,16 +37,20 @@ namespace morphstore{ } // this function has no usage here: the adding of edges happens in the graph file -> csr.h - // it's just here because its a pure function in Vertex.h + // it's just here because its a pure function in Vertex.h void add_edge(uint64_t from, uint64_t to,unsigned short int rel) override { std::cout << " virtual add_edge - no usage: " << from << ", " << to << ", " << rel << std::endl; } // pure function -> no functionality - void add_edges(std::vector& edges) override { + void add_edges(const std::vector& edges) override { std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; } + void print_neighbors() override { + std::cout << " virtual print_neighbors - no usage: " << std::endl; + } + /* old-calculation of size of a vertex in bytes size_t get_size_of_vertex() { size_t size = 0; diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 06638dee..14bcc085 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -79,8 +79,9 @@ namespace morphstore{ // ----------------- (pure) virtual functions ----------------- - virtual void add_edges(std::vector& edges) = 0; + virtual void add_edges(const std::vector& edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; + virtual void print_neighbors() = 0; virtual uint64_t get_number_edges(){ return 0; From 85b7da83fc8ae69c44c5f1f185d90047b59d43fc Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 26 Aug 2019 11:16:31 +0200 Subject: [PATCH 064/216] tidy up --- include/core/storage/graph/formats/adjacencylist.h | 2 +- include/core/storage/graph/formats/csr.h | 4 ++-- include/core/storage/graph/graph.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 4804936b..9ea22f9d 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -102,7 +102,7 @@ namespace morphstore{ } // get number of neighbors of vertex with id - uint64_t get_number_edges(uint64_t id) override { + uint64_t get_degree(uint64_t id) override { return vertices[id]->get_number_edges(); } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 610d62db..df7473fb 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -93,7 +93,7 @@ namespace morphstore{ ++offset; } - + // to avoid segfualt: if(sourceID < getNumberVertices()-1){ node_array[sourceID+1] = nextOffset; } @@ -118,7 +118,7 @@ namespace morphstore{ } // get number of edges of vertex with id - uint64_t get_number_edges(uint64_t id) override { + uint64_t get_degree(uint64_t id) override { uint64_t offset = node_array[id]; uint64_t nextOffset = node_array[id+1]; uint64_t numberEdges = nextOffset - offset; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index c6df8b9e..37a619aa 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -125,7 +125,7 @@ namespace morphstore{ virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector& relations) = 0; - virtual uint64_t get_number_edges(uint64_t id) = 0; + virtual uint64_t get_degree(uint64_t id) = 0; // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; From 4ca0cd2a3082e3a750ea7a305ab4ff234b902aa0 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 26 Aug 2019 12:20:00 +0200 Subject: [PATCH 065/216] for testing @TUD computers --- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 715df15d..f9780a37 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -36,7 +36,9 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + // when using workstation @ TUD: social network directory: "~/Downloads/social_network/" + //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImport ldbcImport("~/Downloads/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); From 543f64922a00cb52a8b9255e7d538d305c144f20 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 26 Aug 2019 12:54:00 +0200 Subject: [PATCH 066/216] function name wasn't updated --- include/core/storage/graph/graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 37a619aa..c5fd1c43 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -146,7 +146,7 @@ namespace morphstore{ std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); - std::cout << "#Edges: " << this->get_number_edges(v->getID()); + std::cout << "#Edges: " << this->get_degree(v->getID()); std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } From 7f253685b8ec9fc543d1dcd107273ce6df008466 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 26 Aug 2019 12:56:12 +0200 Subject: [PATCH 067/216] forgotten stuff... --- include/core/storage/graph/formats/csr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index df7473fb..d939d99b 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -127,7 +127,7 @@ namespace morphstore{ void print_neighbors_of_vertex(uint64_t id) override{ uint64_t offset = node_array[id]; - uint64_t numberEdges = get_number_edges(id); + uint64_t numberEdges = get_degree(id); for(uint64_t i = offset; i < offset+numberEdges; ++i){ std::cout << "Source-ID: " << edge_array[i].getSourceId() << " - Target-ID: " << edge_array[i].getTargetId() << " - Property: { " << edge_array[i].getProperty().first << ": " << edge_array[i].getProperty().second << " }" << " || "; From fc6bed3c283e50e03c4edd23e0665f45a4b6bf4f Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 26 Aug 2019 13:14:41 +0200 Subject: [PATCH 068/216] tu dresden pcs directories --- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 4 ++-- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index f9780a37..91c4d673 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -36,9 +36,9 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - // when using workstation @ TUD: social network directory: "~/Downloads/social_network/" + // when using workstation @ TUD: social network directory: "/home/s8069724/Dokumente/social_network/" //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::LDBCImport ldbcImport("~/Downloads/social_network/"); + morphstore::LDBCImport ldbcImport("/home/s8069724/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index e194ed7a..5c6be190 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -36,7 +36,9 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + // when using workstation @ TUD: social network directory: "/home/s8069724/Dokumente/social_network/" + //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); + morphstore::LDBCImport ldbcImport("/home/s8069724/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); From 459fa939cf9efa1b37d8bb3de70ea6ff211aa916 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 27 Aug 2019 10:12:36 +0200 Subject: [PATCH 069/216] testing on TUD PC --- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 5 +++-- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 91c4d673..20eb6f47 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -36,9 +36,9 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - // when using workstation @ TUD: social network directory: "/home/s8069724/Dokumente/social_network/" + // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::LDBCImport ldbcImport("/home/s8069724/Dokumente/social_network/"); + morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -60,6 +60,7 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ + g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 5c6be190..a8e31c67 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -36,9 +36,9 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - // when using workstation @ TUD: social network directory: "/home/s8069724/Dokumente/social_network/" + // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::LDBCImport ldbcImport("/home/s8069724/Dokumente/social_network/"); + morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -60,6 +60,7 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ + g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; From 76e41bc251109f2b803d4a7c8a627b845e9d54e8 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 27 Aug 2019 11:12:42 +0200 Subject: [PATCH 070/216] added smart pointer for ldbc-import object --- .../graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 7 +++---- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 20eb6f47..29bdaad9 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -37,14 +37,14 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" - //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + //morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); // generate vertices & edges from LDBC files and insert into graph - ldbcImport.import(g1); + ldbcImport->import(g1); // measuring time: auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -60,7 +60,6 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ - g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index a8e31c67..1257eacb 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -37,14 +37,14 @@ int main( void ){ auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" - //morphstore::LDBCImport ldbcImport("/opt/ldbc_snb_datagen-0.2.8/social_network/"); - morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + //morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); // generate vertices & edges from LDBC files and insert into graph - ldbcImport.import(g1); + ldbcImport->import(g1); // measuring time: auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -60,7 +60,6 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ - g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; From 8e83a2b1af9fe0215dddd26c2d00ad2f81d7b8b3 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 27 Aug 2019 17:10:11 +0200 Subject: [PATCH 071/216] added naive bfs-alg. --- include/core/storage/graph/formats/adjacencylist.h | 5 +++++ include/core/storage/graph/formats/csr.h | 12 ++++++++++++ include/core/storage/graph/graph.h | 1 + .../core/storage/graph/vertex/adjacencylist_vertex.h | 11 +++++++++++ include/core/storage/graph/vertex/vertex.h | 6 ++++++ test/CMakeLists.txt | 2 +- .../graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 7 +++++++ 7 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 9ea22f9d..1000aa5f 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -106,6 +106,11 @@ namespace morphstore{ return vertices[id]->get_number_edges(); } + // get the neighbors-ids into vector for BFS alg. + std::vector get_neighbors_ids(uint64_t id) override { + return vertices.at(id)->get_neighbors_ids(); + } + /* old-calculation of the graph size in bytes size_t get_size_of_graph(){ size_t size = 0; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index d939d99b..bfaf1350 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -134,6 +134,18 @@ namespace morphstore{ } } + // function to return a vector of ids of neighbors for BFS alg. + std::vector get_neighbors_ids(uint64_t id) override { + std::vector neighbors; + uint64_t offset = node_array[id]; + uint64_t numberEdges = get_degree(id); + + for(uint64_t i = offset; i < offset+numberEdges; ++i){ + neighbors.push_back(edge_array[i].getTargetId()); + } + return neighbors; + } + /* old-calculation of the graph size in bytes * size_t get_size_of_graph(){ size_t size = 0; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index c5fd1c43..eef6da8f 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -126,6 +126,7 @@ namespace morphstore{ virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector& relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; + virtual std::vector get_neighbors_ids(uint64_t id) = 0; // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index dc7f0f27..35999b62 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -69,6 +69,17 @@ namespace morphstore{ } } + // function to return a vector of neighbor ids (for BFS) + std::vector get_neighbors_ids() override { + std::vector neighbors; + for(auto const& edge : adjacencylist){ + neighbors.push_back(edge.getTargetId()); + } + return neighbors; + } + + + /* old-calculation of vertex size size_t get_size_of_vertex() { diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 14bcc085..b16f2949 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -87,6 +87,12 @@ namespace morphstore{ return 0; }; + // for BFS alg.: adj-list + virtual std::vector get_neighbors_ids() { + // return empty vector: implementation only needed in ADj-Vertex + return std::vector(); + } + // ----------------- DEBUGGING ----------------- void print_properties() { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c879fdbb..ee0cf4a2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) -add_subdirectory(core/storage/graph/adjacencylist) +add_subdirectory( core/storage/graph/adjacencylist ) add_subdirectory( core/storage/graph/csr ) add_subdirectory(vector) \ No newline at end of file diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 29bdaad9..d684ea0b 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -23,6 +23,8 @@ #include #include +#include + #include // for high_resolution_clock int main( void ){ @@ -60,9 +62,14 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ + g1->print_vertex_by_id(10000); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + // BFS TEST: + std::unique_ptr bfs = std::make_unique(g1); + bfs->doBFS(10000); + return 0; } \ No newline at end of file From e16657915437898842d953d5c95786f6429d0d72 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 27 Aug 2019 17:11:10 +0200 Subject: [PATCH 072/216] bfs impl. --- include/core/operators/graph/bfs_naive.h | 90 ++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 include/core/operators/graph/bfs_naive.h diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h new file mode 100644 index 00000000..03199c2c --- /dev/null +++ b/include/core/operators/graph/bfs_naive.h @@ -0,0 +1,90 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs.h + * @brief naive (simple) BFS implementation to traverse graph of type CSR OR AdjacencyList + * @todo implement optimized version of BFS -> now just for simplicity + */ + +#ifndef MORPHSTORE_BFS_NAIVE_H +#define MORPHSTORE_BFS_NAIVE_H + +#include "../../storage/graph/graph.h" + +#include + +namespace morphstore{ + + class BFS{ + + private: + std::unique_ptr graph; + uint64_t graphSize; + // Create a "visited" array (true or false) to keep track of if we visited a vertex. + std::vector visited = { false }; + std::vector layer; + // Create a queue for the nodes we visit. + std::queue queue; + + public: + + // constructor with smart pointer to graph as parameter + BFS(std::unique_ptr& g) : graph(std::move(g)){ + graphSize = graph->getNumberVertices(); + visited.resize(graphSize); + layer.resize(graphSize); + } + + void doBFS(uint64_t startVertex){ + std::cout << "BFS: starting from Vertex " << startVertex << std::endl; + + //uint64_t graphSize = graph.getNumberVertices(); + + queue.push(startVertex); + visited[startVertex] = true; + + layer[startVertex] = 0; + + while(!queue.empty()){ + uint64_t currentVertex = queue.front(); + queue.pop(); + + std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; + + // get neighbors of current vertex + std::vector neighbors = graph->get_neighbors_ids(currentVertex); + + // Loop through all of neighbors of current vertex + for(uint64_t i = 0; i < neighbors.size(); i++){ + uint64_t neighbor = neighbors[i]; + + // If the neighbor hasn't been visited yet, add it to the queue and mark it as visited + if(!visited[neighbor]){ + queue.push(neighbor); + layer[neighbor] = layer[currentVertex] +1; + visited[neighbor] = true; + } + } + } + + } + + }; +} + +#endif //MORPHSTORE_BFS_NAIVE_H From ec98ee24ca0e4360371b18f6b0994b637b3f6538 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 28 Aug 2019 12:46:31 +0200 Subject: [PATCH 073/216] added time measurement for BFS --- include/core/operators/graph/bfs_naive.h | 8 ++--- include/core/storage/graph/ldbc_import.h | 6 ++-- .../ldbc_graph_adjacencylist.cpp | 29 ++++++++++----- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 36 ++++++++++++++----- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h index 03199c2c..cd431798 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/bfs_naive.h @@ -51,9 +51,7 @@ namespace morphstore{ } void doBFS(uint64_t startVertex){ - std::cout << "BFS: starting from Vertex " << startVertex << std::endl; - - //uint64_t graphSize = graph.getNumberVertices(); + //std::cout << "BFS: starting from Vertex " << startVertex << std::endl; queue.push(startVertex); visited[startVertex] = true; @@ -64,7 +62,7 @@ namespace morphstore{ uint64_t currentVertex = queue.front(); queue.pop(); - std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; + //std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; // get neighbors of current vertex std::vector neighbors = graph->get_neighbors_ids(currentVertex); @@ -73,7 +71,7 @@ namespace morphstore{ for(uint64_t i = 0; i < neighbors.size(); i++){ uint64_t neighbor = neighbors[i]; - // If the neighbor hasn't been visited yet, add it to the queue and mark it as visited + // check if neighbor has been visited, if not -> put into queue and mark as visit = true if(!visited[neighbor]){ queue.push(neighbor); layer[neighbor] = layer[currentVertex] +1; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 54aeb851..e0af8b9f 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -600,8 +600,8 @@ namespace morphstore{ // MAIN import function: see steps in comments void import(std::unique_ptr& graph) { - std::cout << "Importing LDBC-files into graph ... "; - std::cout.flush(); + //std::cout << "Importing LDBC-files into graph ... "; + //std::cout.flush(); // (1) get number vertices and number edges: uint64_t numberVertices = get_total_number_vertices(); @@ -622,7 +622,7 @@ namespace morphstore{ // (6) clear intermediates clear_intermediates(); - std::cout << "--> done" << std::endl; + //std::cout << "--> done" << std::endl; } }; } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index d684ea0b..69eef531 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -30,30 +30,32 @@ int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- + /* std::cout << "\n"; std::cout << "**********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; + */ - auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - - // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); - //morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); + // start measuring import time: + auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + // generate vertices & edges from LDBC files and insert into graph ldbcImport->import(g1); // measuring time: - auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time - std::chrono::duration elapsed = finish - start; + auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + - g1->statistics(); - std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; + //g1->statistics(); + std::cout << "Import: " << elapsedImportTime << " millisec.\n"; /* // test vertices: @@ -62,14 +64,23 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ - g1->print_vertex_by_id(10000); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; // BFS TEST: std::unique_ptr bfs = std::make_unique(g1); + + // start measuring bfs time: + auto startBFSTime = std::chrono::high_resolution_clock::now(); + + // actual algorithm bfs->doBFS(10000); + // measuring time: + auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); + std::cout << "BFS: " << elapsedBFSTime << " millisec.\n"; + return 0; } \ No newline at end of file diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 1257eacb..11358ea6 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -23,35 +23,39 @@ #include #include +#include + #include // for high_resolution_clock int main( void ){ // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + /* std::cout << "\n"; std::cout << "**********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: Compressed Row Storage Format *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; + */ - auto start = std::chrono::high_resolution_clock::now(); // For measuring the execution time - - // when using workstation @ TUD: social network directory: "/home/s8069724/s8069724-home/Dokumente/social_network/" std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); - //morphstore::LDBCImport ldbcImport("/home/s8069724/s8069724-home/Dokumente/social_network/"); // Graph init: std::unique_ptr g1 = std::make_unique(); + // start measuring import time: + auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + // generate vertices & edges from LDBC files and insert into graph ldbcImport->import(g1); // measuring time: - auto finish = std::chrono::high_resolution_clock::now(); // For measuring the execution time - std::chrono::duration elapsed = finish - start; + auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + - g1->statistics(); - std::cout << "Import & Graph-Generation Time: " << elapsed.count() << " sec.\n"; + //g1->statistics(); + std::cout << "Import: " << elapsedImportTime << " millisec.\n"; /* // test vertices: @@ -60,9 +64,23 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ - + // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + // BFS TEST: + std::unique_ptr bfs = std::make_unique(g1); + + // start measuring bfs time: + auto startBFSTime = std::chrono::high_resolution_clock::now(); + + // actual algorithm + bfs->doBFS(10000); + + // measuring time: + auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); + std::cout << "BFS: " << elapsedBFSTime << " millisec.\n"; + return 0; } \ No newline at end of file From 1f4673bac45fcc2020f60d78d12179955114afce Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 28 Aug 2019 14:58:01 +0200 Subject: [PATCH 074/216] graph size calculation (in bytes) --- include/core/storage/graph/edge/edge.h | 15 ++++++++++++-- .../storage/graph/formats/adjacencylist.h | 12 +++++------ include/core/storage/graph/formats/csr.h | 19 ++++++++---------- include/core/storage/graph/graph.h | 1 + .../graph/vertex/adjacencylist_vertex.h | 20 ++++++++----------- .../core/storage/graph/vertex/csr_vertex.h | 6 ++---- include/core/storage/graph/vertex/vertex.h | 1 + .../ldbc_graph_adjacencylist.cpp | 3 +++ .../core/storage/graph/csr/ldbc_graph_csr.cpp | 3 +++ 9 files changed, 45 insertions(+), 35 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 371810ab..6d533b66 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -117,10 +117,21 @@ namespace morphstore{ // function for sorting algorithms in the ldbc-importer: // compare target-ids and return if it's "lower" (we need the sorting for the CSR) - bool operator<(const Edge& e) const - { + bool operator<(const Edge& e) const{ return getTargetId() < e.getTargetId(); } + + size_t size_in_bytes() const{ + size_t size = 0; + size += sizeof(uint64_t) * 2; // source- and target-id + size += sizeof(unsigned short int); // relation + + // property: + size += sizeof(std::pair); + size += sizeof(char)*(property.first.length() + property.second.length()); + + return size; + } }; } diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 1000aa5f..57a636a0 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -27,6 +27,8 @@ #include "../graph.h" #include "../vertex/adjacencylist_vertex.h" +#include + namespace morphstore{ class AdjacencyList: public Graph { @@ -111,16 +113,14 @@ namespace morphstore{ return vertices.at(id)->get_neighbors_ids(); } - /* old-calculation of the graph size in bytes - size_t get_size_of_graph(){ + size_t get_size_of_graph() override { size_t size = 0; - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - size += it->second.get_size_of_vertex(); + size += sizeof(std::unordered_map>); + for(auto& it : vertices){ + size += it.second->get_size_of_vertex(); } return size; } - */ }; } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index bfaf1350..11adf035 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -146,26 +146,23 @@ namespace morphstore{ return neighbors; } - /* old-calculation of the graph size in bytes - * size_t get_size_of_graph(){ - size_t size = 0; + size_t get_size_of_graph() override { + size_t size = 0; // pointer to arrays: - size += sizeof(uint64_t*) * 2 + sizeof(unsigned short int*); + size += sizeof(uint64_t*) + sizeof(Edge*); // vertices: size += sizeof(uint64_t) * getNumberVertices(); // edges: - size += sizeof(uint64_t) * getNumberEdges(); - // val array: - size += sizeof(unsigned short int) * getNumberEdges(); - + for(uint64_t i = 0; i < getNumberEdges(); i++){ + size += edge_array[i].size_in_bytes(); + } // vertex map wth actual data: - for(std::unordered_map::iterator it = vertices.begin(); it != vertices.end(); ++it){ - size += it->second.get_size_of_vertex(); + for(auto& it : vertices){ + size += it.second->get_size_of_vertex(); } return size; } - */ }; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index eef6da8f..af76951c 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -129,6 +129,7 @@ namespace morphstore{ virtual std::vector get_neighbors_ids(uint64_t id) = 0; // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; + virtual size_t get_size_of_graph() = 0; // -------------------- debugging functions -------------------- diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 35999b62..ecb107b1 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -78,28 +78,24 @@ namespace morphstore{ return neighbors; } - - - - /* old-calculation of vertex size - size_t get_size_of_vertex() { + size_t get_size_of_vertex() override { size_t size = 0; size += sizeof(uint64_t); // id - // Adj.List: - for(const auto& e : adjList){ - size += e.size_in_bytes(); - } + size += sizeof(unsigned short int); // entity + // properties: size += sizeof(std::unordered_map); for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ size += sizeof(char)*(property->first.length() + property->second.length()); } - // entities: - size += sizeof(unsigned short int); + // Adj.List: + size += sizeof(std::vector); + for(const auto& e : adjacencylist){ + size += e.size_in_bytes(); + } return size; } - */ }; } diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index b5b7aab8..74dc70c0 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -51,8 +51,7 @@ namespace morphstore{ std::cout << " virtual print_neighbors - no usage: " << std::endl; } - /* old-calculation of size of a vertex in bytes - size_t get_size_of_vertex() { + size_t get_size_of_vertex() override { size_t size = 0; size += sizeof(uint64_t); // id // properties: @@ -60,12 +59,11 @@ namespace morphstore{ for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ size += sizeof(char)*(property->first.length() + property->second.length()); } - // entities: + // entity: size += sizeof(unsigned short int); return size; } - */ }; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index b16f2949..904852d7 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -82,6 +82,7 @@ namespace morphstore{ virtual void add_edges(const std::vector& edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void print_neighbors() = 0; + virtual size_t get_size_of_vertex() = 0; virtual uint64_t get_number_edges(){ return 0; diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 69eef531..c411bb7e 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -53,6 +53,9 @@ int main( void ){ auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + // size of graph in bytes: + size_t size = g1->get_size_of_graph(); + std::cout << "Size: " << size << " bytes\n"; //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 11358ea6..49633f80 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -53,6 +53,9 @@ int main( void ){ auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + // size of graph in bytes: + size_t size = g1->get_size_of_graph(); + std::cout << "Size: " << size << " bytes\n"; //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; From 7b494a7e382bb021019d9c39072913ac674b7594 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 30 Aug 2019 10:29:26 +0200 Subject: [PATCH 075/216] social_network directory for DB Server --- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 4 +++- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index c411bb7e..6d4697f2 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -38,7 +38,9 @@ int main( void ){ std::cout << "\n"; */ - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 49633f80..d1184f4c 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -38,7 +38,9 @@ int main( void ){ std::cout << "\n"; */ - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); From 7239760eb9990e9a7c5d098c7ab45e4c45266b38 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Mon, 2 Sep 2019 10:14:17 +0200 Subject: [PATCH 076/216] directory of sn files - switch --- .../graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 6 +++--- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 6d4697f2..8d1036c4 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -38,9 +38,9 @@ int main( void ){ std::cout << "\n"; */ - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index d1184f4c..8f3126e5 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -39,8 +39,8 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); From 367d71ab903bd3da283f03034f3f1f682dc18f5f Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 3 Sep 2019 10:52:01 +0200 Subject: [PATCH 077/216] added bfs measurement --- include/core/operators/graph/bfs_naive.h | 56 ++++++++++++++++--- test/CMakeLists.txt | 2 +- .../ldbc_graph_adjacencylist.cpp | 14 +---- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 5 +- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h index cd431798..2037c8fc 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/bfs_naive.h @@ -18,7 +18,7 @@ /** * @file bfs.h * @brief naive (simple) BFS implementation to traverse graph of type CSR OR AdjacencyList - * @todo implement optimized version of BFS -> now just for simplicity + * @todo implement vectorized BFS (AVX2, AVX-512) */ #ifndef MORPHSTORE_BFS_NAIVE_H @@ -27,6 +27,7 @@ #include "../../storage/graph/graph.h" #include +#include namespace morphstore{ @@ -37,7 +38,7 @@ namespace morphstore{ uint64_t graphSize; // Create a "visited" array (true or false) to keep track of if we visited a vertex. std::vector visited = { false }; - std::vector layer; + //std::vector layer; // Create a queue for the nodes we visit. std::queue queue; @@ -47,16 +48,22 @@ namespace morphstore{ BFS(std::unique_ptr& g) : graph(std::move(g)){ graphSize = graph->getNumberVertices(); visited.resize(graphSize); - layer.resize(graphSize); + //layer.resize(graphSize); } - void doBFS(uint64_t startVertex){ - //std::cout << "BFS: starting from Vertex " << startVertex << std::endl; + uint64_t get_graph_size(){ + return graphSize; + } + + // actual BFS (naive) algorithm: takes the start-node id and returns the number of explored vertices + uint64_t doBFS(uint64_t startVertex){ + + uint64_t exploredVertices = 0; queue.push(startVertex); visited[startVertex] = true; - layer[startVertex] = 0; + //layer[startVertex] = 0; while(!queue.empty()){ uint64_t currentVertex = queue.front(); @@ -68,18 +75,51 @@ namespace morphstore{ std::vector neighbors = graph->get_neighbors_ids(currentVertex); // Loop through all of neighbors of current vertex - for(uint64_t i = 0; i < neighbors.size(); i++){ + for(uint64_t i = 0; i < neighbors.size(); ++i){ uint64_t neighbor = neighbors[i]; // check if neighbor has been visited, if not -> put into queue and mark as visit = true if(!visited[neighbor]){ queue.push(neighbor); - layer[neighbor] = layer[currentVertex] +1; + //layer[neighbor] = layer[currentVertex] +1; visited[neighbor] = true; + ++exploredVertices; } } } + return exploredVertices; + } + + // this function sets every cell to false in visited array + void clear_visited_array(){ + std::fill(visited.begin(), visited.end(), false); + } + + // function that measures for every vertex the number and time of explored vertices in BFS + // writes results to local file for further analysis + void do_measurements(){ + std::ofstream fs; + std::string filename = "/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv"; + // open file for writing and delete existing stuff: + fs.open(filename, std::fstream::out | std::ofstream::trunc); + + for(uint64_t i = 0; i < graphSize; ++i){ + // start measuring bfs time: + auto startBFSTime = std::chrono::high_resolution_clock::now(); + + uint64_t exploredVertices = doBFS(i); + + auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); + + // set every entry in visited array back to { false } + clear_visited_array(); + + // write to file + fs << exploredVertices << "," << elapsedBFSTime << "\n"; + } + fs.close(); } }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ee0cf4a2..fd0a412d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,6 +6,6 @@ add_subdirectory( core/storage ) add_subdirectory( core/utils ) add_subdirectory( core/storage/graph/adjacencylist ) -add_subdirectory( core/storage/graph/csr ) +#add_subdirectory( core/storage/graph/csr ) add_subdirectory(vector) \ No newline at end of file diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 8d1036c4..620e6fff 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -26,6 +26,7 @@ #include #include // for high_resolution_clock +#include int main( void ){ @@ -73,19 +74,8 @@ int main( void ){ // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; - // BFS TEST: std::unique_ptr bfs = std::make_unique(g1); - - // start measuring bfs time: - auto startBFSTime = std::chrono::high_resolution_clock::now(); - - // actual algorithm - bfs->doBFS(10000); - - // measuring time: - auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); - std::cout << "BFS: " << elapsedBFSTime << " millisec.\n"; + bfs->do_measurements(); return 0; } \ No newline at end of file diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 8f3126e5..a52c8351 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -74,18 +74,19 @@ int main( void ){ //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; // BFS TEST: + /* std::unique_ptr bfs = std::make_unique(g1); // start measuring bfs time: auto startBFSTime = std::chrono::high_resolution_clock::now(); // actual algorithm - bfs->doBFS(10000); + uint64_t exploredV = bfs->doBFS(10000); // measuring time: auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); - std::cout << "BFS: " << elapsedBFSTime << " millisec.\n"; + */ return 0; } \ No newline at end of file From f337f0749435d948c2218d7e8c5ac8b691b8a20a Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Tue, 3 Sep 2019 13:32:09 +0200 Subject: [PATCH 078/216] added intermediate array before writing to file --- include/core/operators/graph/bfs_naive.h | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h index 2037c8fc..ea4ac56f 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/bfs_naive.h @@ -56,7 +56,7 @@ namespace morphstore{ } // actual BFS (naive) algorithm: takes the start-node id and returns the number of explored vertices - uint64_t doBFS(uint64_t startVertex){ + uint64_t do_BFS(uint64_t startVertex){ uint64_t exploredVertices = 0; @@ -99,16 +99,15 @@ namespace morphstore{ // writes results to local file for further analysis void do_measurements(){ - std::ofstream fs; - std::string filename = "/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv"; - // open file for writing and delete existing stuff: - fs.open(filename, std::fstream::out | std::ofstream::trunc); + // Intermediate data structure: + // size = graphSize*2, because we sequentially store both results (exploredVertices, needed Time) for every vertex + uint64_t* results = new uint64_t[graphSize*2]; for(uint64_t i = 0; i < graphSize; ++i){ // start measuring bfs time: auto startBFSTime = std::chrono::high_resolution_clock::now(); - uint64_t exploredVertices = doBFS(i); + uint64_t exploredVertices = do_BFS(i); auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); @@ -116,10 +115,24 @@ namespace morphstore{ // set every entry in visited array back to { false } clear_visited_array(); - // write to file - fs << exploredVertices << "," << elapsedBFSTime << "\n"; + // write to intermediate array: + results[i*2] = exploredVertices; + results[i*2+1] = elapsedBFSTime; + } + + // WRITE INTERMEDIATES TO FILE: + std::ofstream fs; + std::string filename = "/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv"; + // open file for writing and delete existing stuff: + fs.open(filename, std::fstream::out | std::ofstream::trunc); + + for(uint64_t j = 0; j < graphSize*2; ++j){ + fs << results[j] << "," << results[j+1] << "\n"; + ++j; } + fs.close(); + delete [] results; } }; From b8d9527155994a5e0cc772b819d9dc4541cccd2d Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 14:40:55 +0200 Subject: [PATCH 079/216] some fixes --- include/core/operators/graph/bfs_naive.h | 24 +++++++++++++------ include/core/storage/graph/edge/edge.h | 4 ++++ .../storage/graph/formats/adjacencylist.h | 2 +- include/core/storage/graph/formats/csr.h | 2 +- include/core/storage/graph/graph.h | 2 +- .../ldbc_graph_adjacencylist.cpp | 9 +++++++ 6 files changed, 33 insertions(+), 10 deletions(-) diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h index ea4ac56f..74cb6f7e 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/bfs_naive.h @@ -71,12 +71,9 @@ namespace morphstore{ //std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; - // get neighbors of current vertex - std::vector neighbors = graph->get_neighbors_ids(currentVertex); - // Loop through all of neighbors of current vertex - for(uint64_t i = 0; i < neighbors.size(); ++i){ - uint64_t neighbor = neighbors[i]; + for(uint64_t i = 0; i < graph->get_neighbors_ids(currentVertex).size(); ++i){ + uint64_t neighbor = graph->get_neighbors_ids(currentVertex)[i]; // check if neighbor has been visited, if not -> put into queue and mark as visit = true if(!visited[neighbor]){ @@ -101,7 +98,7 @@ namespace morphstore{ // Intermediate data structure: // size = graphSize*2, because we sequentially store both results (exploredVertices, needed Time) for every vertex - uint64_t* results = new uint64_t[graphSize*2]; + uint64_t* results = (uint64_t *) malloc(graphSize * 2 * sizeof(uint64_t)); for(uint64_t i = 0; i < graphSize; ++i){ // start measuring bfs time: @@ -118,20 +115,33 @@ namespace morphstore{ // write to intermediate array: results[i*2] = exploredVertices; results[i*2+1] = elapsedBFSTime; + + if(i % 1000 == 0) std::cout << "BFS" << i << " / " << graphSize << std::endl; } // WRITE INTERMEDIATES TO FILE: std::ofstream fs; + std::stringstream ss; std::string filename = "/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv"; // open file for writing and delete existing stuff: fs.open(filename, std::fstream::out | std::ofstream::trunc); for(uint64_t j = 0; j < graphSize*2; ++j){ - fs << results[j] << "," << results[j+1] << "\n"; + ss << results[j] << "," << results[j+1] << "\n"; ++j; } + fs << ss.str() ; fs.close(); + + /* + // NEW APPROACH + auto myfile = std::fstream("/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv", std::ios::out | std::ios::binary); + auto fileSize = graphSize * 2 * sizeof(uint64_t); + myfile.write((char*)&results[0], fileSize); + myfile.close(); + */ + delete [] results; } diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 6d533b66..0884d661 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -110,9 +110,13 @@ namespace morphstore{ void setProperty(const std::pair &prop) { // first check if there is any key value data, otherwise problems with segfaults + /* if(prop.first != "" && prop.second != ""){ Edge::property = prop; } + */ + Edge::property.first = prop.first; + Edge::property.second = prop.second; } // function for sorting algorithms in the ldbc-importer: diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 57a636a0..5d991f04 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -54,7 +54,7 @@ namespace morphstore{ } // adding a vertex with its properties - int add_vertex_with_properties(const std::unordered_map &props) override { + uint64_t add_vertex_with_properties(const std::unordered_map &props) override { std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 11adf035..fbd109af 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -68,7 +68,7 @@ namespace morphstore{ } // adding a vertex with its properties - int add_vertex_with_properties(const std::unordered_map& props ) override { + uint64_t add_vertex_with_properties(const std::unordered_map& props ) override { std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index af76951c..e163585d 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -120,7 +120,7 @@ namespace morphstore{ virtual storageFormat getStorageFormat() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; virtual void add_vertex() = 0; - virtual int add_vertex_with_properties(const std::unordered_map& props ) = 0; + virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 620e6fff..c55c883c 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -74,8 +74,17 @@ int main( void ){ // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + g1->print_vertex_by_id(0); + g1->print_vertex_by_id(10000); + g1->print_vertex_by_id(500000); + g1->print_vertex_by_id(1000000); + g1->print_vertex_by_id(2000000); + g1->print_vertex_by_id(3000000); + + /* std::unique_ptr bfs = std::make_unique(g1); bfs->do_measurements(); + */ return 0; } \ No newline at end of file From ed5bd3e4af2ab96d55b11ae4b228bcf73eb92be8 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 15:54:12 +0200 Subject: [PATCH 080/216] hopefully a fix --- .../core/storage/graph/formats/adjacencylist.h | 2 +- include/core/storage/graph/formats/csr.h | 2 +- include/core/storage/graph/graph.h | 15 ++++++++++++++- include/core/storage/graph/ldbc_import.h | 13 +++++++++++++ .../storage/graph/vertex/adjacencylist_vertex.h | 12 +++++++++--- include/core/storage/graph/vertex/csr_vertex.h | 2 +- include/core/storage/graph/vertex/vertex.h | 2 +- 7 files changed, 40 insertions(+), 8 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 5d991f04..20e9b036 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -89,7 +89,7 @@ namespace morphstore{ } // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(uint64_t sourceID, const std::vector& relations) override { + void add_edges(uint64_t sourceID, std::vector relations) override { if (exist_id(sourceID)) { if (relations.size() != 0) { vertices[sourceID]->add_edges(relations); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index fbd109af..2a315cd6 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -84,7 +84,7 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of neighbors - void add_edges(uint64_t sourceID, const std::vector& relations) override { + void add_edges(uint64_t sourceID, std::vector relations) override { uint64_t offset = node_array[sourceID]; uint64_t nextOffset = offset + relations.size(); diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index e163585d..21616629 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -124,7 +124,7 @@ namespace morphstore{ virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, const std::vector& relations) = 0; + virtual void add_edges(uint64_t sourceID, std::vector relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; // for debugging @@ -153,6 +153,19 @@ namespace morphstore{ std::cout << "-----------------------------------------------" << std::endl; } + void print_entity_relationship_dicts(){ + std::cout << "Entity-Dict: " << std::endl; + for(auto const& entry : entityDictionary){ + std::cout << entry.first << " -> " << entry.second << std::endl; + } + std::cout << "\n"; + + std::cout << "Relationship-Dict: " << std::endl; + for(auto const& rel : relationDictionary){ + std::cout << rel.first << " -> " << rel.second << std::endl; + } + } + }; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e0af8b9f..d825b420 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -624,6 +624,19 @@ namespace morphstore{ //std::cout << "--> done" << std::endl; } + + void print_vertex_rel_lookup(){ + std::cout << "VertexRealtionsLookup: " << std::endl; + + for(auto &rel: vertexRelationsLookup){ + for(auto& edge : rel.second){ + if(edge.getProperty().first != ""){ + std::cout << "Vertex-id: " << rel.first << " | "; + std::cout << "Edge-Property: " << edge.getProperty().first << ": " << edge.getProperty().second << "\n"; + } + } + } + } }; } diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index ecb107b1..3aa0f041 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -48,12 +48,18 @@ namespace morphstore{ // function to add a single edge to vertexs adjlist void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { - Edge e(from, to, rel); - this->adjacencylist.push_back(e); + this->adjacencylist.push_back(Edge(from, to, rel)); } // add edges to vertexs' adjacencylist - void add_edges(const std::vector& edges) override { + void add_edges(std::vector edges) override { + /* + adjacencylist.resize(edges.size()); + for(uint64_t i = 0; i < edges.size(); ++i){ + adjacencylist.push_back(edges[i]); + } + */ + this->adjacencylist = edges; } diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index 74dc70c0..f7b769da 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -43,7 +43,7 @@ namespace morphstore{ } // pure function -> no functionality - void add_edges(const std::vector& edges) override { + void add_edges(std::vector edges) override { std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 904852d7..6060e771 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -79,7 +79,7 @@ namespace morphstore{ // ----------------- (pure) virtual functions ----------------- - virtual void add_edges(const std::vector& edges) = 0; + virtual void add_edges(std::vector edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void print_neighbors() = 0; virtual size_t get_size_of_vertex() = 0; From a5e20d75826ccdf87e7d92ae34696a2a5d0e2243 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 22:29:35 +0200 Subject: [PATCH 081/216] finally edge-property working on server --- include/core/storage/graph/edge/edge.h | 14 ++++++++--- .../graph/vertex/adjacencylist_vertex.h | 17 ++++++++----- test/CMakeLists.txt | 4 +-- .../ldbc_graph_adjacencylist.cpp | 10 +++++--- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 25 +++++++++++-------- 5 files changed, 44 insertions(+), 26 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 0884d661..5a38c23d 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -26,6 +26,7 @@ #include #include +#include #include namespace morphstore{ @@ -54,6 +55,7 @@ namespace morphstore{ setProperty(prop); } + /* // Copy constructor Edge(const Edge& edge){ setSourceId(edge.sourceID); @@ -61,6 +63,7 @@ namespace morphstore{ setRelation(edge.relation); setProperty(edge.property); } + */ // this is needed for csr when doing edge_array[offset] = edge... Edge& operator= (const Edge &edge){ @@ -108,15 +111,18 @@ namespace morphstore{ return property; } - void setProperty(const std::pair &prop) { + void setProperty(const std::pair prop) { // first check if there is any key value data, otherwise problems with segfaults - /* + if(prop.first != "" && prop.second != ""){ - Edge::property = prop; + Edge::property.first = prop.first; + Edge::property.second = prop.second; } - */ + + /* Edge::property.first = prop.first; Edge::property.second = prop.second; + */ } // function for sorting algorithms in the ldbc-importer: diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 3aa0f041..fade02b9 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -53,13 +53,18 @@ namespace morphstore{ // add edges to vertexs' adjacencylist void add_edges(std::vector edges) override { - /* - adjacencylist.resize(edges.size()); - for(uint64_t i = 0; i < edges.size(); ++i){ - adjacencylist.push_back(edges[i]); + //adjacencylist.resize(edges.size()); + /* + for(uint64_t i = 0; i < edges.size(); ++i){ + adjacencylist.push_back(morphstore::Edge(edges[i])); } - */ - + + for(const auto& e : edges){ + if(e.getProperty().first != ""){ + std::cout << e.getProperty().first << ": " << e.getProperty().second << std::endl; + } + } + */ this->adjacencylist = edges; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fd0a412d..125ec463 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,6 +6,6 @@ add_subdirectory( core/storage ) add_subdirectory( core/utils ) add_subdirectory( core/storage/graph/adjacencylist ) -#add_subdirectory( core/storage/graph/csr ) +add_subdirectory( core/storage/graph/csr ) -add_subdirectory(vector) \ No newline at end of file +add_subdirectory(vector) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index c55c883c..cbbaad58 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -40,8 +40,8 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -70,6 +70,7 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ + g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; @@ -81,10 +82,13 @@ int main( void ){ g1->print_vertex_by_id(2000000); g1->print_vertex_by_id(3000000); + g1->print_vertex_by_id(1035174); + g1->print_neighbors_of_vertex(1035174); + /* std::unique_ptr bfs = std::make_unique(g1); bfs->do_measurements(); */ return 0; -} \ No newline at end of file +} diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index a52c8351..9ef9a025 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -39,8 +39,8 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -62,16 +62,19 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - /* - // test vertices: - g1->print_vertex_by_id(100454); - g1->print_vertex_by_id(100450); - g1->print_vertex_by_id(100168); - g1->print_vertex_by_id(2000100); - */ // calculate size of social graph - //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + std::cout << "Size of social network: " << g1->get_size_of_graph() << " Bytes\n"; + + g1->print_vertex_by_id(0); + g1->print_vertex_by_id(10000); + g1->print_vertex_by_id(500000); + g1->print_vertex_by_id(1000000); + g1->print_vertex_by_id(2000000); + g1->print_vertex_by_id(3000000); + + g1->print_vertex_by_id(1035174); + g1->print_neighbors_of_vertex(1035174); // BFS TEST: /* @@ -89,4 +92,4 @@ int main( void ){ */ return 0; -} \ No newline at end of file +} From 6818d80594e4af84adf837ac52de6f9f0229043a Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 23:02:26 +0200 Subject: [PATCH 082/216] tidy up --- include/core/storage/graph/edge/edge.h | 20 ----------- include/core/storage/graph/ldbc_import.h | 13 ------- .../graph/vertex/adjacencylist_vertex.h | 12 ------- include/core/storage/graph/vertex/vertex.h | 6 ---- .../ldbc_graph_adjacencylist.cpp | 13 ++----- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 35 +++++++------------ 6 files changed, 14 insertions(+), 85 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 5a38c23d..850871c7 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -26,7 +26,6 @@ #include #include -#include #include namespace morphstore{ @@ -55,16 +54,6 @@ namespace morphstore{ setProperty(prop); } - /* - // Copy constructor - Edge(const Edge& edge){ - setSourceId(edge.sourceID); - setTargetId(edge.targetID); - setRelation(edge.relation); - setProperty(edge.property); - } - */ - // this is needed for csr when doing edge_array[offset] = edge... Edge& operator= (const Edge &edge){ // self-assignment guard @@ -112,17 +101,8 @@ namespace morphstore{ } void setProperty(const std::pair prop) { - // first check if there is any key value data, otherwise problems with segfaults - - if(prop.first != "" && prop.second != ""){ - Edge::property.first = prop.first; - Edge::property.second = prop.second; - } - - /* Edge::property.first = prop.first; Edge::property.second = prop.second; - */ } // function for sorting algorithms in the ldbc-importer: diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index d825b420..e0af8b9f 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -624,19 +624,6 @@ namespace morphstore{ //std::cout << "--> done" << std::endl; } - - void print_vertex_rel_lookup(){ - std::cout << "VertexRealtionsLookup: " << std::endl; - - for(auto &rel: vertexRelationsLookup){ - for(auto& edge : rel.second){ - if(edge.getProperty().first != ""){ - std::cout << "Vertex-id: " << rel.first << " | "; - std::cout << "Edge-Property: " << edge.getProperty().first << ": " << edge.getProperty().second << "\n"; - } - } - } - } }; } diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index fade02b9..6e8d43a2 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -53,18 +53,6 @@ namespace morphstore{ // add edges to vertexs' adjacencylist void add_edges(std::vector edges) override { - //adjacencylist.resize(edges.size()); - /* - for(uint64_t i = 0; i < edges.size(); ++i){ - adjacencylist.push_back(morphstore::Edge(edges[i])); - } - - for(const auto& e : edges){ - if(e.getProperty().first != ""){ - std::cout << e.getProperty().first << ": " << e.getProperty().second << std::endl; - } - } - */ this->adjacencylist = edges; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 6060e771..dfd56a69 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -68,12 +68,6 @@ namespace morphstore{ // function that adds a single property key-value pair to vertex void add_property(const std::pair& property){ - /* - auto it = properties.find(property.first); - if(it != properties.end()){ - it->second = property.second; - } - */ this->properties[property.first] = std::move(property.second); } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index cbbaad58..26b1a8af 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -40,8 +40,8 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -70,18 +70,9 @@ int main( void ){ g1->print_vertex_by_id(100168); g1->print_vertex_by_id(2000100); */ - g1->print_neighbors_of_vertex(100449); // calculate size of social graph //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; - - g1->print_vertex_by_id(0); - g1->print_vertex_by_id(10000); - g1->print_vertex_by_id(500000); - g1->print_vertex_by_id(1000000); - g1->print_vertex_by_id(2000000); - g1->print_vertex_by_id(3000000); - g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 9ef9a025..b6979a7b 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -39,8 +39,8 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" - std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - // std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); + std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -62,34 +62,23 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; + /* + // test vertices: + g1->print_vertex_by_id(100454); + g1->print_vertex_by_id(100450); + g1->print_vertex_by_id(100168); + g1->print_vertex_by_id(2000100); + */ // calculate size of social graph - std::cout << "Size of social network: " << g1->get_size_of_graph() << " Bytes\n"; - - g1->print_vertex_by_id(0); - g1->print_vertex_by_id(10000); - g1->print_vertex_by_id(500000); - g1->print_vertex_by_id(1000000); - g1->print_vertex_by_id(2000000); - g1->print_vertex_by_id(3000000); - + //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); - // BFS TEST: /* std::unique_ptr bfs = std::make_unique(g1); - - // start measuring bfs time: - auto startBFSTime = std::chrono::high_resolution_clock::now(); - - // actual algorithm - uint64_t exploredV = bfs->doBFS(10000); - - // measuring time: - auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); - */ + bfs->do_measurements(); + */ return 0; } From 3cdac5737570817c2d45a7e5166f16c3d501bc69 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 23:09:43 +0200 Subject: [PATCH 083/216] removed segfault error --- include/core/storage/graph/edge/edge.h | 7 +++++-- .../adjacencylist/ldbc_graph_adjacencylist.cpp | 14 +++++--------- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 14 +++++--------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 850871c7..7ecc45f1 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -101,8 +101,11 @@ namespace morphstore{ } void setProperty(const std::pair prop) { - Edge::property.first = prop.first; - Edge::property.second = prop.second; + // first check if there is any key value data, otherwise problems with segfaults + if(prop.first != "" && prop.second != ""){ + Edge::property.first = prop.first; + Edge::property.second = prop.second; + } } // function for sorting algorithms in the ldbc-importer: diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 26b1a8af..63976ef8 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -63,18 +63,14 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - /* - // test vertices: - g1->print_vertex_by_id(100454); - g1->print_vertex_by_id(100450); - g1->print_vertex_by_id(100168); - g1->print_vertex_by_id(2000100); - */ - // calculate size of social graph - //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + + /* Test Vertex, which contains edges with properties: + * g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); + */ /* std::unique_ptr bfs = std::make_unique(g1); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index b6979a7b..1fe7b480 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -62,18 +62,14 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - /* - // test vertices: - g1->print_vertex_by_id(100454); - g1->print_vertex_by_id(100450); - g1->print_vertex_by_id(100168); - g1->print_vertex_by_id(2000100); - */ - // calculate size of social graph - //std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; + + /* Test Vertex, which contains edges with properties: + * g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); + */ /* std::unique_ptr bfs = std::make_unique(g1); From 1f1dd859fea82b25e6dc4a9ac570d53b3dfc186b Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 23:14:24 +0200 Subject: [PATCH 084/216] comment stuff --- include/core/storage/graph/formats/csr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 2a315cd6..aff1462b 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -93,7 +93,7 @@ namespace morphstore{ ++offset; } - // to avoid segfualt: + // to avoid buffer overflow: if(sourceID < getNumberVertices()-1){ node_array[sourceID+1] = nextOffset; } From 74e14b66b92e786762840261fb9030493cf0990d Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Wed, 4 Sep 2019 23:18:48 +0200 Subject: [PATCH 085/216] tidy -up 2 --- include/core/storage/graph/formats/csr.h | 2 +- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 3 --- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 3 --- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 2a315cd6..aff1462b 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -93,7 +93,7 @@ namespace morphstore{ ++offset; } - // to avoid segfualt: + // to avoid buffer overflow: if(sourceID < getNumberVertices()-1){ node_array[sourceID+1] = nextOffset; } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 63976ef8..1540bc86 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -63,9 +63,6 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - // calculate size of social graph - std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; - /* Test Vertex, which contains edges with properties: * g1->print_vertex_by_id(1035174); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 1fe7b480..2361c4ee 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -62,9 +62,6 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - // calculate size of social graph - std::cout << "Size of social network: " << socialGraph.get_size_of_graph() << " Bytes\n"; - /* Test Vertex, which contains edges with properties: * g1->print_vertex_by_id(1035174); From 9cfc3377158f8682b931131ce9ab7662a39ea83a Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 6 Sep 2019 00:01:42 +0200 Subject: [PATCH 086/216] some optimizations --- .../storage/graph/formats/adjacencylist.h | 8 +++--- include/core/storage/graph/formats/csr.h | 11 ++++---- include/core/storage/graph/graph.h | 12 ++++---- include/core/storage/graph/ldbc_import.h | 28 +++++++++---------- .../graph/vertex/adjacencylist_vertex.h | 2 +- .../core/storage/graph/vertex/csr_vertex.h | 2 +- include/core/storage/graph/vertex/vertex.h | 12 ++++---- .../ldbc_graph_adjacencylist.cpp | 14 +++++++--- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 12 ++++++-- 9 files changed, 56 insertions(+), 45 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 20e9b036..fb320869 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -54,7 +54,7 @@ namespace morphstore{ } // adding a vertex with its properties - uint64_t add_vertex_with_properties(const std::unordered_map &props) override { + uint64_t add_vertex_with_properties(const std::unordered_map props) override { std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; @@ -62,7 +62,7 @@ namespace morphstore{ } // function to add a single property to vertex - void add_property_to_vertex(uint64_t id, const std::pair &property) override { + void add_property_to_vertex(uint64_t id, const std::pair property) override { if (exist_id(id)) { vertices[id]->add_property(property); } else { @@ -71,7 +71,7 @@ namespace morphstore{ } // adding entity to vertex - void add_entity_to_vertex(const uint64_t id, unsigned short int entity) override { + void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) override { if (exist_id(id)) { vertices[id]->setEntity(entity); } else { @@ -89,7 +89,7 @@ namespace morphstore{ } // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(uint64_t sourceID, std::vector relations) override { + void add_edges(uint64_t sourceID, const std::vector relations) override { if (exist_id(sourceID)) { if (relations.size() != 0) { vertices[sourceID]->add_edges(relations); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index aff1462b..ac6e9162 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -36,8 +36,7 @@ namespace morphstore{ * node array: index is vertex-id; array cell contains offset in edge_array * edge array: every cell contains pointer to edge object of vertex */ - // TODO: construct a graph-topology struct ? - // TODO: free memory in destructor + // TODO: free memory in destructor ? uint64_t* node_array = nullptr; Edge* edge_array = nullptr; @@ -68,7 +67,7 @@ namespace morphstore{ } // adding a vertex with its properties - uint64_t add_vertex_with_properties(const std::unordered_map& props ) override { + uint64_t add_vertex_with_properties(const std::unordered_map props ) override { std::shared_ptr v = std::make_shared(); v->setProperties(props); vertices[v->getID()] = v; @@ -84,7 +83,7 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of neighbors - void add_edges(uint64_t sourceID, std::vector relations) override { + void add_edges(uint64_t sourceID, const std::vector relations) override { uint64_t offset = node_array[sourceID]; uint64_t nextOffset = offset + relations.size(); @@ -100,7 +99,7 @@ namespace morphstore{ } // function to add a single property to vertex - void add_property_to_vertex(uint64_t id, const std::pair& property) override { + void add_property_to_vertex(uint64_t id, const std::pair property) override { if(exist_id(id)){ vertices[id]->add_property(property); }else{ @@ -109,7 +108,7 @@ namespace morphstore{ } // adding entity to vertex - void add_entity_to_vertex(const uint64_t id, unsigned short int entity) override { + void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) override { if(exist_id(id)){ vertices[id]->setEntity(entity); }else{ diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 21616629..09800a70 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -58,7 +58,7 @@ namespace morphstore{ return entityDictionary; } - void setEntityDictionary(const std::map &ent) { + void setEntityDictionary(const std::map& ent) { this->entityDictionary = ent; } @@ -66,7 +66,7 @@ namespace morphstore{ return relationDictionary; } - void setRelationDictionary(const std::map &rel) { + void setRelationDictionary(const std::map& rel) { this->relationDictionary = rel; } @@ -120,11 +120,11 @@ namespace morphstore{ virtual storageFormat getStorageFormat() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; virtual void add_vertex() = 0; - virtual uint64_t add_vertex_with_properties(const std::unordered_map& props ) = 0; - virtual void add_property_to_vertex(uint64_t id, const std::pair& property) = 0; - virtual void add_entity_to_vertex(const uint64_t id, unsigned short int entity) = 0; + virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; + virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; + virtual void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, std::vector relations) = 0; + virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; // for debugging diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index e0af8b9f..b53c8e88 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -100,7 +100,7 @@ namespace morphstore{ // this function reads the vertices-files and creates vertices in a graph // + creates the entityLookup (number to string) for the graph - void generate_vertices(std::unique_ptr& graph) { + void generate_vertices(Graph& graph) { if (!verticesPaths.empty()) { //std::cout << "(1/2) Generating LDBC-Vertices ..."; @@ -184,9 +184,9 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties - uint64_t systemID = graph->add_vertex_with_properties(properties); + uint64_t systemID = graph.add_vertex_with_properties(properties); // add entity number to vertex - graph->add_entity_to_vertex(systemID, entityNumber); + graph.add_entity_to_vertex(systemID, entityNumber); // map entity and ldbc id to system generated id globalIdLookupMap.insert({{entity, ldbcID}, systemID}); //----------------------------------------------------- @@ -203,9 +203,10 @@ namespace morphstore{ // insert entity-number with string into map entitiesLookup.insert(std::make_pair(entityNumber, entity)); ++entityNumber; + attributes.clear(); } // graph gets full entity-list here: - graph->setEntityDictionary(entitiesLookup); + graph.setEntityDictionary(entitiesLookup); } } @@ -404,7 +405,7 @@ namespace morphstore{ // this function reads the relation-files and fills the intermediate: vertexRelationLookup // + creates the relationLookup (number to string) for the graph - void fill_vertexRelationsLookup(std::unique_ptr& graph){ + void fill_vertexRelationsLookup(Graph& graph){ if(!relationsPaths.empty()) { //std::cout << "(2/2) Generating LDBC-Edges ..."; @@ -487,8 +488,8 @@ namespace morphstore{ } // iterate through multiValue map and assign property to vertex for(const auto &pair : multiValueAttr){ - const std::pair& keyValuePair = {propertyKey, pair.second}; - graph->add_property_to_vertex(pair.first, keyValuePair); + //const std::pair keyValuePair = {propertyKey, pair.second}; + graph.add_property_to_vertex(pair.first, {propertyKey, pair.second}); } } @@ -542,7 +543,6 @@ namespace morphstore{ // insert relation into vertexRealtionsLookup: vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); - }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); @@ -572,7 +572,7 @@ namespace morphstore{ } // graph gets full relation-list here: - graph->setRelationDictionary(relationsLookup); + graph.setRelationDictionary(relationsLookup); } } @@ -586,20 +586,20 @@ namespace morphstore{ } // this function writes the actual data from the intermediate vertexRelationsLookup into the graph - void generate_edges(std::unique_ptr& graph){ + void generate_edges(Graph& graph){ // firstly, sorting the intermediates with their target IDs ASC sort_VertexRelationsLookup(); - uint64_t graphSize = graph->getNumberVertices(); + uint64_t graphSize = graph.getNumberVertices(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ // add edge data: - graph->add_edges(vertexID, vertexRelationsLookup[vertexID]); + graph.add_edges(vertexID, vertexRelationsLookup[vertexID]); } } // MAIN import function: see steps in comments - void import(std::unique_ptr& graph) { + void import(Graph& graph) { //std::cout << "Importing LDBC-files into graph ... "; //std::cout.flush(); @@ -608,7 +608,7 @@ namespace morphstore{ uint64_t numberEdges = get_total_number_edges(); // (2) allocate graph memory - graph->allocate_graph_structure(numberVertices, numberEdges); + graph.allocate_graph_structure(numberVertices, numberEdges); // (3) generate vertices generate_vertices(graph); diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 6e8d43a2..009bb22d 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -52,7 +52,7 @@ namespace morphstore{ } // add edges to vertexs' adjacencylist - void add_edges(std::vector edges) override { + void add_edges(const std::vector edges) override { this->adjacencylist = edges; } diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index f7b769da..4d0a324c 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -43,7 +43,7 @@ namespace morphstore{ } // pure function -> no functionality - void add_edges(std::vector edges) override { + void add_edges(const std::vector edges) override { std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index dfd56a69..7b15da7a 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -54,7 +54,7 @@ namespace morphstore{ return entity; } - void setEntity(unsigned short e) { + void setEntity(const unsigned short e) { Vertex::entity = e; } @@ -62,18 +62,18 @@ namespace morphstore{ return properties; } - void setProperties(const std::unordered_map &props) { + void setProperties(const std::unordered_map props) { Vertex::properties = props; } // function that adds a single property key-value pair to vertex - void add_property(const std::pair& property){ - this->properties[property.first] = std::move(property.second); + void add_property(const std::pair property){ + this->properties[property.first] = property.second;//std::move(property.second); } // ----------------- (pure) virtual functions ----------------- - virtual void add_edges(std::vector edges) = 0; + virtual void add_edges(const std::vector edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void print_neighbors() = 0; virtual size_t get_size_of_vertex() = 0; @@ -91,7 +91,7 @@ namespace morphstore{ // ----------------- DEBUGGING ----------------- void print_properties() { - for (const auto &entry : properties) { + for (const auto entry : properties) { std::cout << "{" << entry.first << ": " << entry.second << "}"; } std::cout << "\n"; diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 1540bc86..4404e11d 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -40,8 +40,9 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + // NEVER FORGET THE LAST / in address!!! // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -50,7 +51,7 @@ int main( void ){ auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time // generate vertices & edges from LDBC files and insert into graph - ldbcImport->import(g1); + ldbcImport->import(*g1); // measuring time: auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -60,15 +61,20 @@ int main( void ){ size_t size = g1->get_size_of_graph(); std::cout << "Size: " << size << " bytes\n"; - //g1->statistics(); + g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - /* Test Vertex, which contains edges with properties: + /* Test Vertex, which contains edges with properties (SERVER): * g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); */ + /* Test Vertex, which contains edges with properties (MY PC):*/ + g1->print_vertex_by_id(100449); + g1->print_neighbors_of_vertex(100449); + + /* std::unique_ptr bfs = std::make_unique(g1); bfs->do_measurements(); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 2361c4ee..b0368274 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -39,8 +39,9 @@ int main( void ){ */ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + // NEVER FORGET THE LAST / in address!!! // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique(("/opt/ldbc_snb_datagen-0.2.8/social_network/")); + std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -49,7 +50,7 @@ int main( void ){ auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time // generate vertices & edges from LDBC files and insert into graph - ldbcImport->import(g1); + ldbcImport->import(*g1); // measuring time: auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time @@ -62,12 +63,17 @@ int main( void ){ //g1->statistics(); std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - /* Test Vertex, which contains edges with properties: + /* Test Vertex, which contains edges with properties (SERVER): * g1->print_vertex_by_id(1035174); g1->print_neighbors_of_vertex(1035174); */ + /* Test Vertex, which contains edges with properties (MY PC):*/ + g1->print_vertex_by_id(100449); + g1->print_neighbors_of_vertex(100449); + + /* std::unique_ptr bfs = std::make_unique(g1); bfs->do_measurements(); From 31ffcf4f245cf6aae089989c6f3e11bc0bd18204 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 6 Sep 2019 00:15:18 +0200 Subject: [PATCH 087/216] server-side tidy up --- .../adjacencylist/ldbc_graph_adjacencylist.cpp | 18 +++++++++--------- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 4404e11d..7fb219aa 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -39,10 +39,10 @@ int main( void ){ std::cout << "\n"; */ - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" // NEVER FORGET THE LAST / in address!!! - // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -65,15 +65,15 @@ int main( void ){ std::cout << "Import: " << elapsedImportTime << " millisec.\n"; /* Test Vertex, which contains edges with properties (SERVER): - * - g1->print_vertex_by_id(1035174); - g1->print_neighbors_of_vertex(1035174); - */ + */ + // g1->print_vertex_by_id(1035174); + // g1->print_neighbors_of_vertex(1035174); + - /* Test Vertex, which contains edges with properties (MY PC):*/ + /* Test Vertex, which contains edges with properties (MY PC): g1->print_vertex_by_id(100449); g1->print_neighbors_of_vertex(100449); - + */ /* std::unique_ptr bfs = std::make_unique(g1); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index b0368274..2f476247 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -38,10 +38,10 @@ int main( void ){ std::cout << "\n"; */ - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/social_network/" + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" // NEVER FORGET THE LAST / in address!!! - // std::unique_ptr ldbcImport = std::make_unique(("/home/pfeiffer/social_network/")); - std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); @@ -64,15 +64,15 @@ int main( void ){ std::cout << "Import: " << elapsedImportTime << " millisec.\n"; /* Test Vertex, which contains edges with properties (SERVER): - * - g1->print_vertex_by_id(1035174); - g1->print_neighbors_of_vertex(1035174); - */ + */ + // g1->print_vertex_by_id(1035174); + // g1->print_neighbors_of_vertex(1035174); + - /* Test Vertex, which contains edges with properties (MY PC):*/ + /* Test Vertex, which contains edges with properties (MY PC): g1->print_vertex_by_id(100449); g1->print_neighbors_of_vertex(100449); - + */ /* std::unique_ptr bfs = std::make_unique(g1); From 9c7e70da9c5e053bee3d1b680b33464588b278ae Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 1 Nov 2019 13:55:25 +0100 Subject: [PATCH 088/216] code push from db server --- include/core/operators/graph/bfs_naive.h | 119 +++++----- .../storage/graph/formats/adjacencylist.h | 33 ++- include/core/storage/graph/formats/csr.h | 85 +++++-- include/core/storage/graph/graph.h | 51 ++++- include/core/storage/graph/graph.h.save | 211 ++++++++++++++++++ include/core/storage/graph/ldbc_import.h | 12 +- .../graph/vertex/adjacencylist_vertex.h | 17 +- .../core/storage/graph/vertex/csr_vertex.h | 5 +- include/core/storage/graph/vertex/vertex.h | 2 +- .../ldbc_graph_adjacencylist.cpp | 35 ++- .../ldbc_graph_adjacencylist.cpp.save | 83 +++++++ .../core/storage/graph/csr/ldbc_graph_csr.cpp | 35 ++- 12 files changed, 570 insertions(+), 118 deletions(-) create mode 100644 include/core/storage/graph/graph.h.save create mode 100644 test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/bfs_naive.h index 74cb6f7e..15a2073c 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/bfs_naive.h @@ -37,17 +37,16 @@ namespace morphstore{ std::unique_ptr graph; uint64_t graphSize; // Create a "visited" array (true or false) to keep track of if we visited a vertex. - std::vector visited = { false }; + //std::vector visited = { false }; //std::vector layer; // Create a queue for the nodes we visit. - std::queue queue; - - public: + + public: // constructor with smart pointer to graph as parameter BFS(std::unique_ptr& g) : graph(std::move(g)){ graphSize = graph->getNumberVertices(); - visited.resize(graphSize); + //visited.resize(graphSize); //layer.resize(graphSize); } @@ -57,95 +56,109 @@ namespace morphstore{ // actual BFS (naive) algorithm: takes the start-node id and returns the number of explored vertices uint64_t do_BFS(uint64_t startVertex){ + std::vector frontier; + std::vector next; + std::vector visited(graphSize, false); + + // debug: + //int layer = 0; + //int layerVertices = 0; + + // set every entry in visited array back to { false } + //clear_visited_array(); uint64_t exploredVertices = 0; - queue.push(startVertex); + frontier.push_back(startVertex); visited[startVertex] = true; //layer[startVertex] = 0; - while(!queue.empty()){ - uint64_t currentVertex = queue.front(); - queue.pop(); - - //std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; - - // Loop through all of neighbors of current vertex - for(uint64_t i = 0; i < graph->get_neighbors_ids(currentVertex).size(); ++i){ - uint64_t neighbor = graph->get_neighbors_ids(currentVertex)[i]; - - // check if neighbor has been visited, if not -> put into queue and mark as visit = true - if(!visited[neighbor]){ - queue.push(neighbor); - //layer[neighbor] = layer[currentVertex] +1; - visited[neighbor] = true; - ++exploredVertices; + while(!frontier.empty()){ + // Loop through current layer of vertices in the frontier + for(uint64_t i = 0; i < frontier.size(); ++i){ + uint64_t currentVertex = frontier[i]; + std::vector neighbors = graph->get_neighbors_ids(currentVertex); + // Loop through all of neighbors of current vertex + for(uint64_t j = 0; j < neighbors.size(); ++j){ + // check if neighbor has been visited, if not -> put into queue and mark as visit = true + if(!visited[neighbors[j]]){ + next.push_back(neighbors[j]); + //layer[neighbor] = layer[currentVertex] +1; + visited[neighbors[j]] = true; + ++exploredVertices; + //++layerVertices; + } } } + //++layer; + //std::cout << "Explored layer " << layer << " -> " << layerVertices << std::endl; + //layerVertices = 0; + // swap frontier with next + frontier.swap(next); + // clear next: swap with an empty container is much faster + std::vector().swap(next); + + //std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; } return exploredVertices; - } + } - // this function sets every cell to false in visited array - void clear_visited_array(){ - std::fill(visited.begin(), visited.end(), false); - } + // function that measures the number of explored vertices and TIME: + // results are written into a file + // parameter cycle means the ith vertex (modulo) + void do_measurements(uint64_t cycle, std::string pathToFile){ - // function that measures for every vertex the number and time of explored vertices in BFS - // writes results to local file for further analysis - void do_measurements(){ + // list of measurement candidates: the parameter means the ith vertex in total + std::vector candidates = get_list_of_every_ith_vertex(cycle); // Intermediate data structure: - // size = graphSize*2, because we sequentially store both results (exploredVertices, needed Time) for every vertex - uint64_t* results = (uint64_t *) malloc(graphSize * 2 * sizeof(uint64_t)); + // size = candidatesVector size*2, because we sequentially store both results (exploredVertices, needed Time) for every vertex + std::vector> results; + results.reserve(candidates.size()); + - for(uint64_t i = 0; i < graphSize; ++i){ + for(uint64_t i = 0; i < candidates.size(); ++i){ // start measuring bfs time: auto startBFSTime = std::chrono::high_resolution_clock::now(); - uint64_t exploredVertices = do_BFS(i); + uint64_t exploredVertices = do_BFS(candidates[i]); auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); - // set every entry in visited array back to { false } - clear_visited_array(); - // write to intermediate array: - results[i*2] = exploredVertices; - results[i*2+1] = elapsedBFSTime; - - if(i % 1000 == 0) std::cout << "BFS" << i << " / " << graphSize << std::endl; + results.push_back({exploredVertices, elapsedBFSTime}); } // WRITE INTERMEDIATES TO FILE: std::ofstream fs; std::stringstream ss; - std::string filename = "/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv"; + std::string filename = pathToFile; // open file for writing and delete existing stuff: fs.open(filename, std::fstream::out | std::ofstream::trunc); - for(uint64_t j = 0; j < graphSize*2; ++j){ - ss << results[j] << "," << results[j+1] << "\n"; + for(uint64_t j = 0; j < results.size(); ++j){ + ss << results[j].first << "," << results[j].second << "\n"; ++j; } fs << ss.str() ; fs.close(); + } - /* - // NEW APPROACH - auto myfile = std::fstream("/home/tim/Documents/TUD/(8) Informatik SS 2019/MorphStore/bfs_measurements.csv", std::ios::out | std::ios::binary); - auto fileSize = graphSize * 2 * sizeof(uint64_t); - myfile.write((char*)&results[0], fileSize); - myfile.close(); - */ - - delete [] results; + // function which returns a list of every ith vertex which is sorted by degree DESC + std::vector< uint64_t > get_list_of_every_ith_vertex(uint64_t cycle){ + std::vector< uint64_t > measurementCandidates; + std::vector< std::pair > totalListOfVertices = graph->get_list_of_degree_DESC(); + for(uint64_t i = 0; i < totalListOfVertices.size(); i = i + cycle){ + measurementCandidates.push_back(totalListOfVertices[i].first); + } + return measurementCandidates; } }; } #endif //MORPHSTORE_BFS_NAIVE_H + diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index fb320869..8d6d718a 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -113,13 +113,34 @@ namespace morphstore{ return vertices.at(id)->get_neighbors_ids(); } - size_t get_size_of_graph() override { - size_t size = 0; - size += sizeof(std::unordered_map>); - for(auto& it : vertices){ - size += it.second->get_size_of_vertex(); + std::pair get_size_of_graph() override { + std::pair index_data_size; + size_t data_size = 0; + size_t index_size = 0; + + // lookup dicts: entity dict + relation dict. + index_size += 2 * sizeof(std::map); + for(auto& ent : entityDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(ent.second.length()); } - return size; + for(auto& rel : relationDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(rel.second.length()); + } + + // container for indexes: + index_size += sizeof(std::unordered_map>); + for(auto& it : vertices){ + // index size of vertex: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += it.second->get_data_size_of_vertex(); + } + + index_data_size = {index_size, data_size}; + + return index_data_size; } }; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index ac6e9162..e0a8d291 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -39,6 +39,9 @@ namespace morphstore{ // TODO: free memory in destructor ? uint64_t* node_array = nullptr; Edge* edge_array = nullptr; + + // BFS OPTIMIZATION APPROACH: + uint64_t* edge_targetID_array = nullptr; public: @@ -56,6 +59,9 @@ namespace morphstore{ node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); edge_array = (Edge*) malloc(numberEdges * sizeof(Edge)); + // BFS OPTIMIZATION APPROACH: + edge_targetID_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); + // init node array: node_array[0] = 0; } @@ -89,7 +95,11 @@ namespace morphstore{ for(const auto & edge : relations){ edge_array[offset] = edge; - ++offset; + + // BFS OPTIMIZATION APPROACH: + edge_targetID_array[offset] = edge.getTargetId(); + + ++offset; } // to avoid buffer overflow: @@ -119,8 +129,16 @@ namespace morphstore{ // get number of edges of vertex with id uint64_t get_degree(uint64_t id) override { uint64_t offset = node_array[id]; - uint64_t nextOffset = node_array[id+1]; + // special case: last vertex id has no next offset + uint64_t nextOffset; + if(id == getNumberVertices() -1){ + nextOffset = getNumberEdges(); + }else{ + nextOffset = node_array[id+1]; + } + if(offset == nextOffset) return 0; uint64_t numberEdges = nextOffset - offset; + //if(id == 1030169) std::cout << "edges: " << numberEdges << " - offset: " << offset << " - nextOffset: " << nextOffset << std::endl; return numberEdges; } @@ -135,32 +153,57 @@ namespace morphstore{ // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { - std::vector neighbors; - uint64_t offset = node_array[id]; - uint64_t numberEdges = get_degree(id); + std::vector neighbors; + uint64_t offset = node_array[id]; + uint64_t numberEdges = get_degree(id); + /* + for(uint64_t i = offset; i < offset+numberEdges; ++i){ + neighbors.push_back(edge_array[i].getTargetId()); + }*/ + + // BFS OPTIMIZATION APPROACH: + /*Problem is that it does not put the second element into the vector -> alternative to insert() ???*/ + if( offset < getNumberEdges()){ + neighbors.insert(neighbors.end(), edge_targetID_array+offset, edge_targetID_array+offset+numberEdges); + } + //std::vector neighbors(edge_targetID_array+offset, edge_targetID_array+offset+numberEdges-1); + return neighbors; + } - for(uint64_t i = offset; i < offset+numberEdges; ++i){ - neighbors.push_back(edge_array[i].getTargetId()); + std::pair get_size_of_graph() override { + std::pair index_data_size; + size_t data_size = 0; + size_t index_size = 0; + + // lookup dicts: entity dict + relation dict. + index_size += 2 * sizeof(std::map); + for(auto& ent : entityDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(ent.second.length()); + } + for(auto& rel : relationDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(rel.second.length()); + } + + // container for indexes: + index_size += sizeof(std::unordered_map>); + for(auto& it : vertices){ + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + data_size += it.second->get_data_size_of_vertex(); } - return neighbors; - } - size_t get_size_of_graph() override { - size_t size = 0; // pointer to arrays: - size += sizeof(uint64_t*) + sizeof(Edge*); - // vertices: - size += sizeof(uint64_t) * getNumberVertices(); - // edges: + index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); + // edges array values: for(uint64_t i = 0; i < getNumberEdges(); i++){ - size += edge_array[i].size_in_bytes(); - } - // vertex map wth actual data: - for(auto& it : vertices){ - size += it.second->get_size_of_vertex(); + index_size += sizeof(uint64_t); // node_array with offsets + data_size += edge_array[i].size_in_bytes(); // edge value arrray with object } - return size; + index_data_size = {index_size, data_size}; + + return index_data_size; } }; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 09800a70..939d35e4 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -31,7 +31,12 @@ #include #include #include +#include +#include #include +#include +#include + namespace morphstore{ @@ -115,6 +120,48 @@ namespace morphstore{ return vertices[id]; } + // function that return a list of pair < vertex id, degree > DESC + std::vector> get_list_of_degree_DESC(){ + std::vector> vertexDegreeList; + vertexDegreeList.reserve(numberVertices); + // fill the vector with every vertex key and his degree + for(uint64_t i = 0; i < numberVertices; ++i){ + vertexDegreeList.push_back({i, this->get_degree(i)}); + } + // sort the vector on degree DESC + /*std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](auto &left, auto &right) { + return left.second > right.second; + });*/ + std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { + return left.second > right.second; + }); + + return vertexDegreeList; + } + + // function to measure graph characteristics (degree and count): + void measure_degree_count(std::string filePath){ + std::vector> verticesDegree = get_list_of_degree_DESC(); + // unordered map for mapping degree to count: + std::unordered_map results; + for(uint64_t i = 0; i < verticesDegree.size(); ++i){ + // increment count in results for a given degree: + results[verticesDegree[i].second]++; + } + // write to file: + std::ofstream fs; + std::stringstream ss; + // open file for writing and delete existing stuff: + fs.open(filePath, std::fstream::out | std::ofstream::trunc); + + for(auto const& m : results){ + ss << m.first << "," << m.second << "\n"; + } + fs << ss.str() ; + + fs.close(); + } + // -------------------- pure virtual functions -------------------- virtual storageFormat getStorageFormat() const = 0; @@ -127,9 +174,9 @@ namespace morphstore{ virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; - // for debugging + virtual std::pair get_size_of_graph() = 0; + // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; - virtual size_t get_size_of_graph() = 0; // -------------------- debugging functions -------------------- diff --git a/include/core/storage/graph/graph.h.save b/include/core/storage/graph/graph.h.save new file mode 100644 index 00000000..75a51f77 --- /dev/null +++ b/include/core/storage/graph/graph.h.save @@ -0,0 +1,211 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph.h + * @brief abstract graph class for any storage format --> CSR,ADJ + * @todo graph-size calculation!! +*/ + +#ifndef MORPHSTORE_GRAPH_H +#define MORPHSTORE_GRAPH_H + +#include "vertex/vertex.h" +#include "edge/edge.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace morphstore{ + + class Graph{ + + protected: + uint64_t numberVertices; + uint64_t numberEdges; + + // Data-structure for Vertex-Properties + std::unordered_map> vertices; + + // Lookup for entities and relations: number to string + std::map entityDictionary; + std::map relationDictionary; + + public: + + enum storageFormat {csr, adjacencylist }; + + // -------------------- Setters & Getters -------------------- + + const std::map &getEntityDictionary() const { + return entityDictionary; + } + + void setEntityDictionary(const std::map& ent) { + this->entityDictionary = ent; + } + + const std::map &getRelationDictionary() const { + return relationDictionary; + } + + void setRelationDictionary(const std::map& rel) { + this->relationDictionary = rel; + } + + uint64_t getNumberVertices() const { + return numberVertices; + } + + void setNumberVertices(uint64_t numV) { + Graph::numberVertices = numV; + } + + uint64_t getNumberEdges() const { + return numberEdges; + } + + void setNumberEdges(uint64_t numE) { + Graph::numberEdges = numE; + } + + std::string get_entity_by_number(unsigned short int e){ + if(entityDictionary.find( e ) != entityDictionary.end()){ + return entityDictionary.at(e); + }else{ + return "No Matching of entity-number in the database!"; + } + } + + std::string get_relation_by_number(unsigned short int re){ + if(relationDictionary.find( re ) != relationDictionary.end()){ + return relationDictionary.at(re); + }else{ + return "No Matching of relation-number in the database!"; + } + } + + // function to check if the vertex-ID is present or not (exists) + bool exist_id(const uint64_t id){ + if(vertices.find(id) == vertices.end()){ + return false; + } + return true; + } + + // function which returns a pointer to vertex by id + std::shared_ptr get_vertex_by_id(uint64_t id){ + return vertices[id]; + } + + // function that return a list of pair < vertex id, degree > DESC + std::vector> get_list_of_degree_DESC(){ + std::vector> vertexDegreeList; + vertexDegreeList.reserve(numberVertices); + // fill the vector with every vertex key and his degree + for(uint64_t i = 0; i < numberVertices; ++i){ + vertexDegreeList.push_back({i, this->get_degree(i)}); + } + // sort the vector on degree DESC + /*std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](auto &left, auto &right) { + return left.second > right.second; + });*/ + std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { + return left.second > right.second; + }); + + return vertexDegreeList; + } + + // function to measure graph characteristics (degree and count): + void measure_degree_count(std::string filePath){ + std::vector> verticesDegree = get_list_of_degree_DESC(); + std::cout << "Highest degree: " << verticesDegree[0].second << std::endl; + // unordered map for mapping degree to count: + std::unordered_map results; + for(uint64_t i = 0; i < verticesDegree.size(); ++i){ + // increment count in results for a given degree: + results[verticesDegree[i].second]++; + } + // write to file: + for(auto const & m : results){ + ; + } + } + + // -------------------- pure virtual functions -------------------- + + virtual storageFormat getStorageFormat() const = 0; + virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; + virtual void add_vertex() = 0; + virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; + virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; + virtual void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) = 0; + virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; + virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; + virtual uint64_t get_degree(uint64_t id) = 0; + virtual std::vector get_neighbors_ids(uint64_t id) = 0; + virtual std::pair get_size_of_graph() = 0; + // for debugging + virtual void print_neighbors_of_vertex(uint64_t id) = 0; + + // -------------------- debugging functions -------------------- + + void statistics(){ + std::cout << "---------------- Statistics ----------------" << std::endl; + std::cout << "Number of vertices: " << getNumberVertices() << std::endl; + std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; + std::cout << "--------------------------------------------" << std::endl; + } + + void print_vertex_by_id(uint64_t id) { + std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; + std::shared_ptr v = vertices[id]; + std::cout << "Vertex-ID: \t" << v->getID() << std::endl; + std::cout << "Entity: \t" << get_entity_by_number(v->getEntity()) << std::endl; + std::cout << "\n"; + std::cout << "Properties: "; + v->print_properties(); + std::cout << "#Edges: " << this->get_degree(v->getID()); + std::cout << "\n"; + std::cout << "-----------------------------------------------" << std::endl; + } + + void print_entity_relationship_dicts(){ + std::cout << "Entity-Dict: " << std::endl; + for(auto const& entry : entityDictionary){ + std::cout << entry.first << " -> " << entry.second << std::endl; + } + std::cout << "\n"; + + std::cout << "Relationship-Dict: " << std::endl; + for(auto const& rel : relationDictionary){ + std::cout << rel.first << " -> " << rel.second << std::endl; + } + } + + }; + +} + + +#endif //MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index b53c8e88..4e272180 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -462,7 +462,7 @@ namespace morphstore{ std::unordered_map multiValueAttr; uint64_t systemID; std::string value; - + for(size_t i = 0; i < fileSize; ++i){ if(buffer[i] == '\n'){ // get a row into string form buffer with start- and end-point @@ -477,9 +477,9 @@ namespace morphstore{ if(start == 0){ propertyKey = row.substr(row.find(delimiter) + 1); }else{ - // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) - systemID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); - value = row.substr(row.find(delimiter) + 1); + // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) + systemID = globalIdLookupMap[{fromEntity, row.substr(0, row.find(delimiter))}]; + value = row.substr(row.find(delimiter) + 1); multiValueAttr[systemID] = std::move(value); } @@ -497,7 +497,7 @@ namespace morphstore{ else{ isRelation = true; - + bool hasProperties = false; std::string propertyKey; uint64_t fromID, toID; @@ -612,7 +612,7 @@ namespace morphstore{ // (3) generate vertices generate_vertices(graph); - + // (4) read relations and write to intermediate results fill_vertexRelationsLookup(graph); diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 009bb22d..8d537d37 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -33,6 +33,9 @@ namespace morphstore{ protected: std::vector adjacencylist; + // BFS OPTIMIZATION APPROACH: + std::vector adjacencylistBFS; + public: // constructor with unique id generation AdjacencyListVertex(){ @@ -54,11 +57,16 @@ namespace morphstore{ // add edges to vertexs' adjacencylist void add_edges(const std::vector edges) override { this->adjacencylist = edges; + + // BFS OPTIMIZATION APPROACH: + for(auto edge : edges){ + adjacencylistBFS.push_back(edge.getTargetId()); + } } // function which returns the number of edges uint64_t get_number_edges() override { - return adjacencylist.size(); + return adjacencylist.size(); } void print_neighbors() override { @@ -70,18 +78,21 @@ namespace morphstore{ // function to return a vector of neighbor ids (for BFS) std::vector get_neighbors_ids() override { + /* std::vector neighbors; for(auto const& edge : adjacencylist){ neighbors.push_back(edge.getTargetId()); } return neighbors; + */ + // BFS OPTIMIZATION APPROACH: + return adjacencylistBFS; } - size_t get_size_of_vertex() override { + size_t get_data_size_of_vertex() override { size_t size = 0; size += sizeof(uint64_t); // id size += sizeof(unsigned short int); // entity - // properties: size += sizeof(std::unordered_map); for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index 4d0a324c..68fa8edc 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -51,9 +51,8 @@ namespace morphstore{ std::cout << " virtual print_neighbors - no usage: " << std::endl; } - size_t get_size_of_vertex() override { + size_t get_data_size_of_vertex() override { size_t size = 0; - size += sizeof(uint64_t); // id // properties: size += sizeof(std::unordered_map); for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ @@ -61,6 +60,8 @@ namespace morphstore{ } // entity: size += sizeof(unsigned short int); + // id + size += sizeof(uint64_t); return size; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 7b15da7a..8cbc30d5 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -76,7 +76,7 @@ namespace morphstore{ virtual void add_edges(const std::vector edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void print_neighbors() = 0; - virtual size_t get_size_of_vertex() = 0; + virtual size_t get_data_size_of_vertex() = 0; virtual uint64_t get_number_edges(){ return 0; diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index 7fb219aa..a863e044 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -41,44 +41,55 @@ int main( void ){ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" // NEVER FORGET THE LAST / in address!!! - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_10/"); // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); // start measuring import time: - auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + //auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time // generate vertices & edges from LDBC files and insert into graph ldbcImport->import(*g1); + // get some graph infos: + g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); + // measuring time: - auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + //auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + //auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); - // size of graph in bytes: - size_t size = g1->get_size_of_graph(); - std::cout << "Size: " << size << " bytes\n"; + //g1->statistics(); - g1->statistics(); - std::cout << "Import: " << elapsedImportTime << " millisec.\n"; + // size of graph in bytes: + //std::pair size = g1->get_size_of_graph(); + //std::cout << "index: " << size.first << " - data: " << size.second << std::endl; // size in bytes + //std::cout << elapsedImportTime << std::endl; // time in milli sec. /* Test Vertex, which contains edges with properties (SERVER): */ // g1->print_vertex_by_id(1035174); // g1->print_neighbors_of_vertex(1035174); - /* Test Vertex, which contains edges with properties (MY PC): g1->print_vertex_by_id(100449); g1->print_neighbors_of_vertex(100449); */ - /* + //g1->print_vertex_by_id(1033808); + + /*BFS single test: std::unique_ptr bfs = std::make_unique(g1); - bfs->do_measurements(); + auto startImportBFSTime = std::chrono::high_resolution_clock::now(); + uint64_t explored = bfs->do_BFS(1033808); + auto finishImportBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedImportBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportBFSTime - startImportBFSTime ).count(); + std::cout << explored << " -> " << elapsedImportBFSTime << std::endl; */ + //std::unique_ptr bfs = std::make_unique(g1); + //bfs->do_measurements(10000, "/home/pfeiffer/measurements/adjacency_list/bfs_SF10.csv"); + return 0; } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save new file mode 100644 index 00000000..2b4ccef6 --- /dev/null +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save @@ -0,0 +1,83 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file ldbc_graph_adjacency.cpp + * @brief Test for generating social network graph in ADJ_LIST format + * @todo + */ + +#include +#include +#include + +#include // for high_resolution_clock +#include + +int main( void ){ + + // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- + /* + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; + */ + + // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" + // NEVER FORGET THE LAST / in address!!! + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); + + // Graph init: + std::unique_ptr g1 = std::make_unique(); + + // start measuring import time: + auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + + // generate vertices & edges from LDBC files and insert into graph + ldbcImport->import(*g1); + + // measuring time: + auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + + // size of graph in bytes: + size_t size = g1->get_size_of_graph(); + std::cout << "Size: " << size << " bytes\n"; + + g1->statistics(); + std::cout << "Import: " << elapsedImportTime << " millisec.\n"; + + /* Test Vertex, which contains edges with properties (SERVER): + */ + // g1->print_vertex_by_id(1035174); + // g1->print_neighbors_of_vertex(1035174); + + + /* Test Vertex, which contains edges with properties (MY PC): + g1->print_vertex_by_id(100449); + g1->print_neighbors_of_vertex(100449); + */ + + + std::unique_ptr bfs = std::make_unique(g1); + bfs->do_measurements(); + + return 0; +} diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 2f476247..65c2965b 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -40,44 +40,55 @@ int main( void ){ // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" // NEVER FORGET THE LAST / in address!!! - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_10/"); // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); // start measuring import time: - auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + //auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time // generate vertices & edges from LDBC files and insert into graph ldbcImport->import(*g1); - // measuring time: - auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + // get some graph infos: + g1->measure_degree_count("/home/pfeiffer/measurements/csr/graph_degree_count_SF10.csv"); - // size of graph in bytes: - size_t size = g1->get_size_of_graph(); - std::cout << "Size: " << size << " bytes\n"; + // measuring time: + //auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time + //auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); //g1->statistics(); - std::cout << "Import: " << elapsedImportTime << " millisec.\n"; + + // size of graph in bytes: + //std::pair size = g1->get_size_of_graph(); + //std::cout << "index: " << size.first << " - data: " << size.second << std::endl; // size in bytes + //std::cout << elapsedImportTime << std::endl; // time in milli sec. /* Test Vertex, which contains edges with properties (SERVER): */ // g1->print_vertex_by_id(1035174); // g1->print_neighbors_of_vertex(1035174); - /* Test Vertex, which contains edges with properties (MY PC): g1->print_vertex_by_id(100449); g1->print_neighbors_of_vertex(100449); */ - /* + //g1->print_vertex_by_id(1033808); + + /* BFS single test: std::unique_ptr bfs = std::make_unique(g1); - bfs->do_measurements(); + auto startImportBFSTime = std::chrono::high_resolution_clock::now(); + uint64_t explored = bfs->do_BFS(1033808); + auto finishImportBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the$ + auto elapsedImportBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportBFSTime - startImportBFSTime ).count(); + std::cout << explored << " -> " << elapsedImportBFSTime << std::endl; */ + //std::unique_ptr bfs = std::make_unique(g1); + //bfs->do_measurements(10000, "/home/pfeiffer/measurements/csr/bfs_SF10.csv"); + return 0; } From 7bf2ffa7c404b758e9df27a2fe6f19795c7dbdb9 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 1 Nov 2019 14:14:06 +0100 Subject: [PATCH 089/216] cleaned bfs.h --- .../graph/{bfs_naive.h => top_down_bfs.h} | 72 +++++++------------ 1 file changed, 25 insertions(+), 47 deletions(-) rename include/core/operators/graph/{bfs_naive.h => top_down_bfs.h} (70%) diff --git a/include/core/operators/graph/bfs_naive.h b/include/core/operators/graph/top_down_bfs.h similarity index 70% rename from include/core/operators/graph/bfs_naive.h rename to include/core/operators/graph/top_down_bfs.h index 15a2073c..a47e3c8a 100644 --- a/include/core/operators/graph/bfs_naive.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -16,17 +16,16 @@ **********************************************************************************************/ /** - * @file bfs.h - * @brief naive (simple) BFS implementation to traverse graph of type CSR OR AdjacencyList + * @file top_down_bfs.h + * @brief top down BFS implementation to traverse graph * @todo implement vectorized BFS (AVX2, AVX-512) */ -#ifndef MORPHSTORE_BFS_NAIVE_H -#define MORPHSTORE_BFS_NAIVE_H +#ifndef MORPHSTORE_TOP_DOWN_BFS +#define MORPHSTORE_TOP_DOWN_BFS #include "../../storage/graph/graph.h" -#include #include namespace morphstore{ @@ -36,86 +35,66 @@ namespace morphstore{ private: std::unique_ptr graph; uint64_t graphSize; - // Create a "visited" array (true or false) to keep track of if we visited a vertex. - //std::vector visited = { false }; - //std::vector layer; - // Create a queue for the nodes we visit. - public: + public: - // constructor with smart pointer to graph as parameter + // constructor with smart pointer to graph as parameter/reference BFS(std::unique_ptr& g) : graph(std::move(g)){ graphSize = graph->getNumberVertices(); - //visited.resize(graphSize); - //layer.resize(graphSize); } uint64_t get_graph_size(){ return graphSize; } - // actual BFS (naive) algorithm: takes the start-node id and returns the number of explored vertices + // ------------------------------------------ BFS impl. ------------------------------------------ + + // actual BFS algorithm: takes the start-node id and returns the number of explored vertices uint64_t do_BFS(uint64_t startVertex){ std::vector frontier; std::vector next; - std::vector visited(graphSize, false); - - // debug: - //int layer = 0; - //int layerVertices = 0; - - // set every entry in visited array back to { false } - //clear_visited_array(); - + std::vector visited(graphSize, false); uint64_t exploredVertices = 0; frontier.push_back(startVertex); visited[startVertex] = true; - //layer[startVertex] = 0; - while(!frontier.empty()){ // Loop through current layer of vertices in the frontier for(uint64_t i = 0; i < frontier.size(); ++i){ uint64_t currentVertex = frontier[i]; - std::vector neighbors = graph->get_neighbors_ids(currentVertex); + // get list of a vertex's adjacency + std::vector neighbors = graph->get_neighbors_ids(currentVertex); + // Loop through all of neighbors of current vertex for(uint64_t j = 0; j < neighbors.size(); ++j){ - // check if neighbor has been visited, if not -> put into queue and mark as visit = true + // check if neighbor has been visited, if not -> put into frontier and mark as visit = true if(!visited[neighbors[j]]){ next.push_back(neighbors[j]); - //layer[neighbor] = layer[currentVertex] +1; visited[neighbors[j]] = true; ++exploredVertices; - //++layerVertices; } } } - //++layer; - //std::cout << "Explored layer " << layer << " -> " << layerVertices << std::endl; - //layerVertices = 0; - // swap frontier with next + // swap frontier with next frontier.swap(next); - // clear next: swap with an empty container is much faster + // clear next: swap with an empty container is faster std::vector().swap(next); - - //std::cout << "Vertex with ID " << currentVertex << "\t @ Layer " << layer[currentVertex] << std::endl; } return exploredVertices; - } + } - // function that measures the number of explored vertices and TIME: - // results are written into a file - // parameter cycle means the ith vertex (modulo) - void do_measurements(uint64_t cycle, std::string pathToFile){ + // ------------------------------------------ Measurement stuff ------------------------------------------ + // function that measures the number of explored vertices and time in ms: + // results are written into a file; cycle determines the ith vertex from list + void do_measurements(uint64_t cycle, std::string pathToFile){ // list of measurement candidates: the parameter means the ith vertex in total std::vector candidates = get_list_of_every_ith_vertex(cycle); - // Intermediate data structure: - // size = candidatesVector size*2, because we sequentially store both results (exploredVertices, needed Time) for every vertex + // Intermediate data structure: (explored vertices, time in ms) std::vector> results; - results.reserve(candidates.size()); + results.reserve(candidates.size()); for(uint64_t i = 0; i < candidates.size(); ++i){ @@ -147,7 +126,7 @@ namespace morphstore{ fs.close(); } - // function which returns a list of every ith vertex which is sorted by degree DESC + // function which returns a list of every ith vertex which is sorted by degree DESC std::vector< uint64_t > get_list_of_every_ith_vertex(uint64_t cycle){ std::vector< uint64_t > measurementCandidates; std::vector< std::pair > totalListOfVertices = graph->get_list_of_degree_DESC(); @@ -156,9 +135,8 @@ namespace morphstore{ } return measurementCandidates; } - }; } -#endif //MORPHSTORE_BFS_NAIVE_H +#endif //MORPHSTORE_TOP_DOWN_BFS From f9e120f004506e6857a9de4122bf5c014af0125b Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 1 Nov 2019 14:54:48 +0100 Subject: [PATCH 090/216] clean code and comments... --- include/core/operators/graph/top_down_bfs.h | 2 +- include/core/storage/graph/edge/edge.h | 3 +- .../storage/graph/formats/adjacencylist.h | 5 +- include/core/storage/graph/formats/csr.h | 64 +++--- include/core/storage/graph/graph.h | 49 ++-- include/core/storage/graph/graph.h.save | 211 ------------------ include/core/storage/graph/ldbc_import.h | 14 +- .../graph/vertex/adjacencylist_vertex.h | 14 +- .../core/storage/graph/vertex/csr_vertex.h | 4 +- include/core/storage/graph/vertex/vertex.h | 4 +- .../ldbc_graph_adjacencylist.cpp | 54 +---- .../ldbc_graph_adjacencylist.cpp.save | 83 ------- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 61 ++--- 13 files changed, 102 insertions(+), 466 deletions(-) delete mode 100644 include/core/storage/graph/graph.h.save delete mode 100644 test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save diff --git a/include/core/operators/graph/top_down_bfs.h b/include/core/operators/graph/top_down_bfs.h index a47e3c8a..f6ba23b4 100644 --- a/include/core/operators/graph/top_down_bfs.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -47,7 +47,7 @@ namespace morphstore{ return graphSize; } - // ------------------------------------------ BFS impl. ------------------------------------------ + // ------------------------------------------ BFS algorithm ------------------------------------------ // actual BFS algorithm: takes the start-node id and returns the number of explored vertices uint64_t do_BFS(uint64_t startVertex){ diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 7ecc45f1..731cdd48 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -17,7 +17,7 @@ /** * @file edge.h - * @brief Edge class which represents a relationship between 2 Vertices + * @brief Edge class which represents a relationship object betwenn two vertices * @todo */ @@ -114,6 +114,7 @@ namespace morphstore{ return getTargetId() < e.getTargetId(); } + // get size of edge object in bytes: size_t size_in_bytes() const{ size_t size = 0; size += sizeof(uint64_t) * 2; // source- and target-id diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 8d6d718a..11ee3c19 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -17,7 +17,7 @@ /** * @file adjacencylist.h - * @brief Derived ADJ-List storage format class. Base: graph.h + * @brief Derived adj. list storage format class. Base: graph.h * @todo */ @@ -35,7 +35,6 @@ namespace morphstore{ public: - storageFormat getStorageFormat() const override { return adjacencylist; } @@ -99,6 +98,7 @@ namespace morphstore{ } } + // for debugging: print neighbors a vertex void print_neighbors_of_vertex(uint64_t id) override{ vertices[id]->print_neighbors(); } @@ -113,6 +113,7 @@ namespace morphstore{ return vertices.at(id)->get_neighbors_ids(); } + // for measuring the size in bytes: std::pair get_size_of_graph() override { std::pair index_data_size; size_t data_size = 0; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index e0a8d291..a67af412 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -18,7 +18,7 @@ /** * @file csr.h * @brief Derived CSR storage format class. Base: graph.h - * @todo add_edge() functionality is missing -> needs a realloc()-strategy + * @todo */ #ifndef MORPHSTORE_CSR_H @@ -32,16 +32,14 @@ namespace morphstore{ class CSR: public Graph{ private: - /* graph topology: hybrid approach + /* graph topology: * node array: index is vertex-id; array cell contains offset in edge_array - * edge array: every cell contains pointer to edge object of vertex + * edge array: contains target id of relationship + * edge value array: contains edge object with addtional information (same index with edge array) */ - // TODO: free memory in destructor ? uint64_t* node_array = nullptr; - Edge* edge_array = nullptr; - - // BFS OPTIMIZATION APPROACH: - uint64_t* edge_targetID_array = nullptr; + uint64_t* edge_array = nullptr; + Edge* edge_value_array = nullptr; public: @@ -57,10 +55,8 @@ namespace morphstore{ vertices.reserve(numberVertices); node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); - edge_array = (Edge*) malloc(numberEdges * sizeof(Edge)); - - // BFS OPTIMIZATION APPROACH: - edge_targetID_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); + edge_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); + edge_value_array = (Edge*) malloc(numberEdges * sizeof(Edge)); // init node array: node_array[0] = 0; @@ -80,7 +76,7 @@ namespace morphstore{ return v->getID(); } - // TODO: add a single edge in graph arrays -> needs a memory reallocating stragety + // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { if(exist_id(from) && exist_id(to)){ std::cout << rel << std::endl; @@ -88,18 +84,16 @@ namespace morphstore{ } // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC - // every vertex id contains a list of neighbors + // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector relations) override { uint64_t offset = node_array[sourceID]; uint64_t nextOffset = offset + relations.size(); + // fill the arrays for(const auto & edge : relations){ - edge_array[offset] = edge; - - // BFS OPTIMIZATION APPROACH: - edge_targetID_array[offset] = edge.getTargetId(); - - ++offset; + edge_value_array[offset] = edge; + edge_array[offset] = edge.getTargetId(); + ++offset; } // to avoid buffer overflow: @@ -136,18 +130,19 @@ namespace morphstore{ }else{ nextOffset = node_array[id+1]; } - if(offset == nextOffset) return 0; + + if(offset == nextOffset) return 0; uint64_t numberEdges = nextOffset - offset; - //if(id == 1030169) std::cout << "edges: " << numberEdges << " - offset: " << offset << " - nextOffset: " << nextOffset << std::endl; return numberEdges; } + // for debugging: void print_neighbors_of_vertex(uint64_t id) override{ uint64_t offset = node_array[id]; uint64_t numberEdges = get_degree(id); for(uint64_t i = offset; i < offset+numberEdges; ++i){ - std::cout << "Source-ID: " << edge_array[i].getSourceId() << " - Target-ID: " << edge_array[i].getTargetId() << " - Property: { " << edge_array[i].getProperty().first << ": " << edge_array[i].getProperty().second << " }" << " || "; + std::cout << "Source-ID: " << edge_value_array[i].getSourceId() << " - Target-ID: " << edge_value_array[i].getTargetId() << " - Property: { " << edge_value_array[i].getProperty().first << ": " << edge_value_array[i].getProperty().second << " }" << " || "; } } @@ -156,20 +151,16 @@ namespace morphstore{ std::vector neighbors; uint64_t offset = node_array[id]; uint64_t numberEdges = get_degree(id); - /* - for(uint64_t i = offset; i < offset+numberEdges; ++i){ - neighbors.push_back(edge_array[i].getTargetId()); - }*/ - - // BFS OPTIMIZATION APPROACH: - /*Problem is that it does not put the second element into the vector -> alternative to insert() ???*/ + + // avoiding out of bounds ... if( offset < getNumberEdges()){ - neighbors.insert(neighbors.end(), edge_targetID_array+offset, edge_targetID_array+offset+numberEdges); - } - //std::vector neighbors(edge_targetID_array+offset, edge_targetID_array+offset+numberEdges-1); + neighbors.insert(neighbors.end(), edge_array+offset, edge_array+offset+numberEdges); + } + return neighbors; } + // get size of storage format: std::pair get_size_of_graph() override { std::pair index_data_size; size_t data_size = 0; @@ -197,17 +188,14 @@ namespace morphstore{ index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); // edges array values: for(uint64_t i = 0; i < getNumberEdges(); i++){ - index_size += sizeof(uint64_t); // node_array with offsets - data_size += edge_array[i].size_in_bytes(); // edge value arrray with object + index_size += sizeof(uint64_t); // node_array with offsets + data_size += edge_value_array[i].size_in_bytes(); // edge value array with object } index_data_size = {index_size, data_size}; return index_data_size; } - }; - } - #endif //MORPHSTORE_CSR_H diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 939d35e4..e946da5c 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -17,8 +17,8 @@ /** * @file graph.h - * @brief abstract graph class for any storage format --> CSR,ADJ - * @todo graph-size calculation!! + * @brief base graph class for any storage format --> CSR,ADJ + * @todo */ #ifndef MORPHSTORE_GRAPH_H @@ -120,7 +120,7 @@ namespace morphstore{ return vertices[id]; } - // function that return a list of pair < vertex id, degree > DESC + // function to return a list of pair < vertex id, degree > DESC: std::vector> get_list_of_degree_DESC(){ std::vector> vertexDegreeList; vertexDegreeList.reserve(numberVertices); @@ -129,27 +129,26 @@ namespace morphstore{ vertexDegreeList.push_back({i, this->get_degree(i)}); } // sort the vector on degree DESC - /*std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](auto &left, auto &right) { + std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { return left.second > right.second; - });*/ - std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { - return left.second > right.second; - }); + }); return vertexDegreeList; } - - // function to measure graph characteristics (degree and count): - void measure_degree_count(std::string filePath){ - std::vector> verticesDegree = get_list_of_degree_DESC(); - // unordered map for mapping degree to count: - std::unordered_map results; - for(uint64_t i = 0; i < verticesDegree.size(); ++i){ - // increment count in results for a given degree: - results[verticesDegree[i].second]++; - } - // write to file: - std::ofstream fs; + + // function to measure graph characteristics (degree and count): + void measure_degree_count(std::string filePath){ + std::vector> verticesDegree = get_list_of_degree_DESC(); + // unordered map for mapping degree to count: + std::unordered_map results; + + for(uint64_t i = 0; i < verticesDegree.size(); ++i){ + // increment count in results for a given degree: + results[verticesDegree[i].second]++; + } + + // write to file: + std::ofstream fs; std::stringstream ss; // open file for writing and delete existing stuff: fs.open(filePath, std::fstream::out | std::ofstream::trunc); @@ -158,9 +157,8 @@ namespace morphstore{ ss << m.first << "," << m.second << "\n"; } fs << ss.str() ; - fs.close(); - } + } // -------------------- pure virtual functions -------------------- @@ -174,12 +172,13 @@ namespace morphstore{ virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; - virtual std::pair get_size_of_graph() = 0; - // for debugging - virtual void print_neighbors_of_vertex(uint64_t id) = 0; + virtual std::pair get_size_of_graph() = 0; // -------------------- debugging functions -------------------- + // for debugging + virtual void print_neighbors_of_vertex(uint64_t id) = 0; + void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getNumberVertices() << std::endl; diff --git a/include/core/storage/graph/graph.h.save b/include/core/storage/graph/graph.h.save deleted file mode 100644 index 75a51f77..00000000 --- a/include/core/storage/graph/graph.h.save +++ /dev/null @@ -1,211 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file graph.h - * @brief abstract graph class for any storage format --> CSR,ADJ - * @todo graph-size calculation!! -*/ - -#ifndef MORPHSTORE_GRAPH_H -#define MORPHSTORE_GRAPH_H - -#include "vertex/vertex.h" -#include "edge/edge.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace morphstore{ - - class Graph{ - - protected: - uint64_t numberVertices; - uint64_t numberEdges; - - // Data-structure for Vertex-Properties - std::unordered_map> vertices; - - // Lookup for entities and relations: number to string - std::map entityDictionary; - std::map relationDictionary; - - public: - - enum storageFormat {csr, adjacencylist }; - - // -------------------- Setters & Getters -------------------- - - const std::map &getEntityDictionary() const { - return entityDictionary; - } - - void setEntityDictionary(const std::map& ent) { - this->entityDictionary = ent; - } - - const std::map &getRelationDictionary() const { - return relationDictionary; - } - - void setRelationDictionary(const std::map& rel) { - this->relationDictionary = rel; - } - - uint64_t getNumberVertices() const { - return numberVertices; - } - - void setNumberVertices(uint64_t numV) { - Graph::numberVertices = numV; - } - - uint64_t getNumberEdges() const { - return numberEdges; - } - - void setNumberEdges(uint64_t numE) { - Graph::numberEdges = numE; - } - - std::string get_entity_by_number(unsigned short int e){ - if(entityDictionary.find( e ) != entityDictionary.end()){ - return entityDictionary.at(e); - }else{ - return "No Matching of entity-number in the database!"; - } - } - - std::string get_relation_by_number(unsigned short int re){ - if(relationDictionary.find( re ) != relationDictionary.end()){ - return relationDictionary.at(re); - }else{ - return "No Matching of relation-number in the database!"; - } - } - - // function to check if the vertex-ID is present or not (exists) - bool exist_id(const uint64_t id){ - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } - - // function which returns a pointer to vertex by id - std::shared_ptr get_vertex_by_id(uint64_t id){ - return vertices[id]; - } - - // function that return a list of pair < vertex id, degree > DESC - std::vector> get_list_of_degree_DESC(){ - std::vector> vertexDegreeList; - vertexDegreeList.reserve(numberVertices); - // fill the vector with every vertex key and his degree - for(uint64_t i = 0; i < numberVertices; ++i){ - vertexDegreeList.push_back({i, this->get_degree(i)}); - } - // sort the vector on degree DESC - /*std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](auto &left, auto &right) { - return left.second > right.second; - });*/ - std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { - return left.second > right.second; - }); - - return vertexDegreeList; - } - - // function to measure graph characteristics (degree and count): - void measure_degree_count(std::string filePath){ - std::vector> verticesDegree = get_list_of_degree_DESC(); - std::cout << "Highest degree: " << verticesDegree[0].second << std::endl; - // unordered map for mapping degree to count: - std::unordered_map results; - for(uint64_t i = 0; i < verticesDegree.size(); ++i){ - // increment count in results for a given degree: - results[verticesDegree[i].second]++; - } - // write to file: - for(auto const & m : results){ - ; - } - } - - // -------------------- pure virtual functions -------------------- - - virtual storageFormat getStorageFormat() const = 0; - virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; - virtual void add_vertex() = 0; - virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; - virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; - virtual void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) = 0; - virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; - virtual uint64_t get_degree(uint64_t id) = 0; - virtual std::vector get_neighbors_ids(uint64_t id) = 0; - virtual std::pair get_size_of_graph() = 0; - // for debugging - virtual void print_neighbors_of_vertex(uint64_t id) = 0; - - // -------------------- debugging functions -------------------- - - void statistics(){ - std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << getNumberVertices() << std::endl; - std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; - std::cout << "--------------------------------------------" << std::endl; - } - - void print_vertex_by_id(uint64_t id) { - std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; - std::shared_ptr v = vertices[id]; - std::cout << "Vertex-ID: \t" << v->getID() << std::endl; - std::cout << "Entity: \t" << get_entity_by_number(v->getEntity()) << std::endl; - std::cout << "\n"; - std::cout << "Properties: "; - v->print_properties(); - std::cout << "#Edges: " << this->get_degree(v->getID()); - std::cout << "\n"; - std::cout << "-----------------------------------------------" << std::endl; - } - - void print_entity_relationship_dicts(){ - std::cout << "Entity-Dict: " << std::endl; - for(auto const& entry : entityDictionary){ - std::cout << entry.first << " -> " << entry.second << std::endl; - } - std::cout << "\n"; - - std::cout << "Relationship-Dict: " << std::endl; - for(auto const& rel : relationDictionary){ - std::cout << rel.first << " -> " << rel.second << std::endl; - } - } - - }; - -} - - -#endif //MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 4e272180..2fed0f01 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -27,6 +27,7 @@ #include #include +// experimental/filesystem to read file directories #include #include #include @@ -267,8 +268,6 @@ namespace morphstore{ // iterate through vector of relation-addresses for (const auto &address : relationsPaths) { - - // TODO OPTIMIZE HERE: remove string operations // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); @@ -296,8 +295,7 @@ namespace morphstore{ if (relationFile.is_open()) { fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. relationFile.clear(); - relationFile.seekg(0, - std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory @@ -477,9 +475,9 @@ namespace morphstore{ if(start == 0){ propertyKey = row.substr(row.find(delimiter) + 1); }else{ - // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) + // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) systemID = globalIdLookupMap[{fromEntity, row.substr(0, row.find(delimiter))}]; - value = row.substr(row.find(delimiter) + 1); + value = row.substr(row.find(delimiter) + 1); multiValueAttr[systemID] = std::move(value); } @@ -549,7 +547,7 @@ namespace morphstore{ row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // insert relation into vertexRealtionsLookup with its edge-property: + // insert relation into vertexRelationsLookup with its edge-property: vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber, {propertyKey, value})); } } @@ -598,7 +596,7 @@ namespace morphstore{ } } - // MAIN import function: see steps in comments + // MAIN IMPORT FUNCTION: see steps in comments void import(Graph& graph) { //std::cout << "Importing LDBC-files into graph ... "; //std::cout.flush(); diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h index 8d537d37..3fa1fd83 100644 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ b/include/core/storage/graph/vertex/adjacencylist_vertex.h @@ -16,8 +16,8 @@ **********************************************************************************************/ /** - * @file avertex.h - * @brief Derived vertex calss for ADJ_LIST storage format: base-class: vertex + * @file adjacencylistvertex.h + * @brief Derived vertex calss for adj. list storage format: base-class: vertex * @todo */ @@ -32,8 +32,7 @@ namespace morphstore{ protected: std::vector adjacencylist; - - // BFS OPTIMIZATION APPROACH: + // additional adjacency list that only contains the target ids -> for bfs measurements std::vector adjacencylistBFS; public: @@ -58,7 +57,7 @@ namespace morphstore{ void add_edges(const std::vector edges) override { this->adjacencylist = edges; - // BFS OPTIMIZATION APPROACH: + // for the additional adjacency list: transformation for(auto edge : edges){ adjacencylistBFS.push_back(edge.getTargetId()); } @@ -69,6 +68,7 @@ namespace morphstore{ return adjacencylist.size(); } + // debugging: void print_neighbors() override { for(const auto& edge : adjacencylist){ std::cout << "Source-ID: " << edge.getSourceId() << " - Target-ID: " << edge.getTargetId() << @@ -78,17 +78,17 @@ namespace morphstore{ // function to return a vector of neighbor ids (for BFS) std::vector get_neighbors_ids() override { - /* + /* old approach std::vector neighbors; for(auto const& edge : adjacencylist){ neighbors.push_back(edge.getTargetId()); } return neighbors; */ - // BFS OPTIMIZATION APPROACH: return adjacencylistBFS; } + // get size of vertex in bytes: size_t get_data_size_of_vertex() override { size_t size = 0; size += sizeof(uint64_t); // id diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h index 68fa8edc..09de9b74 100644 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ b/include/core/storage/graph/vertex/csr_vertex.h @@ -37,7 +37,7 @@ namespace morphstore{ } // this function has no usage here: the adding of edges happens in the graph file -> csr.h - // it's just here because its a pure function in Vertex.h + // it's just here because it's a pure function in Vertex.h void add_edge(uint64_t from, uint64_t to,unsigned short int rel) override { std::cout << " virtual add_edge - no usage: " << from << ", " << to << ", " << rel << std::endl; } @@ -47,10 +47,12 @@ namespace morphstore{ std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; } + // debugging void print_neighbors() override { std::cout << " virtual print_neighbors - no usage: " << std::endl; } + // get size of csr vertex in bytes: size_t get_data_size_of_vertex() override { size_t size = 0; // properties: diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 8cbc30d5..29713e06 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -18,7 +18,7 @@ /** * @file vertex.h * @brief abstract vertex class for storage formats - * @todo add vertex size calculation + * @todo */ #ifndef MORPHSTORE_VERTEX_H @@ -84,7 +84,7 @@ namespace morphstore{ // for BFS alg.: adj-list virtual std::vector get_neighbors_ids() { - // return empty vector: implementation only needed in ADj-Vertex + // return empty vector: implementation only needed in adj - Vertex return std::vector(); } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index a863e044..d63a6740 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -17,16 +17,15 @@ /** * @file ldbc_graph_adjacency.cpp - * @brief Test for generating social network graph in ADJ_LIST format + * @brief Test for generating social network graph in adj. list format + BFS measurements * @todo */ #include #include -#include +#include #include // for high_resolution_clock -#include int main( void ){ @@ -39,57 +38,28 @@ int main( void ){ std::cout << "\n"; */ - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" - // NEVER FORGET THE LAST / in address!!! - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_10/"); - // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); + // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); - // start measuring import time: - //auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - - // generate vertices & edges from LDBC files and insert into graph + // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*g1); - // get some graph infos: + // measure degree distribution and write to file (file path as parameter): g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); - // measuring time: - //auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - //auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); - - //g1->statistics(); + // some statistics (DEBUG) + // g1->statistics(); - // size of graph in bytes: - //std::pair size = g1->get_size_of_graph(); - //std::cout << "index: " << size.first << " - data: " << size.second << std::endl; // size in bytes - //std::cout << elapsedImportTime << std::endl; // time in milli sec. - - /* Test Vertex, which contains edges with properties (SERVER): - */ + // (DEBUG) Test Vertex, which contains edges with properties (SERVER): // g1->print_vertex_by_id(1035174); // g1->print_neighbors_of_vertex(1035174); - /* Test Vertex, which contains edges with properties (MY PC): - g1->print_vertex_by_id(100449); - g1->print_neighbors_of_vertex(100449); - */ - - //g1->print_vertex_by_id(1033808); - - /*BFS single test: - std::unique_ptr bfs = std::make_unique(g1); - auto startImportBFSTime = std::chrono::high_resolution_clock::now(); - uint64_t explored = bfs->do_BFS(1033808); - auto finishImportBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedImportBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportBFSTime - startImportBFSTime ).count(); - std::cout << explored << " -> " << elapsedImportBFSTime << std::endl; - */ - - //std::unique_ptr bfs = std::make_unique(g1); - //bfs->do_measurements(10000, "/home/pfeiffer/measurements/adjacency_list/bfs_SF10.csv"); + // Execute BFS measurements: + // std::unique_ptr bfs = std::make_unique(g1); + // bfs->do_measurements(10000, "/home/pfeiffer/measurements/adjacency_list/bfs_SF1.csv"); return 0; } diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save deleted file mode 100644 index 2b4ccef6..00000000 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp.save +++ /dev/null @@ -1,83 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file ldbc_graph_adjacency.cpp - * @brief Test for generating social network graph in ADJ_LIST format - * @todo - */ - -#include -#include -#include - -#include // for high_resolution_clock -#include - -int main( void ){ - - // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- - /* - std::cout << "\n"; - std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; - std::cout << "**********************************************************" << std::endl; - std::cout << "\n"; - */ - - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" - // NEVER FORGET THE LAST / in address!!! - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); - // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); - - // Graph init: - std::unique_ptr g1 = std::make_unique(); - - // start measuring import time: - auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - - // generate vertices & edges from LDBC files and insert into graph - ldbcImport->import(*g1); - - // measuring time: - auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); - - // size of graph in bytes: - size_t size = g1->get_size_of_graph(); - std::cout << "Size: " << size << " bytes\n"; - - g1->statistics(); - std::cout << "Import: " << elapsedImportTime << " millisec.\n"; - - /* Test Vertex, which contains edges with properties (SERVER): - */ - // g1->print_vertex_by_id(1035174); - // g1->print_neighbors_of_vertex(1035174); - - - /* Test Vertex, which contains edges with properties (MY PC): - g1->print_vertex_by_id(100449); - g1->print_neighbors_of_vertex(100449); - */ - - - std::unique_ptr bfs = std::make_unique(g1); - bfs->do_measurements(); - - return 0; -} diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 65c2965b..5cd6b71e 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -17,78 +17,49 @@ /** * @file ldbc_graph_csr.cpp - * @brief Test for generating social network graph in CSR format + * @brief Test for generating social network graph in CSR format + BFS measurements * @todo */ #include #include -#include +#include #include // for high_resolution_clock int main( void ){ - // ------------------------------------ LDBC-IMPORT TEST ------------------------------------ + // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- /* std::cout << "\n"; std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: Compressed Row Storage Format *" << std::endl; + std::cout << "* MorphStore-Storage-Test: CSR Storage Format *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; - */ + */ - // when using server with ssh pfeiffer@141.76.47.9: directory = "/home/pfeiffer/ldbc_sn_data/" - // NEVER FORGET THE LAST / in address!!! - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_10/"); - // std::unique_ptr ldbcImport = std::make_unique("/opt/ldbc_snb_datagen-0.2.8/social_network/social_network_1/"); + // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) + std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); // Graph init: std::unique_ptr g1 = std::make_unique(); - // start measuring import time: - //auto startImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - - // generate vertices & edges from LDBC files and insert into graph + // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*g1); - // get some graph infos: - g1->measure_degree_count("/home/pfeiffer/measurements/csr/graph_degree_count_SF10.csv"); - - // measuring time: - //auto finishImportTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - //auto elapsedImportTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportTime - startImportTime ).count(); + // measure degree distribution and write to file (file path as parameter): + g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); - //g1->statistics(); + // some statistics (DEBUG) + // g1->statistics(); - // size of graph in bytes: - //std::pair size = g1->get_size_of_graph(); - //std::cout << "index: " << size.first << " - data: " << size.second << std::endl; // size in bytes - //std::cout << elapsedImportTime << std::endl; // time in milli sec. - - /* Test Vertex, which contains edges with properties (SERVER): - */ + // (DEBUG) Test Vertex, which contains edges with properties (SERVER): // g1->print_vertex_by_id(1035174); // g1->print_neighbors_of_vertex(1035174); - /* Test Vertex, which contains edges with properties (MY PC): - g1->print_vertex_by_id(100449); - g1->print_neighbors_of_vertex(100449); - */ - - //g1->print_vertex_by_id(1033808); - - /* BFS single test: - std::unique_ptr bfs = std::make_unique(g1); - auto startImportBFSTime = std::chrono::high_resolution_clock::now(); - uint64_t explored = bfs->do_BFS(1033808); - auto finishImportBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the$ - auto elapsedImportBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishImportBFSTime - startImportBFSTime ).count(); - std::cout << explored << " -> " << elapsedImportBFSTime << std::endl; - */ - - //std::unique_ptr bfs = std::make_unique(g1); - //bfs->do_measurements(10000, "/home/pfeiffer/measurements/csr/bfs_SF10.csv"); + // Execute BFS measurements: + // std::unique_ptr bfs = std::make_unique(g1); + // bfs->do_measurements(10000, "/home/pfeiffer/measurements/csr/bfs_SF1.csv"); return 0; } From 18fb28e7cd886523960771a1a8db0fa3e5648425 Mon Sep 17 00:00:00 2001 From: Tim Pfeiffer Date: Fri, 1 Nov 2019 14:58:51 +0100 Subject: [PATCH 091/216] comment out measurements --- .../storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp | 4 ++-- test/core/storage/graph/csr/ldbc_graph_csr.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp index d63a6740..eea8c86b 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp @@ -48,10 +48,10 @@ int main( void ){ ldbcImport->import(*g1); // measure degree distribution and write to file (file path as parameter): - g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); + // g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); // some statistics (DEBUG) - // g1->statistics(); + g1->statistics(); // (DEBUG) Test Vertex, which contains edges with properties (SERVER): // g1->print_vertex_by_id(1035174); diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp index 5cd6b71e..9da0f041 100644 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ b/test/core/storage/graph/csr/ldbc_graph_csr.cpp @@ -48,10 +48,10 @@ int main( void ){ ldbcImport->import(*g1); // measure degree distribution and write to file (file path as parameter): - g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); + // g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); // some statistics (DEBUG) - // g1->statistics(); + g1->statistics(); // (DEBUG) Test Vertex, which contains edges with properties (SERVER): // g1->print_vertex_by_id(1035174); From d35fec89c553d9fd2f588fa8c92cd2930d8249f2 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 22 Mar 2020 16:04:31 +0100 Subject: [PATCH 092/216] Check if vertex files could be found for ldbc import --- include/core/storage/graph/ldbc_import.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 2fed0f01..446e4c69 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -86,6 +86,11 @@ namespace morphstore{ differentiate(entry.path().string(), dir); } } + + if(verticesPaths.empty()) { + print_file_names(); + throw std::invalid_argument("No vertex files found"); + } } // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector @@ -209,7 +214,6 @@ namespace morphstore{ // graph gets full entity-list here: graph.setEntityDictionary(entitiesLookup); } - } // function which returns true, if parameter is a entity in ldbc-files From 956b9752706c211fb7a7c636a80f5277e4524e40 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 22 Mar 2020 19:31:52 +0100 Subject: [PATCH 093/216] Use regExp to filter valid csv files and getEntityType removing previous string magic (could not handle f.i. "post_0_0.csv" as expected "post.csv") --- include/core/storage/graph/ldbc_import.h | 250 +++++++++++++---------- 1 file changed, 146 insertions(+), 104 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 446e4c69..9d91011e 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -35,6 +35,7 @@ #include #include #include +#include // hash function used to hash a pair of any kind using XOR (for verticesMap) struct hash_pair { @@ -75,11 +76,28 @@ namespace morphstore{ return directory; } + // get the vertex or edge type based on the fileName + std::string getEntityType(std::string fileName) { + // last [a-zA-Z] to remove ending _ + std::regex typeRegExp("[a-zA-Z_]+[a-zA-Z]"); + std::smatch match; + + if(std::regex_search(fileName, match, typeRegExp)) { + std::cout << "EntityType: " << match[0] << std::endl; + std::cout.flush(); + return match[0]; + } + else { + throw std::invalid_argument("No EntityType in: " + fileName); + } + } + + // function which iterates through directory to receive file names (entire path) void insert_file_names(std::string dir) { for (const auto &entry : std::experimental::filesystem::directory_iterator(dir)) { - // ignore files starting with a '.' - if (entry.path().string()[dir.size()] == '.') { + // ignore files starting with a '.' (+ 1 as '/' is the first character otherwise) + if (entry.path().string()[dir.size() + 1] == '.') { continue; } else { // insert file path to vertices or relations vector @@ -87,8 +105,9 @@ namespace morphstore{ } } + print_file_names(); + if(verticesPaths.empty()) { - print_file_names(); throw std::invalid_argument("No vertex files found"); } } @@ -96,128 +115,142 @@ namespace morphstore{ // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector void differentiate(std::string path, std::string dir) { // if the string contains a '_' -> it's a relation file; otherwise a vertex file - // remove dir name to remain only the *.csv - if (path.substr(dir.size()).find('_') != std::string::npos) { - relationsPaths.push_back(path); + // if string contains word_word it is an edge files (vertex files only contain one word) + // todo: remove dir name to remain only the *.csv + + // a vertex file contains exactly one word and after that only numbers are allowed f.i. _0_0 + std::regex vertexFileRegExp("^\\/([a-zA-Z]+\\_)([0-9_]*).csv$"); + std::string fileName = path.substr(dir.size()); + + if (std::regex_match(fileName, vertexFileRegExp)) { + verticesPaths.push_back(fileName); } else { - verticesPaths.push_back(path); + relationsPaths.push_back(fileName); } } // this function reads the vertices-files and creates vertices in a graph // + creates the entityLookup (number to string) for the graph void generate_vertices(Graph& graph) { + std::cout << "(1/2) Generating LDBC-Vertices ..."; + std::cout.flush(); - if (!verticesPaths.empty()) { - //std::cout << "(1/2) Generating LDBC-Vertices ..."; - //std::cout.flush(); + //this variable is used for the entityLookup-keys, starting by 0 + unsigned short int entityNumber = 0; - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; - - // iterate through vector of vertex-addresses - for (const auto &address : verticesPaths) { + // iterate through vector of vertex-addresses + for (const auto &file : verticesPaths) + { + // data structure for attributes of entity, e.g. taglass -> id, name, url + std::vector attributes; - // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; + + // get the entity from address ([...path...] / [entity-name].csv) + std::string entity = getEntityType(file); - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), - address.size() - getDirectory().size() - 4); + char *buffer; - char *buffer; + uint64_t fileSize = 0; - uint64_t fileSize = 0; + std::string address = getDirectory() + file; - std::ifstream vertexFile(address, std::ios::binary | + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!vertexFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, - std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } - // allocate memory - buffer = (char *) malloc(fileSize * sizeof(char)); - vertexFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; + // calculate file size + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + vertexFile.seekg(0, + std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + } - // read buffer and do the magic ... - for (size_t i = 0; i < fileSize; ++i) { - if (buffer[i] == '\n') { - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); + // allocate memory + buffer = (char *)malloc(fileSize * sizeof(char)); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; + + // read buffer and do the magic ... + for (size_t i = 0; i < fileSize; ++i) + { + if (buffer[i] == '\n') + { + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); + + // remove unnecessary '\n' at the beginning of a string + if (row.find('\n') != std::string::npos) + { + row.erase(0, 1); + } - // remove unnecessary '\n' at the beginning of a string - if (row.find('\n') != std::string::npos) { - row.erase(0, 1); + size_t last = 0; + size_t next = 0; + + // first line of *.csv contains the attributes -> write to attributes vector + if (start == 0) + { + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos) + { + attributes.push_back(row.substr(last, next - last)); + last = next + 1; } - - size_t last = 0; - size_t next = 0; - - // first line of *.csv contains the attributes -> write to attributes vector - if (start == 0) { - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos) { - attributes.push_back(row.substr(last, next - last)); - last = next + 1; - } - // last attribute - attributes.push_back(row.substr(last)); - } else { - // actual data: - std::unordered_map properties; - size_t attrIndex = 0; - std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos) { - properties.insert( - std::make_pair(attributes[attrIndex], row.substr(last, next - last))); - last = next + 1; - ++attrIndex; - } - // last attribute - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); - - //----------------------------------------------------- - // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); - // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, systemID}); - //----------------------------------------------------- - properties.clear(); // free memory + // last attribute + attributes.push_back(row.substr(last)); + } + else + { + // actual data: + std::unordered_map properties; + size_t attrIndex = 0; + std::string ldbcID = row.substr(0, row.find(delimiter)); + while ((next = row.find(delimiter, last)) != std::string::npos) + { + properties.insert( + std::make_pair(attributes[attrIndex], row.substr(last, next - last))); + last = next + 1; + ++attrIndex; } - - start = i; // set new starting point for buffer (otherwise it's concatenated) + // last attribute + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + + //----------------------------------------------------- + // create vertex and insert into graph with properties + uint64_t systemID = graph.add_vertex_with_properties(properties); + // add entity number to vertex + graph.add_entity_to_vertex(systemID, entityNumber); + // map entity and ldbc id to system generated id + globalIdLookupMap.insert({{entity, ldbcID}, systemID}); + //----------------------------------------------------- + properties.clear(); // free memory } + + start = i; // set new starting point for buffer (otherwise it's concatenated) } + } - delete[] buffer; // free memory - vertexFile.close(); + delete[] buffer; // free memory + vertexFile.close(); - // insert entity-number with string into map - entitiesLookup.insert(std::make_pair(entityNumber, entity)); - ++entityNumber; - attributes.clear(); - } - // graph gets full entity-list here: - graph.setEntityDictionary(entitiesLookup); + // insert entity-number with string into map + entitiesLookup.insert(std::make_pair(entityNumber, entity)); + ++entityNumber; + attributes.clear(); } + // graph gets full entity-list here: + graph.setEntityDictionary(entitiesLookup); } // function which returns true, if parameter is a entity in ldbc-files bool is_entity(const std::string &entity) { + // Todo: replace whole function by by entitiesLookup.find(entity) // iterate through entities-map to look up for paramater for (auto const &entry : entitiesLookup) { if (entry.second == entity) { @@ -242,6 +275,7 @@ namespace morphstore{ // for debugging void print_file_names() { + std::cout << "File-directory: " << getDirectory() << std::endl; std::cout << "Vertices-Files: " << std::endl; for (const auto &v : verticesPaths) { std::cout << "\t" << v << std::endl; @@ -271,10 +305,12 @@ namespace morphstore{ if (!relationsPaths.empty()) { // iterate through vector of relation-addresses - for (const auto &address : relationsPaths) { + for (const auto &file : relationsPaths) { // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = address.substr(getDirectory().size(), - address.size() - getDirectory().size() - 4); + std::string relation = getEntityType(file); + + + // TOdo: use regExp ([a-zA-Z]+)_([a-zA-Z]+)_([a-zA-Z]+) std::string fromEntity = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); @@ -287,6 +323,8 @@ namespace morphstore{ uint64_t fileSize = 0; + std::string address = getDirectory() + file; + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening @@ -342,15 +380,17 @@ namespace morphstore{ unsigned short int entityNumber = 0; // iterate through vector of vertex-addresses - for (const auto &address : verticesPaths) { + for (const auto &file : verticesPaths) { // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string entity = getEntityType(file); char *buffer; uint64_t fileSize = 0; + std::string address = getDirectory() + file; + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening @@ -410,20 +450,20 @@ namespace morphstore{ void fill_vertexRelationsLookup(Graph& graph){ if(!relationsPaths.empty()) { - //std::cout << "(2/2) Generating LDBC-Edges ..."; - //std::cout.flush(); + std::cout << "(2/2) Generating LDBC-Edges ..."; + std::cout.flush(); //this variable is used for the relationLookup-keys, starting by 0 unsigned short int relationNumber = 0; bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) // iterate through vector of vertex-addresses - for (const auto &address : relationsPaths) { + for (const auto &file : relationsPaths) { isRelation = false; // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = address.substr(getDirectory().size(), address.size() - getDirectory().size() - 4); + std::string relation = getEntityType(file); std::string fromEntity = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); @@ -436,6 +476,8 @@ namespace morphstore{ uint64_t fileSize = 0; + std::string address = getDirectory() + file; + std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!relationFile) { From 5f427f42bbce782fb53aa5f9efd0a12d6eeacd3d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 22 Mar 2020 19:47:46 +0100 Subject: [PATCH 094/216] Make entity naming consistent * only vertices and edges now * ldbc-entity was replaced with vertexType --- include/core/storage/graph/edge/edge.h | 23 ++--- .../storage/graph/formats/adjacencylist.h | 16 ++-- include/core/storage/graph/formats/csr.h | 14 ++-- include/core/storage/graph/graph.h | 48 +++++------ include/core/storage/graph/ldbc_import.h | 83 ++++++++++--------- include/core/storage/graph/vertex/vertex.h | 17 ++-- 6 files changed, 104 insertions(+), 97 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 731cdd48..ca23c9f5 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -17,7 +17,7 @@ /** * @file edge.h - * @brief Edge class which represents a relationship object betwenn two vertices + * @brief Edge class which represents an edge object between two vertices * @todo */ @@ -35,22 +35,23 @@ namespace morphstore{ protected: // Edge characteristics uint64_t sourceID, targetID; - unsigned short int relation; + unsigned short int type; + // todo: allow map instead of pair std::pair property; public: // Constructors with parameters - Edge(uint64_t from, uint64_t to, unsigned short int rel){ + Edge(uint64_t from, uint64_t to, unsigned short int type){ setSourceId(from); setTargetId(to); - setRelation(rel); + setType(type); } - Edge(uint64_t from, uint64_t to, unsigned short int rel, std::pair prop){ + Edge(uint64_t from, uint64_t to, unsigned short int type, std::pair prop){ setSourceId(from); setTargetId(to); - setRelation(rel); + setType(type); setProperty(prop); } @@ -63,7 +64,7 @@ namespace morphstore{ // do the copy setSourceId(edge.sourceID); setTargetId(edge.targetID); - setRelation(edge.relation); + setType(edge.type); setProperty(edge.property); // return the existing object so we can chain this operator @@ -88,12 +89,12 @@ namespace morphstore{ targetID = targetId; } - unsigned short getRelation() const { - return relation; + unsigned short getType() const { + return type; } - void setRelation(unsigned short relation) { - Edge::relation = relation; + void setType(unsigned short type) { + Edge::type = type; } const std::pair &getProperty() const { diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 11ee3c19..553cfdaf 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -69,19 +69,19 @@ namespace morphstore{ } } - // adding entity to vertex - void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) override { + // adding type to vertex + void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { if (exist_id(id)) { - vertices[id]->setEntity(entity); + vertices[id]->setType(type); } else { std::cout << "Vertex with ID " << id << " not found." << std::endl; } } // adding a single edge to vertex: - void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { + void add_edge(uint64_t from, uint64_t to, unsigned short int type) override { if (exist_id(from) && exist_id(to)) { - vertices[from]->add_edge(from, to, rel); + vertices[from]->add_edge(from, to, type); } else { std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; } @@ -119,13 +119,13 @@ namespace morphstore{ size_t data_size = 0; size_t index_size = 0; - // lookup dicts: entity dict + relation dict. + // lookup type dicts index_size += 2 * sizeof(std::map); - for(auto& ent : entityDictionary){ + for(auto& ent : vertexTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(ent.second.length()); } - for(auto& rel : relationDictionary){ + for(auto& rel : edgeTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(rel.second.length()); } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index a67af412..2d142db8 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -34,7 +34,7 @@ namespace morphstore{ private: /* graph topology: * node array: index is vertex-id; array cell contains offset in edge_array - * edge array: contains target id of relationship + * edge array: contains target id of the edge (TODO: should contain edge-id) * edge value array: contains edge object with addtional information (same index with edge array) */ uint64_t* node_array = nullptr; @@ -111,12 +111,12 @@ namespace morphstore{ } } - // adding entity to vertex - void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) override { + // adding type to vertex + void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { if(exist_id(id)){ - vertices[id]->setEntity(entity); + vertices[id]->setType(type); }else{ - std::cout << "Vertex with ID " << id << " not found./entity_to_vertex." << std::endl; + std::cout << "Vertex with ID " << id << " not found./type_to_vertex." << std::endl; } } @@ -168,11 +168,11 @@ namespace morphstore{ // lookup dicts: entity dict + relation dict. index_size += 2 * sizeof(std::map); - for(auto& ent : entityDictionary){ + for(auto& ent : vertexTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(ent.second.length()); } - for(auto& rel : relationDictionary){ + for(auto& rel : edgeTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(rel.second.length()); } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index e946da5c..832c2668 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -49,9 +49,9 @@ namespace morphstore{ // Data-structure for Vertex-Properties std::unordered_map> vertices; - // Lookup for entities and relations: number to string - std::map entityDictionary; - std::map relationDictionary; + // Lookup for types: number to string + std::map vertexTypeDictionary; + std::map edgeTypeDictionary; public: @@ -59,20 +59,20 @@ namespace morphstore{ // -------------------- Setters & Getters -------------------- - const std::map &getEntityDictionary() const { - return entityDictionary; + const std::map &getVertexTypeDictionary() const { + return vertexTypeDictionary; } - void setEntityDictionary(const std::map& ent) { - this->entityDictionary = ent; + void setVertexTypeDictionary(const std::map& ent) { + this->vertexTypeDictionary = ent; } const std::map &getRelationDictionary() const { - return relationDictionary; + return edgeTypeDictionary; } - void setRelationDictionary(const std::map& rel) { - this->relationDictionary = rel; + void setEdgeTypeDictionary(const std::map& rel) { + this->edgeTypeDictionary = rel; } uint64_t getNumberVertices() const { @@ -91,17 +91,17 @@ namespace morphstore{ Graph::numberEdges = numE; } - std::string get_entity_by_number(unsigned short int e){ - if(entityDictionary.find( e ) != entityDictionary.end()){ - return entityDictionary.at(e); + std::string get_vertexType_by_number(unsigned short int type){ + if(vertexTypeDictionary.find( type ) != vertexTypeDictionary.end()){ + return vertexTypeDictionary.at(type); }else{ - return "No Matching of entity-number in the database!"; + return "No Matching of type-number in the database!"; } } - std::string get_relation_by_number(unsigned short int re){ - if(relationDictionary.find( re ) != relationDictionary.end()){ - return relationDictionary.at(re); + std::string get_edgeType_by_number(unsigned short int type){ + if(edgeTypeDictionary.find( type ) != edgeTypeDictionary.end()){ + return edgeTypeDictionary.at(type); }else{ return "No Matching of relation-number in the database!"; } @@ -167,7 +167,7 @@ namespace morphstore{ virtual void add_vertex() = 0; virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; - virtual void add_entity_to_vertex(const uint64_t id, const unsigned short int entity) = 0; + virtual void add_type_to_vertex(const uint64_t id, const unsigned short int type) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_degree(uint64_t id) = 0; @@ -190,7 +190,7 @@ namespace morphstore{ std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; std::shared_ptr v = vertices[id]; std::cout << "Vertex-ID: \t" << v->getID() << std::endl; - std::cout << "Entity: \t" << get_entity_by_number(v->getEntity()) << std::endl; + std::cout << "Type: \t" << get_vertexType_by_number(v->getType()) << std::endl; std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); @@ -199,15 +199,15 @@ namespace morphstore{ std::cout << "-----------------------------------------------" << std::endl; } - void print_entity_relationship_dicts(){ - std::cout << "Entity-Dict: " << std::endl; - for(auto const& entry : entityDictionary){ + void print_type_dicts(){ + std::cout << "VertexType-Dict: " << std::endl; + for(auto const& entry : vertexTypeDictionary){ std::cout << entry.first << " -> " << entry.second << std::endl; } std::cout << "\n"; - std::cout << "Relationship-Dict: " << std::endl; - for(auto const& rel : relationDictionary){ + std::cout << "EdgeType-Dict: " << std::endl; + for(auto const& rel : edgeTypeDictionary){ std::cout << rel.first << " -> " << rel.second << std::endl; } } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 9d91011e..4e12c0c3 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -55,14 +55,14 @@ namespace morphstore{ private: std::string directory; std::vector verticesPaths; - std::vector relationsPaths; - std::map entitiesLookup; - std::map relationsLookup; + std::vector edgesPaths; + std::map vertexTypeLookup; + std::map edgeTypeLookup; // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) - std::unordered_map> vertexRelationsLookup; + std::unordered_map> vertexEdgesLookup; public: @@ -83,8 +83,8 @@ namespace morphstore{ std::smatch match; if(std::regex_search(fileName, match, typeRegExp)) { - std::cout << "EntityType: " << match[0] << std::endl; - std::cout.flush(); + //std::cout << "EntityType: " << match[0] << std::endl; + //std::cout.flush(); return match[0]; } else { @@ -100,7 +100,7 @@ namespace morphstore{ if (entry.path().string()[dir.size() + 1] == '.') { continue; } else { - // insert file path to vertices or relations vector + // insert file path to vertices or edges vector differentiate(entry.path().string(), dir); } } @@ -125,7 +125,7 @@ namespace morphstore{ if (std::regex_match(fileName, vertexFileRegExp)) { verticesPaths.push_back(fileName); } else { - relationsPaths.push_back(fileName); + edgesPaths.push_back(fileName); } } @@ -225,7 +225,7 @@ namespace morphstore{ // create vertex and insert into graph with properties uint64_t systemID = graph.add_vertex_with_properties(properties); // add entity number to vertex - graph.add_entity_to_vertex(systemID, entityNumber); + graph.add_type_to_vertex(systemID, entityNumber); // map entity and ldbc id to system generated id globalIdLookupMap.insert({{entity, ldbcID}, systemID}); //----------------------------------------------------- @@ -240,19 +240,19 @@ namespace morphstore{ vertexFile.close(); // insert entity-number with string into map - entitiesLookup.insert(std::make_pair(entityNumber, entity)); + vertexTypeLookup.insert(std::make_pair(entityNumber, entity)); ++entityNumber; attributes.clear(); } // graph gets full entity-list here: - graph.setEntityDictionary(entitiesLookup); + graph.setVertexTypeDictionary(vertexTypeLookup); } // function which returns true, if parameter is a entity in ldbc-files bool is_entity(const std::string &entity) { // Todo: replace whole function by by entitiesLookup.find(entity) // iterate through entities-map to look up for paramater - for (auto const &entry : entitiesLookup) { + for (auto const &entry : vertexTypeLookup) { if (entry.second == entity) { return true; } @@ -263,8 +263,9 @@ namespace morphstore{ // function which returns true, if the relation already exist bool exist_relation_name(const std::string &relation) { - // iterate through relations-map to look up for paramater - for (auto const &entry : relationsLookup) { + // Todo: replace whole function by by entitiesLookup.find(relation) + // iterate through edges-map to look up for paramater + for (auto const &entry : edgeTypeLookup) { if (entry.second == relation) { return true; } @@ -280,8 +281,8 @@ namespace morphstore{ for (const auto &v : verticesPaths) { std::cout << "\t" << v << std::endl; } - std::cout << "Relations-Files: " << std::endl; - for (const auto &rel : relationsPaths) { + std::cout << "Edge-Files: " << std::endl; + for (const auto &rel : edgesPaths) { std::cout << "\t" << rel << std::endl; } @@ -290,11 +291,11 @@ namespace morphstore{ // function which clears all intermediates after import void clear_intermediates() { globalIdLookupMap.clear(); - relationsLookup.clear(); - entitiesLookup.clear(); - relationsPaths.clear(); + edgeTypeLookup.clear(); + vertexTypeLookup.clear(); + edgesPaths.clear(); verticesPaths.clear(); - vertexRelationsLookup.clear(); + vertexEdgesLookup.clear(); } // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the entity lookup creation) @@ -302,10 +303,10 @@ namespace morphstore{ uint64_t result = 0; - if (!relationsPaths.empty()) { + if (!edgesPaths.empty()) { // iterate through vector of relation-addresses - for (const auto &file : relationsPaths) { + for (const auto &file : edgesPaths) { // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = getEntityType(file); @@ -437,7 +438,7 @@ namespace morphstore{ vertexFile.close(); // insert entity-number with string into map - entitiesLookup.insert(std::make_pair( entityNumber, entity)); + vertexTypeLookup.insert(std::make_pair( entityNumber, entity)); ++entityNumber; } @@ -447,9 +448,9 @@ namespace morphstore{ // this function reads the relation-files and fills the intermediate: vertexRelationLookup // + creates the relationLookup (number to string) for the graph - void fill_vertexRelationsLookup(Graph& graph){ + void fill_vertexEdgesLookup(Graph& graph){ - if(!relationsPaths.empty()) { + if(!edgesPaths.empty()) { std::cout << "(2/2) Generating LDBC-Edges ..."; std::cout.flush(); @@ -458,7 +459,7 @@ namespace morphstore{ bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) // iterate through vector of vertex-addresses - for (const auto &file : relationsPaths) { + for (const auto &file : edgesPaths) { isRelation = false; @@ -586,15 +587,15 @@ namespace morphstore{ toID = globalIdLookupMap.at({toEntity, row}); // insert relation into vertexRealtionsLookup: - vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); + vertexEdgesLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); }else{ // with properties means: toID is until the next delimiter, and then the value for the property toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // insert relation into vertexRelationsLookup with its edge-property: - vertexRelationsLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber, {propertyKey, value})); + // insert relation into vertexEdgesLookup with its edge-property: + vertexEdgesLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber, {propertyKey, value})); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -609,43 +610,43 @@ namespace morphstore{ // check if the name already exists if(!exist_relation_name(relationName)){ // insert relation-number with string into map - relationsLookup.insert(std::make_pair( relationNumber, relationName)); + edgeTypeLookup.insert(std::make_pair( relationNumber, relationName)); ++relationNumber; } } } // graph gets full relation-list here: - graph.setRelationDictionary(relationsLookup); + graph.setEdgeTypeDictionary(edgeTypeLookup); } } - // function for sorting the vertexRelationsLookup ASC (needed in CSR) + // function for sorting the vertexEdgesLookup ASC (needed in CSR) // sorting for every vertex its vector list with target-ids ASC - void sort_VertexRelationsLookup(){ + void sort_VertexEdgesLookup(){ // sorting the first element of the pair (target-id) - for(auto &rel: vertexRelationsLookup){ + for(auto &rel: vertexEdgesLookup){ std::sort(rel.second.begin(), rel.second.end()); } } - // this function writes the actual data from the intermediate vertexRelationsLookup into the graph + // this function writes the actual data from the intermediate vertexEdgesLookup into the graph void generate_edges(Graph& graph){ // firstly, sorting the intermediates with their target IDs ASC - sort_VertexRelationsLookup(); + sort_VertexEdgesLookup(); uint64_t graphSize = graph.getNumberVertices(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ // add edge data: - graph.add_edges(vertexID, vertexRelationsLookup[vertexID]); + graph.add_edges(vertexID, vertexEdgesLookup[vertexID]); } } // MAIN IMPORT FUNCTION: see steps in comments void import(Graph& graph) { - //std::cout << "Importing LDBC-files into graph ... "; - //std::cout.flush(); + std::cout << "Importing LDBC-files into graph ... "; + std::cout.flush(); // (1) get number vertices and number edges: uint64_t numberVertices = get_total_number_vertices(); @@ -657,8 +658,8 @@ namespace morphstore{ // (3) generate vertices generate_vertices(graph); - // (4) read relations and write to intermediate results - fill_vertexRelationsLookup(graph); + // (4) read edges and write to intermediate results + fill_vertexEdgesLookup(graph); // (5) read intermediates and write edges generate_edges(graph); diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 29713e06..71e32a34 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -37,8 +37,8 @@ namespace morphstore{ protected: // vertex: id, // optional: entity, properties - uint64_t id; - unsigned short int entity; + uint64_t id; + unsigned short int type; std::unordered_map properties; @@ -50,12 +50,12 @@ namespace morphstore{ return id; } - unsigned short getEntity() const { - return entity; + unsigned short getType() const { + return type; } - void setEntity(const unsigned short e) { - Vertex::entity = e; + void setType(const unsigned short type) { + Vertex::type = type; } const std::unordered_map &getProperties() const { @@ -73,16 +73,21 @@ namespace morphstore{ // ----------------- (pure) virtual functions ----------------- + // todo: remove (not a vertex but a graph.h function) virtual void add_edges(const std::vector edges) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void print_neighbors() = 0; + + virtual size_t get_data_size_of_vertex() = 0; + // todo: remove (not a vertex but a graph.h function) virtual uint64_t get_number_edges(){ return 0; }; // for BFS alg.: adj-list + // todo: remove (not a vertex but a graph.h function) virtual std::vector get_neighbors_ids() { // return empty vector: implementation only needed in adj - Vertex return std::vector(); From e490e1544458dcb13572593ff8fc63e3cbb085bd Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 23 Mar 2020 11:32:58 +0100 Subject: [PATCH 095/216] Replace entity with vertexType in ldbc loader --- include/core/storage/graph/ldbc_import.h | 126 +++++++++++------------ 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 4e12c0c3..81d1db8c 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -58,7 +58,7 @@ namespace morphstore{ std::vector edgesPaths; std::map vertexTypeLookup; std::map edgeTypeLookup; - // data structure for lookup local ids with entity to global system id: (entity, ldbc_id) -> global id + // data structure for lookup local ids with vertexType to global system id: (vertexType, ldbc_id) -> global id std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) @@ -130,13 +130,13 @@ namespace morphstore{ } // this function reads the vertices-files and creates vertices in a graph - // + creates the entityLookup (number to string) for the graph + // + creates the vertexTypeLookup (number to string) for the graph void generate_vertices(Graph& graph) { std::cout << "(1/2) Generating LDBC-Vertices ..."; std::cout.flush(); - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; + //this variable is used for the vertexTypeLookup-keys, starting by 0 + unsigned short int vertexTypeNumber = 0; // iterate through vector of vertex-addresses for (const auto &file : verticesPaths) @@ -144,9 +144,7 @@ namespace morphstore{ // data structure for attributes of entity, e.g. taglass -> id, name, url std::vector attributes; - - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = getEntityType(file); + std::string vertexType = getEntityType(file); char *buffer; @@ -224,10 +222,10 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties uint64_t systemID = graph.add_vertex_with_properties(properties); - // add entity number to vertex - graph.add_type_to_vertex(systemID, entityNumber); - // map entity and ldbc id to system generated id - globalIdLookupMap.insert({{entity, ldbcID}, systemID}); + // add vertexType number to vertex + graph.add_type_to_vertex(systemID, vertexTypeNumber); + // map vertexType and ldbc id to system generated id + globalIdLookupMap.insert({{vertexType, ldbcID}, systemID}); //----------------------------------------------------- properties.clear(); // free memory } @@ -239,21 +237,20 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); - // insert entity-number with string into map - vertexTypeLookup.insert(std::make_pair(entityNumber, entity)); - ++entityNumber; + // insert vertexType-number with string into map + vertexTypeLookup.insert(std::make_pair(vertexTypeNumber, vertexType)); + ++vertexTypeNumber; attributes.clear(); } - // graph gets full entity-list here: + // graph gets full vertexType-list here: graph.setVertexTypeDictionary(vertexTypeLookup); } - // function which returns true, if parameter is a entity in ldbc-files - bool is_entity(const std::string &entity) { - // Todo: replace whole function by by entitiesLookup.find(entity) + // function which returns true, if parameter is a vertexType in ldbc-files + bool is_vertexType(const std::string &vertexType) { // iterate through entities-map to look up for paramater for (auto const &entry : vertexTypeLookup) { - if (entry.second == entity) { + if (entry.second == vertexType) { return true; } } @@ -298,7 +295,7 @@ namespace morphstore{ vertexEdgesLookup.clear(); } - // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the entity lookup creation) + // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the vertexType lookup creation) uint64_t get_total_number_edges() { uint64_t result = 0; @@ -307,18 +304,17 @@ namespace morphstore{ // iterate through vector of relation-addresses for (const auto &file : edgesPaths) { - // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = getEntityType(file); // TOdo: use regExp ([a-zA-Z]+)_([a-zA-Z]+)_([a-zA-Z]+) - std::string fromEntity = relation.substr(0, relation.find('_')); + std::string sourceVertexType = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); - std::string relationName = relation.substr(0, relation.find('_')); + std::string edgeType = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); - std::string toEntity = relation; + std::string targetVertexType = relation; char *buffer; @@ -326,28 +322,28 @@ namespace morphstore{ std::string address = getDirectory() + file; - std::ifstream relationFile(address, std::ios::binary | + std::ifstream edgeFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!relationFile) { + if (!edgeFile) { std::cerr << "Error, opening file. "; exit(EXIT_FAILURE); } // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + if (edgeFile.is_open()) { + fileSize = static_cast(edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + edgeFile.clear(); + edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory buffer = (char *) malloc(fileSize * sizeof(char)); - relationFile.read(buffer, fileSize); // read data as one big block + edgeFile.read(buffer, fileSize); // read data as one big block bool firstLine = true; // check from file name whether it's a relation file or multi value attribute file - if (is_entity(toEntity)) { + if (is_vertexType(targetVertexType)) { for (size_t i = 0; i < fileSize; ++i) { if (buffer[i] == '\n') { @@ -363,7 +359,7 @@ namespace morphstore{ } delete[] buffer; // free memory - relationFile.close(); + edgeFile.close(); } } @@ -377,14 +373,12 @@ namespace morphstore{ if (!verticesPaths.empty()) { - //this variable is used for the entityLookup-keys, starting by 0 - unsigned short int entityNumber = 0; + //this variable is used for the vertexTypeLookup-keys, starting by 0 + unsigned short int vertexTypeNumber = 0; // iterate through vector of vertex-addresses for (const auto &file : verticesPaths) { - - // get the entity from address ([...path...] / [entity-name].csv) - std::string entity = getEntityType(file); + std::string vertexType = getEntityType(file); char *buffer; @@ -437,17 +431,17 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); - // insert entity-number with string into map - vertexTypeLookup.insert(std::make_pair( entityNumber, entity)); - ++entityNumber; + // insert vertexType-number with string into map + vertexTypeLookup.insert(std::make_pair( vertexTypeNumber, vertexType)); + ++vertexTypeNumber; } } return result; } - // this function reads the relation-files and fills the intermediate: vertexRelationLookup - // + creates the relationLookup (number to string) for the graph + // this function reads the edge-files and fills the intermediate: vertexEdgeLookup + // + creates the edgeLookup (number to string) for the graph void fill_vertexEdgesLookup(Graph& graph){ if(!edgesPaths.empty()) { @@ -456,22 +450,22 @@ namespace morphstore{ //this variable is used for the relationLookup-keys, starting by 0 unsigned short int relationNumber = 0; - bool isRelation = false; // flag which is used to differentiate for relatoin-lookup-entrys (to avoid e.g. email as relation) + bool isEdge = false; // flag which is used to differentiate for edge-lookup-entrys (to avoid e.g. email as an edge) // iterate through vector of vertex-addresses for (const auto &file : edgesPaths) { - isRelation = false; + isEdge = false; // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment std::string relation = getEntityType(file); - std::string fromEntity = relation.substr(0, relation.find('_')); + std::string sourceVertexType = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); - std::string relationName = relation.substr(0, relation.find('_')); + std::string edgeType = relation.substr(0, relation.find('_')); relation.erase(0, relation.find('_') + 1); - std::string toEntity = relation; + std::string targetVertexType = relation; char* buffer; @@ -479,29 +473,29 @@ namespace morphstore{ std::string address = getDirectory() + file; - std::ifstream relationFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream edgeFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!relationFile) { + if (!edgeFile) { std::cerr << "Error, opening file. "; exit(EXIT_FAILURE); } // calculate file size - if (relationFile.is_open()) { - fileSize = static_cast(relationFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - relationFile.clear(); - relationFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + if (edgeFile.is_open()) { + fileSize = static_cast(edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + edgeFile.clear(); + edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) } // allocate memory buffer = (char*) malloc( fileSize * sizeof( char ) ); - relationFile.read(buffer, fileSize); // read data as one big block + edgeFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; // check from file name whether it's a relation file or multi value attribute file - if(!is_entity(toEntity)){ + if(!is_vertexType(targetVertexType)){ // Multi-value-attributes: just take the last recently one std::string propertyKey; std::unordered_map multiValueAttr; @@ -523,7 +517,7 @@ namespace morphstore{ propertyKey = row.substr(row.find(delimiter) + 1); }else{ // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) - systemID = globalIdLookupMap[{fromEntity, row.substr(0, row.find(delimiter))}]; + systemID = globalIdLookupMap[{sourceVertexType, row.substr(0, row.find(delimiter))}]; value = row.substr(row.find(delimiter) + 1); multiValueAttr[systemID] = std::move(value); } @@ -541,7 +535,7 @@ namespace morphstore{ // handling of relation-files ... else{ - isRelation = true; + isEdge = true; bool hasProperties = false; std::string propertyKey; @@ -578,19 +572,19 @@ namespace morphstore{ }else{ // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property // get the system-(global) id's from local ids - fromID = globalIdLookupMap.at({fromEntity, row.substr(0, row.find(delimiter))}); + fromID = globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); // remove from id from string row.erase(0, row.find(delimiter) + delimiter.length()); std::string value; if(!hasProperties){ // WITHOUT properties: just from the first delimiter on - toID = globalIdLookupMap.at({toEntity, row}); + toID = globalIdLookupMap.at({targetVertexType, row}); // insert relation into vertexRealtionsLookup: vertexEdgesLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); }else{ // with properties means: toID is until the next delimiter, and then the value for the property - toID = globalIdLookupMap.at({toEntity, row.substr(0, row.find(delimiter))}); + toID = globalIdLookupMap.at({targetVertexType, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; @@ -603,14 +597,14 @@ namespace morphstore{ } } delete[] buffer; // free memory - relationFile.close(); + edgeFile.close(); //check if the relation name is a relation (no multi value file) - if(isRelation){ + if(isEdge){ // check if the name already exists - if(!exist_relation_name(relationName)){ + if(!exist_relation_name(edgeType)){ // insert relation-number with string into map - edgeTypeLookup.insert(std::make_pair( relationNumber, relationName)); + edgeTypeLookup.insert(std::make_pair( relationNumber, edgeType)); ++relationNumber; } } @@ -667,7 +661,7 @@ namespace morphstore{ // (6) clear intermediates clear_intermediates(); - //std::cout << "--> done" << std::endl; + std::cout << "--> done" << std::endl; } }; } From faf69cb335670c6db1731b1f7047ada96f55f39c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 24 Mar 2020 12:08:30 +0100 Subject: [PATCH 096/216] Store AdjacencyLists in one map per graph * AdjacencyList handling by graph instead of vertex * Remove format-specific vertex classes * edgeId for allowing multi graphs * nodeId generation now graph-scope (instead of vertex) * "edges" mapping of id->edge * return vertex id after adding vertex * Add a simple test --- include/core/storage/graph/edge/edge.h | 21 +++- .../storage/graph/formats/adjacencylist.h | 116 ++++++++++++++---- include/core/storage/graph/formats/csr.h | 33 +++-- include/core/storage/graph/graph.h | 42 +++++-- .../graph/vertex/adjacencylist_vertex.h | 113 ----------------- .../core/storage/graph/vertex/csr_vertex.h | 74 ----------- include/core/storage/graph/vertex/vertex.h | 37 +++--- .../graph/adjacencylist/CMakeLists.txt | 5 +- .../adjacencylist/simple_adj_graph_test.cpp | 64 ++++++++++ 9 files changed, 244 insertions(+), 261 deletions(-) delete mode 100644 include/core/storage/graph/vertex/adjacencylist_vertex.h delete mode 100644 include/core/storage/graph/vertex/csr_vertex.h create mode 100644 test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index ca23c9f5..a73a6ce5 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -27,6 +27,7 @@ #include #include #include +#include namespace morphstore{ @@ -34,11 +35,16 @@ namespace morphstore{ protected: // Edge characteristics - uint64_t sourceID, targetID; + uint64_t sourceID, targetID, id; unsigned short int type; // todo: allow map instead of pair std::pair property; + uint64_t getNextEdgeId() const { + static uint64_t currentMaxEdgeId = 0; + return currentMaxEdgeId++; + } + public: // Constructors with parameters @@ -46,6 +52,7 @@ namespace morphstore{ setSourceId(from); setTargetId(to); setType(type); + this->id = getNextEdgeId(); } Edge(uint64_t from, uint64_t to, unsigned short int type, std::pair prop){ @@ -53,6 +60,7 @@ namespace morphstore{ setTargetId(to); setType(type); setProperty(prop); + this->id = getNextEdgeId(); } // this is needed for csr when doing edge_array[offset] = edge... @@ -73,6 +81,10 @@ namespace morphstore{ // --------------- Getter and Setter --------------- + uint64_t getId() const { + return id; + } + uint64_t getSourceId() const { return sourceID; } @@ -127,6 +139,13 @@ namespace morphstore{ return size; } + + + // ----------------- DEBUGGING ----------------- + void print_properties() { + std::cout << "{" << getProperty().first << ": " << getProperty().second << "}"; + std::cout << "\n"; + } }; } diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 553cfdaf..36dfaee8 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -18,14 +18,14 @@ /** * @file adjacencylist.h * @brief Derived adj. list storage format class. Base: graph.h - * @todo + * @todo Adjust get_size_of_graph(), ?replace unordered_map with a fixed sized array */ #ifndef MORPHSTORE_ADJACENCYLIST_H #define MORPHSTORE_ADJACENCYLIST_H #include "../graph.h" -#include "../vertex/adjacencylist_vertex.h" +#include "../vertex/vertex.h" #include @@ -33,8 +33,10 @@ namespace morphstore{ class AdjacencyList: public Graph { - public: + private: + std::unordered_map>> adjacencylistPerVertex; + public: storageFormat getStorageFormat() const override { return adjacencylist; } @@ -42,19 +44,23 @@ namespace morphstore{ // function: to set graph allocations void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { vertices.reserve(numberVertices); + adjacencylistPerVertex.reserve(numberVertices); + edges.reserve(numberEdges); + setNumberEdges(numberEdges); setNumberVertices(numberVertices); } // adding a single vertex - void add_vertex() override { - std::shared_ptr v = std::make_shared(); + uint64_t add_vertex() override { + std::shared_ptr v = std::make_shared(getNextVertexId()); vertices[v->getID()] = v; + return v->getID(); } // adding a vertex with its properties uint64_t add_vertex_with_properties(const std::unordered_map props) override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(getNextVertexId()); v->setProperties(props); vertices[v->getID()] = v; return v->getID(); @@ -62,7 +68,7 @@ namespace morphstore{ // function to add a single property to vertex void add_property_to_vertex(uint64_t id, const std::pair property) override { - if (exist_id(id)) { + if (exist_vertexId(id)) { vertices[id]->add_property(property); } else { std::cout << "Vertex with ID " << id << " not found." << std::endl; @@ -71,7 +77,7 @@ namespace morphstore{ // adding type to vertex void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { - if (exist_id(id)) { + if (exist_vertexId(id)) { vertices[id]->setType(type); } else { std::cout << "Vertex with ID " << id << " not found." << std::endl; @@ -79,38 +85,56 @@ namespace morphstore{ } // adding a single edge to vertex: - void add_edge(uint64_t from, uint64_t to, unsigned short int type) override { - if (exist_id(from) && exist_id(to)) { - vertices[from]->add_edge(from, to, type); - } else { - std::cout << "Source-/Target-Vertex-ID does not exist in the database!" << std::endl; - } + void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { + Edge e = Edge(sourceId, targetId, type); + add_edges(sourceId, {e}); } // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(uint64_t sourceID, const std::vector relations) override { - if (exist_id(sourceID)) { - if (relations.size() != 0) { - vertices[sourceID]->add_edges(relations); + void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { + if (exist_vertexId(sourceId)) { + std::shared_ptr> adjacencyList; + if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { + adjacencyList = adjacencylistPerVertex[sourceId]; + } else { + adjacencyList = std::make_shared>(); + adjacencylistPerVertex[sourceId] = adjacencyList; + } + + for(const auto& edge : edgesToAdd) { + edges[edge.getId()] = std::make_shared(edge); + if(exist_vertexId(edge.getTargetId())) { + adjacencyList->push_back(edge.getId()); + } + else { + std::cout << "Target-Vertex with ID " << edge.getTargetId() << " not found." << std::endl; + } } } else { - std::cout << "Vertex with ID " << sourceID << " not found." << std::endl; + std::cout << "Source-Vertex with ID " << sourceId << " not found." << std::endl; } } - // for debugging: print neighbors a vertex - void print_neighbors_of_vertex(uint64_t id) override{ - vertices[id]->print_neighbors(); - } // get number of neighbors of vertex with id uint64_t get_degree(uint64_t id) override { - return vertices[id]->get_number_edges(); + if (adjacencylistPerVertex.find(id) == adjacencylistPerVertex.end()) { + return 0; + } + else { + return adjacencylistPerVertex[id]->size(); + } } // get the neighbors-ids into vector for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { - return vertices.at(id)->get_neighbors_ids(); + std::vector targetVertexIds = std::vector(); + + for(auto const edgeId: *adjacencylistPerVertex[id]) { + targetVertexIds.push_back(edges[edgeId]->getTargetId()); + } + + return targetVertexIds; } // for measuring the size in bytes: @@ -131,19 +155,57 @@ namespace morphstore{ } // container for indexes: - index_size += sizeof(std::unordered_map>); + index_size += sizeof(std::unordered_map>); for(auto& it : vertices){ // index size of vertex: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); // data size: data_size += it.second->get_data_size_of_vertex(); } + index_size += sizeof(std::unordered_map>); + for(auto& it : edges){ + // index size of vertex: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += it.second->size_in_bytes(); + } + + // adjacencyListPerVertex + for(auto& it : adjacencylistPerVertex){ + // data size: + data_size += sizeof(it); + } + index_data_size = {index_size, data_size}; return index_data_size; } + // for debugging: print neighbors a vertex + void print_neighbors_of_vertex(uint64_t id) override{ + std::cout << "Neighbours for Vertex with id " << id << std::endl; + if(adjacencylistPerVertex.find(id) == adjacencylistPerVertex.end()) { + std::cout << " No outgoing edges for vertex with id: " << id << std::endl; + } + else { + for (const auto edgeId : *adjacencylistPerVertex[id]) { + auto edge = edges[edgeId]; + std::cout << " Edge-ID: " << edge->getId() + << " Source-ID: " << edge->getSourceId() + << " Target-ID: " << edge->getTargetId() + << " Property: { " << edge->getProperty().first << ": " << edge->getProperty().second << " }" + << std::endl; + } + } + } + + void statistics() override { + Graph::statistics(); + std::cout << "Number of adjacency lists:" << adjacencylistPerVertex.size() << std::endl; + std::cout << std::endl << std::endl; + } + }; } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 2d142db8..c8138799 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -18,14 +18,14 @@ /** * @file csr.h * @brief Derived CSR storage format class. Base: graph.h - * @todo + * @todo Edge_value_array should only store edge-ids (not whole objects) */ #ifndef MORPHSTORE_CSR_H #define MORPHSTORE_CSR_H #include "../graph.h" -#include "../vertex/csr_vertex.h" +#include "../vertex/vertex.h" namespace morphstore{ @@ -63,34 +63,33 @@ namespace morphstore{ } // adding a single vertex (without any properties, etc...) - void add_vertex() override { - std::shared_ptr v = std::make_shared(); + uint64_t add_vertex() override { + std::shared_ptr v = std::make_shared(getNextVertexId()); vertices[v->getID()] = v; + return v->getID(); } // adding a vertex with its properties uint64_t add_vertex_with_properties(const std::unordered_map props ) override { - std::shared_ptr v = std::make_shared(); + std::shared_ptr v = std::make_shared(getNextVertexId()); v->setProperties(props); vertices[v->getID()] = v; return v->getID(); } // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy - void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { - if(exist_id(from) && exist_id(to)){ - std::cout << rel << std::endl; - } + void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { + std::cerr << "Singe edge addition not yet implemented for CSR" << sourceId << targetId << type; } // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of its neighbors - void add_edges(uint64_t sourceID, const std::vector relations) override { + void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { uint64_t offset = node_array[sourceID]; - uint64_t nextOffset = offset + relations.size(); + uint64_t nextOffset = offset + edgesToAdd.size(); // fill the arrays - for(const auto & edge : relations){ + for(const auto & edge : edgesToAdd){ edge_value_array[offset] = edge; edge_array[offset] = edge.getTargetId(); ++offset; @@ -104,7 +103,7 @@ namespace morphstore{ // function to add a single property to vertex void add_property_to_vertex(uint64_t id, const std::pair property) override { - if(exist_id(id)){ + if(exist_vertexId(id)){ vertices[id]->add_property(property); }else{ std::cout << "Vertex with ID " << id << " not found./property_to_vertex" << std::endl; @@ -113,7 +112,7 @@ namespace morphstore{ // adding type to vertex void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { - if(exist_id(id)){ + if(exist_vertexId(id)){ vertices[id]->setType(type); }else{ std::cout << "Vertex with ID " << id << " not found./type_to_vertex." << std::endl; @@ -165,7 +164,7 @@ namespace morphstore{ std::pair index_data_size; size_t data_size = 0; size_t index_size = 0; - + // TODO: use Graph::get_size_of_graph() for vertices, edges, vertexTypeDictionary and edgeTypeDictionary // lookup dicts: entity dict + relation dict. index_size += 2 * sizeof(std::map); for(auto& ent : vertexTypeDictionary){ @@ -178,9 +177,9 @@ namespace morphstore{ } // container for indexes: - index_size += sizeof(std::unordered_map>); + index_size += sizeof(std::unordered_map>); for(auto& it : vertices){ - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); data_size += it.second->get_data_size_of_vertex(); } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 832c2668..fe849a1d 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -48,11 +48,16 @@ namespace morphstore{ // Data-structure for Vertex-Properties std::unordered_map> vertices; + std::unordered_map> edges; // Lookup for types: number to string std::map vertexTypeDictionary; std::map edgeTypeDictionary; + uint64_t getNextVertexId() const { + static uint64_t currentMaxVertexId = 0; + return currentMaxVertexId++; + } public: enum storageFormat {csr, adjacencylist }; @@ -76,7 +81,7 @@ namespace morphstore{ } uint64_t getNumberVertices() const { - return numberVertices; + return vertices.size(); } void setNumberVertices(uint64_t numV) { @@ -84,7 +89,7 @@ namespace morphstore{ } uint64_t getNumberEdges() const { - return numberEdges; + return edges.size(); } void setNumberEdges(uint64_t numE) { @@ -95,7 +100,7 @@ namespace morphstore{ if(vertexTypeDictionary.find( type ) != vertexTypeDictionary.end()){ return vertexTypeDictionary.at(type); }else{ - return "No Matching of type-number in the database!"; + return "No Matching of type-number in the database! For type " + std::to_string(type); } } @@ -103,18 +108,27 @@ namespace morphstore{ if(edgeTypeDictionary.find( type ) != edgeTypeDictionary.end()){ return edgeTypeDictionary.at(type); }else{ - return "No Matching of relation-number in the database!"; + print_type_dicts(); + return std::to_string(type) + " not found in edge-type dictionary"; } } // function to check if the vertex-ID is present or not (exists) - bool exist_id(const uint64_t id){ + bool exist_vertexId(const uint64_t id){ if(vertices.find(id) == vertices.end()){ return false; } return true; } + // function to check if the edge-ID is present or not (exists) + bool exist_edgeId(const uint64_t id){ + if(edges.find(id) == edges.end()){ + return false; + } + return true; + } + // function which returns a pointer to vertex by id std::shared_ptr get_vertex_by_id(uint64_t id){ return vertices[id]; @@ -164,7 +178,7 @@ namespace morphstore{ virtual storageFormat getStorageFormat() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; - virtual void add_vertex() = 0; + virtual uint64_t add_vertex() = 0; virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; virtual void add_type_to_vertex(const uint64_t id, const unsigned short int type) = 0; @@ -179,7 +193,7 @@ namespace morphstore{ // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; - void statistics(){ + virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getNumberVertices() << std::endl; std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; @@ -199,6 +213,20 @@ namespace morphstore{ std::cout << "-----------------------------------------------" << std::endl; } + void print_edge_by_id(uint64_t id) { + std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; + std::shared_ptr edge = edges[id]; + std::cout << "Edge-ID: \t" << edge->getId() << std::endl; + std::cout << "Source-ID: \t" << edge->getSourceId() << std::endl; + std::cout << "Target-ID: \t" << edge->getTargetId() << std::endl; + std::cout << "Type: \t" << get_edgeType_by_number(edge->getType()) << std::endl; + std::cout << "\n"; + std::cout << "Properties: "; + edge->print_properties(); + std::cout << "\n"; + std::cout << "-----------------------------------------------" << std::endl; + } + void print_type_dicts(){ std::cout << "VertexType-Dict: " << std::endl; for(auto const& entry : vertexTypeDictionary){ diff --git a/include/core/storage/graph/vertex/adjacencylist_vertex.h b/include/core/storage/graph/vertex/adjacencylist_vertex.h deleted file mode 100644 index 3fa1fd83..00000000 --- a/include/core/storage/graph/vertex/adjacencylist_vertex.h +++ /dev/null @@ -1,113 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file adjacencylistvertex.h - * @brief Derived vertex calss for adj. list storage format: base-class: vertex - * @todo -*/ - -#ifndef MORPHSTORE_AVERTEX_H -#define MORPHSTORE_AVERTEX_H - -#include "../edge/edge.h" - -namespace morphstore{ - - class AdjacencyListVertex: public Vertex{ - - protected: - std::vector adjacencylist; - // additional adjacency list that only contains the target ids -> for bfs measurements - std::vector adjacencylistBFS; - - public: - // constructor with unique id generation - AdjacencyListVertex(){ - // unique ID generation - static uint64_t startID = 0; - id = startID++; - } - - // returns a reference (read-only) of the adjacency list - const std::vector& get_adjList() const{ - return adjacencylist; - } - - // function to add a single edge to vertexs adjlist - void add_edge(uint64_t from, uint64_t to, unsigned short int rel) override { - this->adjacencylist.push_back(Edge(from, to, rel)); - } - - // add edges to vertexs' adjacencylist - void add_edges(const std::vector edges) override { - this->adjacencylist = edges; - - // for the additional adjacency list: transformation - for(auto edge : edges){ - adjacencylistBFS.push_back(edge.getTargetId()); - } - } - - // function which returns the number of edges - uint64_t get_number_edges() override { - return adjacencylist.size(); - } - - // debugging: - void print_neighbors() override { - for(const auto& edge : adjacencylist){ - std::cout << "Source-ID: " << edge.getSourceId() << " - Target-ID: " << edge.getTargetId() << - " - Property: { " << edge.getProperty().first << ": " << edge.getProperty().second << " }" << " || "; - } - } - - // function to return a vector of neighbor ids (for BFS) - std::vector get_neighbors_ids() override { - /* old approach - std::vector neighbors; - for(auto const& edge : adjacencylist){ - neighbors.push_back(edge.getTargetId()); - } - return neighbors; - */ - return adjacencylistBFS; - } - - // get size of vertex in bytes: - size_t get_data_size_of_vertex() override { - size_t size = 0; - size += sizeof(uint64_t); // id - size += sizeof(unsigned short int); // entity - // properties: - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ - size += sizeof(char)*(property->first.length() + property->second.length()); - } - - // Adj.List: - size += sizeof(std::vector); - for(const auto& e : adjacencylist){ - size += e.size_in_bytes(); - } - return size; - } - - }; -} - -#endif //MORPHSTORE_AVERTEX_H diff --git a/include/core/storage/graph/vertex/csr_vertex.h b/include/core/storage/graph/vertex/csr_vertex.h deleted file mode 100644 index 09de9b74..00000000 --- a/include/core/storage/graph/vertex/csr_vertex.h +++ /dev/null @@ -1,74 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file cvertex.h - * @brief Derived vertex calss for CSR storage format - * @todo -*/ - -#ifndef MORPHSTORE_CVERTEX_H -#define MORPHSTORE_CVERTEX_H - -namespace morphstore{ - - class CSRVertex: public Vertex{ - - public: - // constructor with unique id generation - CSRVertex(){ - // unique ID generation - static uint64_t startID = 0; - id = startID++; - } - - // this function has no usage here: the adding of edges happens in the graph file -> csr.h - // it's just here because it's a pure function in Vertex.h - void add_edge(uint64_t from, uint64_t to,unsigned short int rel) override { - std::cout << " virtual add_edge - no usage: " << from << ", " << to << ", " << rel << std::endl; - } - - // pure function -> no functionality - void add_edges(const std::vector edges) override { - std::cout << " virtual add_edge - no usage: " << edges[0].getSourceId() << std::endl; - } - - // debugging - void print_neighbors() override { - std::cout << " virtual print_neighbors - no usage: " << std::endl; - } - - // get size of csr vertex in bytes: - size_t get_data_size_of_vertex() override { - size_t size = 0; - // properties: - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ - size += sizeof(char)*(property->first.length() + property->second.length()); - } - // entity: - size += sizeof(unsigned short int); - // id - size += sizeof(uint64_t); - - return size; - } - - }; -} - -#endif //MORPHSTORE_CVERTEX_H diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 71e32a34..f209f8c1 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -36,8 +36,8 @@ namespace morphstore{ protected: // vertex: id, - // optional: entity, properties uint64_t id; + // optional: type, properties unsigned short int type; std::unordered_map properties; @@ -46,6 +46,10 @@ namespace morphstore{ // ----------------- Setter & Getter ----------------- + Vertex(uint64_t id){ + this->id = id; + } + uint64_t getID(){ return id; } @@ -71,29 +75,20 @@ namespace morphstore{ this->properties[property.first] = property.second;//std::move(property.second); } + // get size of vertex in bytes: + size_t get_data_size_of_vertex() { + size_t size = 0; + size += sizeof(uint64_t); // id + size += sizeof(unsigned short int); // entity + // properties: + size += sizeof(std::unordered_map); + for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ + size += sizeof(char)*(property->first.length() + property->second.length()); + } - // ----------------- (pure) virtual functions ----------------- - // todo: remove (not a vertex but a graph.h function) - virtual void add_edges(const std::vector edges) = 0; - virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void print_neighbors() = 0; - - - virtual size_t get_data_size_of_vertex() = 0; - - // todo: remove (not a vertex but a graph.h function) - virtual uint64_t get_number_edges(){ - return 0; - }; - - // for BFS alg.: adj-list - // todo: remove (not a vertex but a graph.h function) - virtual std::vector get_neighbors_ids() { - // return empty vector: implementation only needed in adj - Vertex - return std::vector(); + return size; } - // ----------------- DEBUGGING ----------------- void print_properties() { for (const auto entry : properties) { diff --git a/test/core/storage/graph/adjacencylist/CMakeLists.txt b/test/core/storage/graph/adjacencylist/CMakeLists.txt index bb2e4d94..cd5dc231 100644 --- a/test/core/storage/graph/adjacencylist/CMakeLists.txt +++ b/test/core/storage/graph/adjacencylist/CMakeLists.txt @@ -1,6 +1,8 @@ if ( CTEST_ALL OR CTEST_STORAGE ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist_test_app ) - + FILE( REMOVE ${CMAKE_BINARY_DIR}test/core/storage/graph/adjacencylist/simple_adj_graph_test_app ) + + add_executable( simple_adj_graph_test_app simple_adj_graph_test.cpp) add_executable( ldbc_graph_adjacencylist_test_app ldbc_graph_adjacencylist.cpp) target_compile_options( ldbc_graph_adjacencylist_test_app PRIVATE -Werror @@ -11,5 +13,6 @@ if ( CTEST_ALL OR CTEST_STORAGE ) $<$:-DDEBUG> ) target_link_libraries( ldbc_graph_adjacencylist_test_app PRIVATE "-ldl" stdc++fs) + add_test( simple_adj_graph_test simple_adj_graph_test_app ) add_test( ldbc_graph_adjacency_test ldbc_graph_adjacencylist_test_app ) endif() \ No newline at end of file diff --git a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp new file mode 100644 index 00000000..988ebc12 --- /dev/null +++ b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp @@ -0,0 +1,64 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file simple_graph_test_adj.cpp + * @brief Test for generating simple graph in adj. list format (+ BFS measurements) + * @todo + */ + +#include +#include +//#include +//#include // for high_resolution_clock + +int main( void ){ + + // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; + + // Graph init: + std::unique_ptr g1 = std::make_unique(); + + // generate vertices & edges from LDBC files and insert into graph structure + uint64_t v1 = g1->add_vertex_with_properties({{"age", "12"}}); + uint64_t v2 = g1->add_vertex(); + uint64_t v3 = g1->add_vertex(); + + std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; + std::map vertexTypeMap = {{0, "Person"}}; + g1->setEdgeTypeDictionary(edgeTypeMap); + g1->setVertexTypeDictionary(vertexTypeMap); + + g1->add_edge(v1, v2, 1); + g1->add_edge(v2, v3, 1); + g1->add_edge(v2, v3, 2); + + + // (DEBUG) + g1->statistics(); + g1->print_edge_by_id(1); + g1->print_neighbors_of_vertex(v1); + g1->print_neighbors_of_vertex(v2); + g1->print_neighbors_of_vertex(v3); + + return 0; +} From b06832eaba8bb74e2274a8372436291fc008b027 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 24 Mar 2020 16:01:18 +0100 Subject: [PATCH 097/216] Differentiate between expected and actual vertex/edge count --- include/core/operators/graph/top_down_bfs.h | 2 +- .../storage/graph/formats/adjacencylist.h | 4 +-- include/core/storage/graph/formats/csr.h | 14 +++++----- include/core/storage/graph/graph.h | 28 +++++++++---------- include/core/storage/graph/ldbc_import.h | 6 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/core/operators/graph/top_down_bfs.h b/include/core/operators/graph/top_down_bfs.h index f6ba23b4..6529edba 100644 --- a/include/core/operators/graph/top_down_bfs.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -40,7 +40,7 @@ namespace morphstore{ // constructor with smart pointer to graph as parameter/reference BFS(std::unique_ptr& g) : graph(std::move(g)){ - graphSize = graph->getNumberVertices(); + graphSize = graph->getVertexCount(); } uint64_t get_graph_size(){ diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 36dfaee8..aac38cf4 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -47,8 +47,8 @@ namespace morphstore{ adjacencylistPerVertex.reserve(numberVertices); edges.reserve(numberEdges); - setNumberEdges(numberEdges); - setNumberVertices(numberVertices); + this->expectedEdgeCount = numberEdges; + this->expectedVertexCount = numberVertices; } // adding a single vertex diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index c8138799..d4783028 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -49,8 +49,8 @@ namespace morphstore{ // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { - setNumberVertices(numberVertices); - setNumberEdges(numberEdges); + this->expectedVertexCount = numberVertices; + this->expectedEdgeCount = numberEdges; vertices.reserve(numberVertices); @@ -79,7 +79,7 @@ namespace morphstore{ // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { - std::cerr << "Singe edge addition not yet implemented for CSR" << sourceId << targetId << type; + std::cout << "Singe edge addition not yet implemented for CSR" << sourceId << targetId << type; } // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC @@ -96,7 +96,7 @@ namespace morphstore{ } // to avoid buffer overflow: - if(sourceID < getNumberVertices()-1){ + if(sourceID < getExpectedVertexCount()-1){ node_array[sourceID+1] = nextOffset; } } @@ -152,8 +152,8 @@ namespace morphstore{ uint64_t numberEdges = get_degree(id); // avoiding out of bounds ... - if( offset < getNumberEdges()){ - neighbors.insert(neighbors.end(), edge_array+offset, edge_array+offset+numberEdges); + if( offset < getExpectedEdgeCount()){ + neighbors.insert(neighbors.end(), edgeId_array+offset, edgeId_array+offset+numberEdges); } return neighbors; @@ -186,7 +186,7 @@ namespace morphstore{ // pointer to arrays: index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); // edges array values: - for(uint64_t i = 0; i < getNumberEdges(); i++){ + for(uint64_t i = 0; i < getExpectedEdgeCount(); i++){ index_size += sizeof(uint64_t); // node_array with offsets data_size += edge_value_array[i].size_in_bytes(); // edge value array with object } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index fe849a1d..d9b685dd 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -43,8 +43,8 @@ namespace morphstore{ class Graph{ protected: - uint64_t numberVertices; - uint64_t numberEdges; + uint64_t expectedVertexCount; + uint64_t expectedEdgeCount; // Data-structure for Vertex-Properties std::unordered_map> vertices; @@ -80,20 +80,20 @@ namespace morphstore{ this->edgeTypeDictionary = rel; } - uint64_t getNumberVertices() const { - return vertices.size(); + uint64_t getExpectedVertexCount() const { + return expectedVertexCount; } - void setNumberVertices(uint64_t numV) { - Graph::numberVertices = numV; + uint64_t getVertexCount() const { + return vertices.size(); } - uint64_t getNumberEdges() const { - return edges.size(); + uint64_t getExpectedEdgeCount() const { + return expectedEdgeCount; } - void setNumberEdges(uint64_t numE) { - Graph::numberEdges = numE; + uint64_t getEdgeCount() const { + return edges.size(); } std::string get_vertexType_by_number(unsigned short int type){ @@ -137,9 +137,9 @@ namespace morphstore{ // function to return a list of pair < vertex id, degree > DESC: std::vector> get_list_of_degree_DESC(){ std::vector> vertexDegreeList; - vertexDegreeList.reserve(numberVertices); + vertexDegreeList.reserve(expectedVertexCount); // fill the vector with every vertex key and his degree - for(uint64_t i = 0; i < numberVertices; ++i){ + for(uint64_t i = 0; i < expectedVertexCount; ++i){ vertexDegreeList.push_back({i, this->get_degree(i)}); } // sort the vector on degree DESC @@ -195,8 +195,8 @@ namespace morphstore{ virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << getNumberVertices() << std::endl; - std::cout << "Number of relations/edges: " << getNumberEdges() << std::endl; + std::cout << "Number of vertices: " << vertices.size() << std::endl; + std::cout << "Number of relations/edges: " << edges.size() << std::endl; std::cout << "--------------------------------------------" << std::endl; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 81d1db8c..7760fbb7 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -105,9 +105,8 @@ namespace morphstore{ } } - print_file_names(); - if(verticesPaths.empty()) { + print_file_names(); throw std::invalid_argument("No vertex files found"); } } @@ -626,10 +625,11 @@ namespace morphstore{ // this function writes the actual data from the intermediate vertexEdgesLookup into the graph void generate_edges(Graph& graph){ + std::cout << " Writing edges into graph " << std::endl; // firstly, sorting the intermediates with their target IDs ASC sort_VertexEdgesLookup(); - uint64_t graphSize = graph.getNumberVertices(); + uint64_t graphSize = graph.getVertexCount(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ // add edge data: From 711a7ec00e4339dd412215006b5b9f39e4e8aa5e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 24 Mar 2020 17:02:22 +0100 Subject: [PATCH 098/216] Remove edgeValue_array from csr and allow multiple properties for edges * as well as some renamings * adding a simple graph test for csr * real asserts in simple tests --- include/core/storage/graph/edge/edge.h | 67 ++++++----------- .../storage/graph/formats/adjacencylist.h | 12 +-- include/core/storage/graph/formats/csr.h | 75 +++++++++++-------- include/core/storage/graph/graph.h | 19 +++-- include/core/storage/graph/ldbc_import.h | 21 +++--- .../graph/adjacencylist/CMakeLists.txt | 2 +- .../adjacencylist/simple_adj_graph_test.cpp | 26 +++---- test/core/storage/graph/csr/CMakeLists.txt | 5 +- .../graph/csr/simple_csr_graph_test.cpp | 69 +++++++++++++++++ 9 files changed, 184 insertions(+), 112 deletions(-) create mode 100644 test/core/storage/graph/csr/simple_csr_graph_test.cpp diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index a73a6ce5..1ad1748e 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -28,7 +28,7 @@ #include #include #include - +#include namespace morphstore{ class Edge{ @@ -37,8 +37,8 @@ namespace morphstore{ // Edge characteristics uint64_t sourceID, targetID, id; unsigned short int type; - // todo: allow map instead of pair - std::pair property; + + std::unordered_map properties; uint64_t getNextEdgeId() const { static uint64_t currentMaxEdgeId = 0; @@ -46,20 +46,11 @@ namespace morphstore{ } public: - - // Constructors with parameters - Edge(uint64_t from, uint64_t to, unsigned short int type){ - setSourceId(from); - setTargetId(to); - setType(type); - this->id = getNextEdgeId(); - } - - Edge(uint64_t from, uint64_t to, unsigned short int type, std::pair prop){ - setSourceId(from); - setTargetId(to); - setType(type); - setProperty(prop); + Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type, const std::unordered_map properties = {}){ + this->sourceID = sourceId; + this->targetID = targetId; + this->type = type; + this->properties = properties; this->id = getNextEdgeId(); } @@ -70,10 +61,10 @@ namespace morphstore{ return *this; // do the copy - setSourceId(edge.sourceID); - setTargetId(edge.targetID); - setType(edge.type); - setProperty(edge.property); + this->sourceID = edge.sourceID; + this->targetID = edge.targetID; + this->type = edge.type; + this->properties = edge.properties; // return the existing object so we can chain this operator return *this; @@ -89,35 +80,22 @@ namespace morphstore{ return sourceID; } - void setSourceId(uint64_t sourceId) { - sourceID = sourceId; - } - uint64_t getTargetId() const { return targetID; } - void setTargetId(uint64_t targetId) { - targetID = targetId; - } - unsigned short getType() const { return type; } - void setType(unsigned short type) { - Edge::type = type; - } - - const std::pair &getProperty() const { - return property; + const std::unordered_map &getProperties() const { + return properties; } void setProperty(const std::pair prop) { // first check if there is any key value data, otherwise problems with segfaults if(prop.first != "" && prop.second != ""){ - Edge::property.first = prop.first; - Edge::property.second = prop.second; + properties[prop.first] = prop.second; } } @@ -133,18 +111,21 @@ namespace morphstore{ size += sizeof(uint64_t) * 2; // source- and target-id size += sizeof(unsigned short int); // relation - // property: - size += sizeof(std::pair); - size += sizeof(char)*(property.first.length() + property.second.length()); - + // properties: + size += sizeof(std::unordered_map); + for(auto property = properties.begin(); property != properties.end(); ++property){ + size += sizeof(char)*(property->first.length() + property->second.length()); + } return size; } // ----------------- DEBUGGING ----------------- void print_properties() { - std::cout << "{" << getProperty().first << ": " << getProperty().second << "}"; - std::cout << "\n"; + std::cout << std::endl; + for (const auto entry : properties) { + std::cout << " {" << entry.first << ": " << entry.second << "}" << std::endl; + } } }; } diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index aac38cf4..12b2ea0e 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -59,7 +59,7 @@ namespace morphstore{ } // adding a vertex with its properties - uint64_t add_vertex_with_properties(const std::unordered_map props) override { + uint64_t add_vertex(const std::unordered_map props) override { std::shared_ptr v = std::make_shared(getNextVertexId()); v->setProperties(props); vertices[v->getID()] = v; @@ -117,7 +117,7 @@ namespace morphstore{ // get number of neighbors of vertex with id - uint64_t get_degree(uint64_t id) override { + uint64_t get_out_degree(uint64_t id) override { if (adjacencylistPerVertex.find(id) == adjacencylistPerVertex.end()) { return 0; } @@ -165,7 +165,7 @@ namespace morphstore{ index_size += sizeof(std::unordered_map>); for(auto& it : edges){ - // index size of vertex: size of id and sizeof pointer + // index size of edge: size of id and sizeof pointer index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); // data size: data_size += it.second->size_in_bytes(); @@ -192,10 +192,12 @@ namespace morphstore{ for (const auto edgeId : *adjacencylistPerVertex[id]) { auto edge = edges[edgeId]; std::cout << " Edge-ID: " << edge->getId() + << " Type: " << get_edgeType_by_number(edge->getType()) << " Source-ID: " << edge->getSourceId() << " Target-ID: " << edge->getTargetId() - << " Property: { " << edge->getProperty().first << ": " << edge->getProperty().second << " }" - << std::endl; + << " Property: { "; + edge->print_properties(); + std::cout << std::endl << " }" << std::endl; } } } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index d4783028..76db1e82 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -33,13 +33,11 @@ namespace morphstore{ private: /* graph topology: - * node array: index is vertex-id; array cell contains offset in edge_array - * edge array: contains target id of the edge (TODO: should contain edge-id) - * edge value array: contains edge object with addtional information (same index with edge array) + * offset array: index is vertex-id; array cell contains offset in edgeId array + * edgeId array: contains edge id */ - uint64_t* node_array = nullptr; - uint64_t* edge_array = nullptr; - Edge* edge_value_array = nullptr; + uint64_t* offset_array = nullptr; + uint64_t* edgeId_array = nullptr; public: @@ -53,13 +51,13 @@ namespace morphstore{ this->expectedEdgeCount = numberEdges; vertices.reserve(numberVertices); + edges.reserve(numberEdges); - node_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); - edge_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); - edge_value_array = (Edge*) malloc(numberEdges * sizeof(Edge)); + offset_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); + edgeId_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); // init node array: - node_array[0] = 0; + offset_array[0] = 0; } // adding a single vertex (without any properties, etc...) @@ -70,7 +68,7 @@ namespace morphstore{ } // adding a vertex with its properties - uint64_t add_vertex_with_properties(const std::unordered_map props ) override { + uint64_t add_vertex(const std::unordered_map props ) override { std::shared_ptr v = std::make_shared(getNextVertexId()); v->setProperties(props); vertices[v->getID()] = v; @@ -85,19 +83,20 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { - uint64_t offset = node_array[sourceID]; + uint64_t offset = offset_array[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); // fill the arrays - for(const auto & edge : edgesToAdd){ - edge_value_array[offset] = edge; - edge_array[offset] = edge.getTargetId(); + for(const auto& edge : edgesToAdd){ + std::shared_ptr ePtr = std::make_shared(edge); + edges[ePtr->getId()] = ePtr; + edgeId_array[offset] = ePtr->getId(); ++offset; } // to avoid buffer overflow: if(sourceID < getExpectedVertexCount()-1){ - node_array[sourceID+1] = nextOffset; + offset_array[sourceID+1] = nextOffset; } } @@ -120,36 +119,43 @@ namespace morphstore{ } // get number of edges of vertex with id - uint64_t get_degree(uint64_t id) override { - uint64_t offset = node_array[id]; + uint64_t get_out_degree(uint64_t id) override { + uint64_t offset = offset_array[id]; // special case: last vertex id has no next offset uint64_t nextOffset; - if(id == getNumberVertices() -1){ - nextOffset = getNumberEdges(); + if(id == getExpectedVertexCount() -1){ + nextOffset = getExpectedEdgeCount(); }else{ - nextOffset = node_array[id+1]; + nextOffset = offset_array[id+1]; } if(offset == nextOffset) return 0; - uint64_t numberEdges = nextOffset - offset; - return numberEdges; + uint64_t degree = nextOffset - offset; + return degree; } // for debugging: void print_neighbors_of_vertex(uint64_t id) override{ - uint64_t offset = node_array[id]; - uint64_t numberEdges = get_degree(id); + std::cout << "Neighbours for Vertex with id " << id << std::endl; + uint64_t offset = offset_array[id]; + uint64_t numberEdges = get_out_degree(id); for(uint64_t i = offset; i < offset+numberEdges; ++i){ - std::cout << "Source-ID: " << edge_value_array[i].getSourceId() << " - Target-ID: " << edge_value_array[i].getTargetId() << " - Property: { " << edge_value_array[i].getProperty().first << ": " << edge_value_array[i].getProperty().second << " }" << " || "; + uint64_t edgeId = edgeId_array[i]; + std::cout << "Source-ID: " << edges[edgeId]->getSourceId() + << " - Target-ID: " << edges[edgeId]->getTargetId() + << " Property: { "; + edges[i]->print_properties(); + std::cout << std::endl + << " }" << std::endl; } } // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { std::vector neighbors; - uint64_t offset = node_array[id]; - uint64_t numberEdges = get_degree(id); + uint64_t offset = offset_array[id]; + uint64_t numberEdges = get_out_degree(id); // avoiding out of bounds ... if( offset < getExpectedEdgeCount()){ @@ -165,6 +171,7 @@ namespace morphstore{ size_t data_size = 0; size_t index_size = 0; // TODO: use Graph::get_size_of_graph() for vertices, edges, vertexTypeDictionary and edgeTypeDictionary + // lookup dicts: entity dict + relation dict. index_size += 2 * sizeof(std::map); for(auto& ent : vertexTypeDictionary){ @@ -176,19 +183,27 @@ namespace morphstore{ index_size += sizeof(char)*(rel.second.length()); } - // container for indexes: + // container for vertices: index_size += sizeof(std::unordered_map>); for(auto& it : vertices){ index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); data_size += it.second->get_data_size_of_vertex(); } + + // container for edges: + index_size += sizeof(std::unordered_map>); + for(auto& it : edges){ + // index size of edge: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += it.second->size_in_bytes(); + } // pointer to arrays: index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); // edges array values: for(uint64_t i = 0; i < getExpectedEdgeCount(); i++){ index_size += sizeof(uint64_t); // node_array with offsets - data_size += edge_value_array[i].size_in_bytes(); // edge value array with object } index_data_size = {index_size, data_size}; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index d9b685dd..bec80491 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -130,17 +130,22 @@ namespace morphstore{ } // function which returns a pointer to vertex by id - std::shared_ptr get_vertex_by_id(uint64_t id){ + std::shared_ptr get_vertex(uint64_t id){ return vertices[id]; } + // function which returns a pointer to vertex by id + std::shared_ptr get_edge(uint64_t id){ + return edges[id]; + } + // function to return a list of pair < vertex id, degree > DESC: std::vector> get_list_of_degree_DESC(){ std::vector> vertexDegreeList; vertexDegreeList.reserve(expectedVertexCount); // fill the vector with every vertex key and his degree for(uint64_t i = 0; i < expectedVertexCount; ++i){ - vertexDegreeList.push_back({i, this->get_degree(i)}); + vertexDegreeList.push_back({i, this->get_out_degree(i)}); } // sort the vector on degree DESC std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { @@ -179,12 +184,12 @@ namespace morphstore{ virtual storageFormat getStorageFormat() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; virtual uint64_t add_vertex() = 0; - virtual uint64_t add_vertex_with_properties(const std::unordered_map props ) = 0; + virtual uint64_t add_vertex(const std::unordered_map props ) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; virtual void add_type_to_vertex(const uint64_t id, const unsigned short int type) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; - virtual uint64_t get_degree(uint64_t id) = 0; + virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; virtual std::pair get_size_of_graph() = 0; @@ -195,8 +200,8 @@ namespace morphstore{ virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; - std::cout << "Number of vertices: " << vertices.size() << std::endl; - std::cout << "Number of relations/edges: " << edges.size() << std::endl; + std::cout << "Number of vertices: " << getVertexCount() << std::endl; + std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "--------------------------------------------" << std::endl; } @@ -208,7 +213,7 @@ namespace morphstore{ std::cout << "\n"; std::cout << "Properties: "; v->print_properties(); - std::cout << "#Edges: " << this->get_degree(v->getID()); + std::cout << "#Edges: " << this->get_out_degree(v->getID()); std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 7760fbb7..b3203f2f 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -220,7 +220,7 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex_with_properties(properties); + uint64_t systemID = graph.add_vertex(properties); // add vertexType number to vertex graph.add_type_to_vertex(systemID, vertexTypeNumber); // map vertexType and ldbc id to system generated id @@ -286,6 +286,7 @@ namespace morphstore{ // function which clears all intermediates after import void clear_intermediates() { + std::cout << "CleanUp"; globalIdLookupMap.clear(); edgeTypeLookup.clear(); vertexTypeLookup.clear(); @@ -448,7 +449,7 @@ namespace morphstore{ std::cout.flush(); //this variable is used for the relationLookup-keys, starting by 0 - unsigned short int relationNumber = 0; + unsigned short int edgeTypeNumber = 0; bool isEdge = false; // flag which is used to differentiate for edge-lookup-entrys (to avoid e.g. email as an edge) // iterate through vector of vertex-addresses @@ -538,7 +539,7 @@ namespace morphstore{ bool hasProperties = false; std::string propertyKey; - uint64_t fromID, toID; + uint64_t sourceVertexId, targetVertexId; // read buffer and do the magic ... for(size_t i = 0; i < fileSize; ++i){ @@ -571,24 +572,24 @@ namespace morphstore{ }else{ // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property // get the system-(global) id's from local ids - fromID = globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); + sourceVertexId = globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); // remove from id from string row.erase(0, row.find(delimiter) + delimiter.length()); std::string value; if(!hasProperties){ // WITHOUT properties: just from the first delimiter on - toID = globalIdLookupMap.at({targetVertexType, row}); + targetVertexId = globalIdLookupMap.at({targetVertexType, row}); // insert relation into vertexRealtionsLookup: - vertexEdgesLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber)); + vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber)); }else{ // with properties means: toID is until the next delimiter, and then the value for the property - toID = globalIdLookupMap.at({targetVertexType, row.substr(0, row.find(delimiter))}); + targetVertexId = globalIdLookupMap.at({targetVertexType, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; // insert relation into vertexEdgesLookup with its edge-property: - vertexEdgesLookup[fromID].push_back(morphstore::Edge(fromID, toID, relationNumber, {propertyKey, value})); + vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber, {{propertyKey, value}})); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -603,8 +604,8 @@ namespace morphstore{ // check if the name already exists if(!exist_relation_name(edgeType)){ // insert relation-number with string into map - edgeTypeLookup.insert(std::make_pair( relationNumber, edgeType)); - ++relationNumber; + edgeTypeLookup.insert(std::make_pair( edgeTypeNumber, edgeType)); + ++edgeTypeNumber; } } diff --git a/test/core/storage/graph/adjacencylist/CMakeLists.txt b/test/core/storage/graph/adjacencylist/CMakeLists.txt index cd5dc231..7b274f83 100644 --- a/test/core/storage/graph/adjacencylist/CMakeLists.txt +++ b/test/core/storage/graph/adjacencylist/CMakeLists.txt @@ -1,6 +1,6 @@ if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist_test_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}test/core/storage/graph/adjacencylist/simple_adj_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist_test_app ) add_executable( simple_adj_graph_test_app simple_adj_graph_test.cpp) add_executable( ldbc_graph_adjacencylist_test_app ldbc_graph_adjacencylist.cpp) diff --git a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp index 988ebc12..1805ce57 100644 --- a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp +++ b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp @@ -21,14 +21,11 @@ * @todo */ -#include #include +#include //#include -//#include // for high_resolution_clock int main( void ){ - - // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- std::cout << "\n"; std::cout << "**********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; @@ -38,8 +35,7 @@ int main( void ){ // Graph init: std::unique_ptr g1 = std::make_unique(); - // generate vertices & edges from LDBC files and insert into graph structure - uint64_t v1 = g1->add_vertex_with_properties({{"age", "12"}}); + uint64_t v1 = g1->add_vertex({{"age", "12"}}); uint64_t v2 = g1->add_vertex(); uint64_t v3 = g1->add_vertex(); @@ -48,17 +44,17 @@ int main( void ){ g1->setEdgeTypeDictionary(edgeTypeMap); g1->setVertexTypeDictionary(vertexTypeMap); - g1->add_edge(v1, v2, 1); - g1->add_edge(v2, v3, 1); - g1->add_edge(v2, v3, 2); - + g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); + g1->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); - // (DEBUG) - g1->statistics(); - g1->print_edge_by_id(1); - g1->print_neighbors_of_vertex(v1); g1->print_neighbors_of_vertex(v2); - g1->print_neighbors_of_vertex(v3); + + assert(g1->getVertexCount() == 3); + assert(g1->getEdgeCount() == 3); + assert((int) g1->get_edge(0)->getProperties().size() == 2); + assert(g1->get_out_degree(v3) == 0); + assert(g1->get_out_degree(v1) == 1); + assert(g1->get_out_degree(v2) == 2); return 0; } diff --git a/test/core/storage/graph/csr/CMakeLists.txt b/test/core/storage/graph/csr/CMakeLists.txt index 209120d9..31c6d2ea 100644 --- a/test/core/storage/graph/csr/CMakeLists.txt +++ b/test/core/storage/graph/csr/CMakeLists.txt @@ -1,6 +1,8 @@ if ( CTEST_ALL OR CTEST_STORAGE ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/ldbc_graph_csr_test_app ) - + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/simple_graph_csr_test_app ) + + add_executable( simple_csr_graph_test_app simple_csr_graph_test.cpp) add_executable( ldbc_graph_csr_test_app ldbc_graph_csr.cpp) target_compile_options( ldbc_graph_csr_test_app PRIVATE -Werror @@ -11,5 +13,6 @@ if ( CTEST_ALL OR CTEST_STORAGE ) $<$:-DDEBUG> ) target_link_libraries( ldbc_graph_csr_test_app PRIVATE "-ldl" stdc++fs) + add_test( simple_csr_graph_test simple_csr_graph_test_app ) add_test( ldbc_graph_csr_test ldbc_graph_csr_test_app ) endif() \ No newline at end of file diff --git a/test/core/storage/graph/csr/simple_csr_graph_test.cpp b/test/core/storage/graph/csr/simple_csr_graph_test.cpp new file mode 100644 index 00000000..96676718 --- /dev/null +++ b/test/core/storage/graph/csr/simple_csr_graph_test.cpp @@ -0,0 +1,69 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file simple_graph_test_adj.cpp + * @brief Test for generating simple graph in adj. list format (+ BFS measurements) + * @todo + */ + +#include +#include +//#include + +int main( void ){ + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: CSR-List Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; + + // Graph init: + std::unique_ptr g1 = std::make_unique(); + g1->allocate_graph_structure(3, 3); + + // generate vertices & edges from LDBC files and insert into graph structure + uint64_t v1 = g1->add_vertex({{"age", "12"}}); + uint64_t v2 = g1->add_vertex(); + uint64_t v3 = g1->add_vertex(); + + std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; + std::map vertexTypeMap = {{0, "Person"}}; + g1->setEdgeTypeDictionary(edgeTypeMap); + g1->setVertexTypeDictionary(vertexTypeMap); + + + g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); + g1->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); + + + // (DEBUG) + /*g1->statistics(); + g1->print_edge_by_id(0); + g1->print_neighbors_of_vertex(v1); + g1->print_neighbors_of_vertex(v2); + g1->print_neighbors_of_vertex(v3);*/ + + assert(g1->getVertexCount() == 3); + assert(g1->getEdgeCount() == 3); + assert((int) g1->get_edge(0)->getProperties().size() == 2); + assert(g1->get_out_degree(v3) == 0); + assert(g1->get_out_degree(v1) == 1); + assert(g1->get_out_degree(v2) == 2); + + return 0; +} From a6b169d62049595b99f6f754318d947676e9b22e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 24 Mar 2020 17:39:27 +0100 Subject: [PATCH 099/216] Unify vertex constructor removing unneccessary set methods --- .../storage/graph/formats/adjacencylist.h | 24 ------------------- include/core/storage/graph/formats/csr.h | 24 ------------------- include/core/storage/graph/graph.h | 9 ++++--- include/core/storage/graph/ldbc_import.h | 5 ++-- include/core/storage/graph/vertex/vertex.h | 8 +++---- .../adjacencylist/simple_adj_graph_test.cpp | 8 +++---- .../graph/csr/simple_csr_graph_test.cpp | 10 ++++---- 7 files changed, 20 insertions(+), 68 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 12b2ea0e..05fabd70 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -51,21 +51,6 @@ namespace morphstore{ this->expectedVertexCount = numberVertices; } - // adding a single vertex - uint64_t add_vertex() override { - std::shared_ptr v = std::make_shared(getNextVertexId()); - vertices[v->getID()] = v; - return v->getID(); - } - - // adding a vertex with its properties - uint64_t add_vertex(const std::unordered_map props) override { - std::shared_ptr v = std::make_shared(getNextVertexId()); - v->setProperties(props); - vertices[v->getID()] = v; - return v->getID(); - } - // function to add a single property to vertex void add_property_to_vertex(uint64_t id, const std::pair property) override { if (exist_vertexId(id)) { @@ -75,15 +60,6 @@ namespace morphstore{ } } - // adding type to vertex - void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { - if (exist_vertexId(id)) { - vertices[id]->setType(type); - } else { - std::cout << "Vertex with ID " << id << " not found." << std::endl; - } - } - // adding a single edge to vertex: void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { Edge e = Edge(sourceId, targetId, type); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 76db1e82..4de43c9c 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -60,21 +60,6 @@ namespace morphstore{ offset_array[0] = 0; } - // adding a single vertex (without any properties, etc...) - uint64_t add_vertex() override { - std::shared_ptr v = std::make_shared(getNextVertexId()); - vertices[v->getID()] = v; - return v->getID(); - } - - // adding a vertex with its properties - uint64_t add_vertex(const std::unordered_map props ) override { - std::shared_ptr v = std::make_shared(getNextVertexId()); - v->setProperties(props); - vertices[v->getID()] = v; - return v->getID(); - } - // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { std::cout << "Singe edge addition not yet implemented for CSR" << sourceId << targetId << type; @@ -109,15 +94,6 @@ namespace morphstore{ } } - // adding type to vertex - void add_type_to_vertex(const uint64_t id, const unsigned short int type) override { - if(exist_vertexId(id)){ - vertices[id]->setType(type); - }else{ - std::cout << "Vertex with ID " << id << " not found./type_to_vertex." << std::endl; - } - } - // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) override { uint64_t offset = offset_array[id]; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index bec80491..6d2917ee 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -96,6 +96,12 @@ namespace morphstore{ return edges.size(); } + uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { + std::shared_ptr v = std::make_shared(getNextVertexId(), type, props); + vertices[v->getID()] = v; + return v->getID(); + }; + std::string get_vertexType_by_number(unsigned short int type){ if(vertexTypeDictionary.find( type ) != vertexTypeDictionary.end()){ return vertexTypeDictionary.at(type); @@ -183,10 +189,7 @@ namespace morphstore{ virtual storageFormat getStorageFormat() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; - virtual uint64_t add_vertex() = 0; - virtual uint64_t add_vertex(const std::unordered_map props ) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; - virtual void add_type_to_vertex(const uint64_t id, const unsigned short int type) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index b3203f2f..bcc39830 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -220,9 +220,8 @@ namespace morphstore{ //----------------------------------------------------- // create vertex and insert into graph with properties - uint64_t systemID = graph.add_vertex(properties); - // add vertexType number to vertex - graph.add_type_to_vertex(systemID, vertexTypeNumber); + uint64_t systemID = graph.add_vertex(vertexTypeNumber, properties); + // map vertexType and ldbc id to system generated id globalIdLookupMap.insert({{vertexType, ldbcID}, systemID}); //----------------------------------------------------- diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index f209f8c1..b9f6470b 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -46,8 +46,10 @@ namespace morphstore{ // ----------------- Setter & Getter ----------------- - Vertex(uint64_t id){ + Vertex(uint64_t id, unsigned short int type, const std::unordered_map props){ this->id = id; + this->type = type; + this->properties = props; } uint64_t getID(){ @@ -66,10 +68,6 @@ namespace morphstore{ return properties; } - void setProperties(const std::unordered_map props) { - Vertex::properties = props; - } - // function that adds a single property key-value pair to vertex void add_property(const std::pair property){ this->properties[property.first] = property.second;//std::move(property.second); diff --git a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp index 1805ce57..a089a49b 100644 --- a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp +++ b/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp @@ -35,15 +35,15 @@ int main( void ){ // Graph init: std::unique_ptr g1 = std::make_unique(); - uint64_t v1 = g1->add_vertex({{"age", "12"}}); - uint64_t v2 = g1->add_vertex(); - uint64_t v3 = g1->add_vertex(); - std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; std::map vertexTypeMap = {{0, "Person"}}; g1->setEdgeTypeDictionary(edgeTypeMap); g1->setVertexTypeDictionary(vertexTypeMap); + uint64_t v1 = g1->add_vertex(0,{{"age", "12"}}); + uint64_t v2 = g1->add_vertex(0); + uint64_t v3 = g1->add_vertex(0); + g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); g1->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); diff --git a/test/core/storage/graph/csr/simple_csr_graph_test.cpp b/test/core/storage/graph/csr/simple_csr_graph_test.cpp index 96676718..f111abda 100644 --- a/test/core/storage/graph/csr/simple_csr_graph_test.cpp +++ b/test/core/storage/graph/csr/simple_csr_graph_test.cpp @@ -36,15 +36,15 @@ int main( void ){ std::unique_ptr g1 = std::make_unique(); g1->allocate_graph_structure(3, 3); - // generate vertices & edges from LDBC files and insert into graph structure - uint64_t v1 = g1->add_vertex({{"age", "12"}}); - uint64_t v2 = g1->add_vertex(); - uint64_t v3 = g1->add_vertex(); - std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; std::map vertexTypeMap = {{0, "Person"}}; g1->setEdgeTypeDictionary(edgeTypeMap); g1->setVertexTypeDictionary(vertexTypeMap); + + uint64_t v1 = g1->add_vertex(0, {{"age", "12"}}); + uint64_t v2 = g1->add_vertex(0); + uint64_t v3 = g1->add_vertex(0); + g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); From 15e52d73a4a408d14135ad73a0c59f2d6fdc523d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 29 Mar 2020 17:08:24 +0200 Subject: [PATCH 100/216] Restructure graph tests * now having template methods for each kind of test --- test/CMakeLists.txt | 4 +- .../graph/adjacencylist/CMakeLists.txt | 18 ----- test/core/storage/graph/csr/CMakeLists.txt | 18 ----- .../core/storage/graph/csr/ldbc_graph_csr.cpp | 65 --------------- test/core/storage/graph/ldbc/CMakeLists.txt | 12 +++ .../graph/ldbc/ldbc_adj_graph_test.cpp | 34 ++++++++ .../graph/ldbc/ldbc_csr_graph_test.cpp | 34 ++++++++ .../ldbc_graph_test.h} | 63 ++++++++++----- test/core/storage/graph/simple/CMakeLists.txt | 10 +++ .../simple_adj_graph_test.cpp | 37 +-------- .../graph/simple/simple_csr_graph_test.cpp | 31 +++++++ .../simple_graph_test.h} | 80 +++++++++++-------- 12 files changed, 214 insertions(+), 192 deletions(-) delete mode 100644 test/core/storage/graph/adjacencylist/CMakeLists.txt delete mode 100644 test/core/storage/graph/csr/CMakeLists.txt delete mode 100644 test/core/storage/graph/csr/ldbc_graph_csr.cpp create mode 100644 test/core/storage/graph/ldbc/CMakeLists.txt create mode 100644 test/core/storage/graph/ldbc/ldbc_adj_graph_test.cpp create mode 100644 test/core/storage/graph/ldbc/ldbc_csr_graph_test.cpp rename test/core/storage/graph/{adjacencylist/ldbc_graph_adjacencylist.cpp => ldbc/ldbc_graph_test.h} (63%) create mode 100644 test/core/storage/graph/simple/CMakeLists.txt rename test/core/storage/graph/{adjacencylist => simple}/simple_adj_graph_test.cpp (52%) create mode 100644 test/core/storage/graph/simple/simple_csr_graph_test.cpp rename test/core/storage/graph/{csr/simple_csr_graph_test.cpp => simple/simple_graph_test.h} (50%) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 125ec463..41670bb8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory( core/persistence ) add_subdirectory( core/storage ) add_subdirectory( core/utils ) -add_subdirectory( core/storage/graph/adjacencylist ) -add_subdirectory( core/storage/graph/csr ) +add_subdirectory( core/storage/graph/simple ) +add_subdirectory( core/storage/graph/ldbc ) add_subdirectory(vector) diff --git a/test/core/storage/graph/adjacencylist/CMakeLists.txt b/test/core/storage/graph/adjacencylist/CMakeLists.txt deleted file mode 100644 index 7b274f83..00000000 --- a/test/core/storage/graph/adjacencylist/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}test/core/storage/graph/adjacencylist/simple_adj_graph_test_app ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist_test_app ) - - add_executable( simple_adj_graph_test_app simple_adj_graph_test.cpp) - add_executable( ldbc_graph_adjacencylist_test_app ldbc_graph_adjacencylist.cpp) - target_compile_options( ldbc_graph_adjacencylist_test_app PRIVATE - -Werror - -Wall - -Wextra - -pedantic - -fstack-protector-all - $<$:-DDEBUG> ) - target_link_libraries( ldbc_graph_adjacencylist_test_app PRIVATE "-ldl" stdc++fs) - - add_test( simple_adj_graph_test simple_adj_graph_test_app ) - add_test( ldbc_graph_adjacency_test ldbc_graph_adjacencylist_test_app ) -endif() \ No newline at end of file diff --git a/test/core/storage/graph/csr/CMakeLists.txt b/test/core/storage/graph/csr/CMakeLists.txt deleted file mode 100644 index 31c6d2ea..00000000 --- a/test/core/storage/graph/csr/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if ( CTEST_ALL OR CTEST_STORAGE ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/ldbc_graph_csr_test_app ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/csr/simple_graph_csr_test_app ) - - add_executable( simple_csr_graph_test_app simple_csr_graph_test.cpp) - add_executable( ldbc_graph_csr_test_app ldbc_graph_csr.cpp) - target_compile_options( ldbc_graph_csr_test_app PRIVATE - -Werror - -Wall - -Wextra - -pedantic - -fstack-protector-all - $<$:-DDEBUG> ) - target_link_libraries( ldbc_graph_csr_test_app PRIVATE "-ldl" stdc++fs) - - add_test( simple_csr_graph_test simple_csr_graph_test_app ) - add_test( ldbc_graph_csr_test ldbc_graph_csr_test_app ) -endif() \ No newline at end of file diff --git a/test/core/storage/graph/csr/ldbc_graph_csr.cpp b/test/core/storage/graph/csr/ldbc_graph_csr.cpp deleted file mode 100644 index 9da0f041..00000000 --- a/test/core/storage/graph/csr/ldbc_graph_csr.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file ldbc_graph_csr.cpp - * @brief Test for generating social network graph in CSR format + BFS measurements - * @todo - */ - -#include -#include -#include - -#include // for high_resolution_clock - -int main( void ){ - - // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- - /* - std::cout << "\n"; - std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: CSR Storage Format *" << std::endl; - std::cout << "**********************************************************" << std::endl; - std::cout << "\n"; - */ - - // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); - - // Graph init: - std::unique_ptr g1 = std::make_unique(); - - // generate vertices & edges from LDBC files and insert into graph structure - ldbcImport->import(*g1); - - // measure degree distribution and write to file (file path as parameter): - // g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); - - // some statistics (DEBUG) - g1->statistics(); - - // (DEBUG) Test Vertex, which contains edges with properties (SERVER): - // g1->print_vertex_by_id(1035174); - // g1->print_neighbors_of_vertex(1035174); - - // Execute BFS measurements: - // std::unique_ptr bfs = std::make_unique(g1); - // bfs->do_measurements(10000, "/home/pfeiffer/measurements/csr/bfs_SF1.csv"); - - return 0; -} diff --git a/test/core/storage/graph/ldbc/CMakeLists.txt b/test/core/storage/graph/ldbc/CMakeLists.txt new file mode 100644 index 00000000..c9014af9 --- /dev/null +++ b/test/core/storage/graph/ldbc/CMakeLists.txt @@ -0,0 +1,12 @@ +if ( CTEST_ALL OR CTEST_STORAGE ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc/ldbc_csr_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/ldbc/ldbc_adj_graph_test_app ) + + add_executable( ldbc_csr_graph_test_app ldbc_csr_graph_test.cpp) + add_executable( ldbc_adj_graph_test_app ldbc_adj_graph_test.cpp) + target_link_libraries(ldbc_adj_graph_test_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(ldbc_csr_graph_test_app PRIVATE "-ldl" stdc++fs) + + add_test( ldbc_csr_graph_test ldbc_csr_graph_test_app ) + add_test( ldbc_adj_graph_test ldbc_adj_graph_test_app ) +endif() \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_adj_graph_test.cpp b/test/core/storage/graph/ldbc/ldbc_adj_graph_test.cpp new file mode 100644 index 00000000..6804104c --- /dev/null +++ b/test/core/storage/graph/ldbc/ldbc_adj_graph_test.cpp @@ -0,0 +1,34 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file ldbc_graph_adjacency.cpp + * @brief Test for generating social network graph in adj. list format + BFS measurements + * @todo + */ +#include +#include "ldbc_graph_test.h" + +int main( void ){ + ldbcGraphFormatTest(); + + // Execute BFS measurements: + //std::unique_ptr bfs = std::make_unique(g1); + //bfs->do_measurements(10000, "/home/florentin/Morphstore/Output/adj_bfs_SF1.csv"); + + return 0; +} diff --git a/test/core/storage/graph/ldbc/ldbc_csr_graph_test.cpp b/test/core/storage/graph/ldbc/ldbc_csr_graph_test.cpp new file mode 100644 index 00000000..cf22e888 --- /dev/null +++ b/test/core/storage/graph/ldbc/ldbc_csr_graph_test.cpp @@ -0,0 +1,34 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file ldbc_graph_adjacency.cpp + * @brief Test for generating social network graph in adj. list format + BFS measurements + * @todo + */ +#include +#include "ldbc_graph_test.h" + +int main( void ){ + ldbcGraphFormatTest(); + + // Execute BFS measurements: + //std::unique_ptr bfs = std::make_unique(g1); + //bfs->do_measurements(10000, "/home/florentin/Morphstore/Output/adj_bfs_SF1.csv"); + + return 0; +} diff --git a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp b/test/core/storage/graph/ldbc/ldbc_graph_test.h similarity index 63% rename from test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp rename to test/core/storage/graph/ldbc/ldbc_graph_test.h index eea8c86b..ae9cd330 100644 --- a/test/core/storage/graph/adjacencylist/ldbc_graph_adjacencylist.cpp +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -23,43 +23,62 @@ #include #include -#include -#include // for high_resolution_clock +void print_header(morphstore::Graph::storageFormat format) { + std::string storageFormat; -int main( void ){ + switch (format) + { + case morphstore::Graph::storageFormat::csr: + storageFormat = "CSR"; + break; + case morphstore::Graph::storageFormat::adjacencylist: + storageFormat = "Adjacency-List"; + break; + }; - // ------------------------------------ LDBC-IMPORT TEST ----------------------------------- - /* std::cout << "\n"; std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; + std::cout << "* MorphStore-Storage-Test: LDBC " << storageFormat << " Storage Format *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; - */ +} + +template +void ldbcGraphFormatTest (void) { + + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + + std::string sourceDir = ""; + std::string targetDir = ""; + + if (sourceDir.empty()) { + throw std::invalid_argument("Where are the ldbc files??"); + } + + if (targetDir.empty()) { + throw std::invalid_argument("Degree count has to be saved somewhere"); + } + + std::unique_ptr graph = std::make_unique(); + print_header(graph->getStorageFormat()); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) - std::unique_ptr ldbcImport = std::make_unique("/home/pfeiffer/ldbc_sn_data/social_network_1/"); + std::unique_ptr ldbcImport = std::make_unique(sourceDir); - // Graph init: - std::unique_ptr g1 = std::make_unique(); // generate vertices & edges from LDBC files and insert into graph structure - ldbcImport->import(*g1); + ldbcImport->import(*graph); // measure degree distribution and write to file (file path as parameter): - // g1->measure_degree_count("/home/pfeiffer/measurements/adjacency_list/graph_degree_count_SF10.csv"); + graph->measure_degree_count(targetDir + "adj_graph_degree_count_SF10.csv"); // some statistics (DEBUG) - g1->statistics(); + std::cout << "Some statistics" << std::endl; + graph->statistics(); // (DEBUG) Test Vertex, which contains edges with properties (SERVER): - // g1->print_vertex_by_id(1035174); - // g1->print_neighbors_of_vertex(1035174); - - // Execute BFS measurements: - // std::unique_ptr bfs = std::make_unique(g1); - // bfs->do_measurements(10000, "/home/pfeiffer/measurements/adjacency_list/bfs_SF1.csv"); - - return 0; -} + graph->print_vertex_by_id(1035174); + graph->print_edge_by_id(10); + graph->print_neighbors_of_vertex(1035174); +} \ No newline at end of file diff --git a/test/core/storage/graph/simple/CMakeLists.txt b/test/core/storage/graph/simple/CMakeLists.txt new file mode 100644 index 00000000..1b6f5b91 --- /dev/null +++ b/test/core/storage/graph/simple/CMakeLists.txt @@ -0,0 +1,10 @@ +if ( CTEST_ALL OR CTEST_STORAGE ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/simple/simple_csr_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/storage/graph/simple/simple_adj_graph_test_app ) + + add_executable( simple_csr_graph_test_app simple_csr_graph_test.cpp) + add_executable( simple_adj_graph_test_app simple_adj_graph_test.cpp) + + add_test( simple_csr_graph_test simple_csr_graph_test_app ) + add_test( simple_adj_graph_test simple_adj_graph_test_app ) +endif() \ No newline at end of file diff --git a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp b/test/core/storage/graph/simple/simple_adj_graph_test.cpp similarity index 52% rename from test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp rename to test/core/storage/graph/simple/simple_adj_graph_test.cpp index a089a49b..461c4223 100644 --- a/test/core/storage/graph/adjacencylist/simple_adj_graph_test.cpp +++ b/test/core/storage/graph/simple/simple_adj_graph_test.cpp @@ -17,44 +17,15 @@ /** * @file simple_graph_test_adj.cpp - * @brief Test for generating simple graph in adj. list format (+ BFS measurements) + * @brief Test for generating simple graph in adj. list format * @todo */ #include -#include -//#include +#include "simple_graph_test.h" -int main( void ){ - std::cout << "\n"; - std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: Adjacency-List Storage Format *" << std::endl; - std::cout << "**********************************************************" << std::endl; - std::cout << "\n"; - - // Graph init: - std::unique_ptr g1 = std::make_unique(); - - std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; - std::map vertexTypeMap = {{0, "Person"}}; - g1->setEdgeTypeDictionary(edgeTypeMap); - g1->setVertexTypeDictionary(vertexTypeMap); - - uint64_t v1 = g1->add_vertex(0,{{"age", "12"}}); - uint64_t v2 = g1->add_vertex(0); - uint64_t v3 = g1->add_vertex(0); - - g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); - g1->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); - - g1->print_neighbors_of_vertex(v2); - - assert(g1->getVertexCount() == 3); - assert(g1->getEdgeCount() == 3); - assert((int) g1->get_edge(0)->getProperties().size() == 2); - assert(g1->get_out_degree(v3) == 0); - assert(g1->get_out_degree(v1) == 1); - assert(g1->get_out_degree(v2) == 2); +int main(void) { + simpleGraphFormatTest(); return 0; } diff --git a/test/core/storage/graph/simple/simple_csr_graph_test.cpp b/test/core/storage/graph/simple/simple_csr_graph_test.cpp new file mode 100644 index 00000000..7f4a5a9d --- /dev/null +++ b/test/core/storage/graph/simple/simple_csr_graph_test.cpp @@ -0,0 +1,31 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file simple_graph_test_csr.cpp + * @brief Test for generating simple graph in csr list format + * @todo + */ + +#include +#include "simple_graph_test.h" + +int main(void) { + simpleGraphFormatTest(); + + return 0; +} diff --git a/test/core/storage/graph/csr/simple_csr_graph_test.cpp b/test/core/storage/graph/simple/simple_graph_test.h similarity index 50% rename from test/core/storage/graph/csr/simple_csr_graph_test.cpp rename to test/core/storage/graph/simple/simple_graph_test.h index f111abda..5d0ac4c4 100644 --- a/test/core/storage/graph/csr/simple_csr_graph_test.cpp +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -16,54 +16,66 @@ **********************************************************************************************/ /** - * @file simple_graph_test_adj.cpp - * @brief Test for generating simple graph in adj. list format (+ BFS measurements) + * @file simple_graph_test.cpp + * @brief Base test for testing graph formats on a very simple graph * @todo */ - -#include +#include #include -//#include -int main( void ){ +void print_header(morphstore::Graph::storageFormat format) { + std::string storageFormat; + + switch (format) + { + case morphstore::Graph::storageFormat::csr: + storageFormat = "CSR"; + break; + case morphstore::Graph::storageFormat::adjacencylist: + storageFormat = "Adjacency-List"; + break; + }; + std::cout << "\n"; std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: CSR-List Storage Format *" << std::endl; + std::cout << "* MorphStore-Storage-Test: Simple " << storageFormat << " Storage Format *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; +} - // Graph init: - std::unique_ptr g1 = std::make_unique(); - g1->allocate_graph_structure(3, 3); +template +void simpleGraphFormatTest (void) { + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; - std::map vertexTypeMap = {{0, "Person"}}; - g1->setEdgeTypeDictionary(edgeTypeMap); - g1->setVertexTypeDictionary(vertexTypeMap); - - uint64_t v1 = g1->add_vertex(0, {{"age", "12"}}); - uint64_t v2 = g1->add_vertex(0); - uint64_t v3 = g1->add_vertex(0); - + std::unique_ptr graph = std::make_unique(); + print_header(graph->getStorageFormat()); + graph->allocate_graph_structure(3, 3); - g1->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); - g1->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); + std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; + std::map vertexTypeMap = {{0, "Person"}}; + graph->setEdgeTypeDictionary(edgeTypeMap); + graph->setVertexTypeDictionary(vertexTypeMap); + uint64_t v1 = graph->add_vertex(0, {{"age", "12"}}); + uint64_t v2 = graph->add_vertex(0); + uint64_t v3 = graph->add_vertex(0); - // (DEBUG) - /*g1->statistics(); - g1->print_edge_by_id(0); - g1->print_neighbors_of_vertex(v1); - g1->print_neighbors_of_vertex(v2); - g1->print_neighbors_of_vertex(v3);*/ + graph->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); + graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); - assert(g1->getVertexCount() == 3); - assert(g1->getEdgeCount() == 3); - assert((int) g1->get_edge(0)->getProperties().size() == 2); - assert(g1->get_out_degree(v3) == 0); - assert(g1->get_out_degree(v1) == 1); - assert(g1->get_out_degree(v2) == 2); + // (DEBUG) + /*graph->statistics(); + graph->print_edge_by_id(0); + graph->print_neighbors_of_vertex(v1); + graph->print_neighbors_of_vertex(v2); + graph->print_neighbors_of_vertex(v3);*/ - return 0; + assert(graph->getVertexCount() == 3); + assert(graph->getEdgeCount() == 3); + assert((int)graph->get_edge(0)->getProperties().size() == 2); + assert(graph->get_out_degree(v3) == 0); + assert(graph->get_out_degree(v1) == 1); + assert(graph->get_out_degree(v2) == 2); } + From 5973ea81c98120495082f44c3aa5a029c6d6d1a3 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 29 Mar 2020 17:26:50 +0200 Subject: [PATCH 101/216] Throw runtime error for single edge addition for CSR graph --- include/core/storage/graph/formats/csr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 4de43c9c..4567c06f 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -26,6 +26,7 @@ #include "../graph.h" #include "../vertex/vertex.h" +#include namespace morphstore{ @@ -62,7 +63,7 @@ namespace morphstore{ // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { - std::cout << "Singe edge addition not yet implemented for CSR" << sourceId << targetId << type; + throw std::runtime_error("Singe edge addition not yet implemented for CSR" + sourceId + targetId + type); } // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC From c6cfcdd8cc1f5bfbdd720179e8873628072d7581 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 29 Mar 2020 18:56:49 +0200 Subject: [PATCH 102/216] Extract bfs test into extra package and adding a string representation for the graph format --- include/core/storage/graph/graph.h | 12 +++ test/CMakeLists.txt | 1 + test/core/operators/graph/ldbc/CMakeLists.txt | 12 +++ .../graph/ldbc/bfs_ldbc_adj_graph_test.cpp | 30 +++++++ .../graph/ldbc/bfs_ldbc_csr_graph_test.cpp | 29 +++++++ .../graph/ldbc/bfs_ldbc_graph_test.h | 81 +++++++++++++++++++ .../core/storage/graph/ldbc/ldbc_graph_test.h | 25 ++---- 7 files changed, 173 insertions(+), 17 deletions(-) create mode 100644 test/core/operators/graph/ldbc/CMakeLists.txt create mode 100644 test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test.cpp create mode 100644 test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test.cpp create mode 100644 test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 6d2917ee..9d04e07c 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -62,6 +62,18 @@ namespace morphstore{ enum storageFormat {csr, adjacencylist }; + const std::string get_storage_format_string() { + switch (this->getStorageFormat()) { + case csr: + return "CSR"; + case adjacencylist: + return "Adjacency_List"; + default: + return ""; + } + return ""; + } + // -------------------- Setters & Getters -------------------- const std::map &getVertexTypeDictionary() const { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 41670bb8..64da1226 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,5 +7,6 @@ add_subdirectory( core/utils ) add_subdirectory( core/storage/graph/simple ) add_subdirectory( core/storage/graph/ldbc ) +add_subdirectory( core/operators/graph/ldbc ) add_subdirectory(vector) diff --git a/test/core/operators/graph/ldbc/CMakeLists.txt b/test/core/operators/graph/ldbc/CMakeLists.txt new file mode 100644 index 00000000..ca06cc54 --- /dev/null +++ b/test/core/operators/graph/ldbc/CMakeLists.txt @@ -0,0 +1,12 @@ +if ( CTEST_ALL OR CTEST_OPERATORS ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test_app ) + + add_executable( bfs_ldbc_csr_graph_test_app bfs_ldbc_csr_graph_test.cpp) + add_executable( bfs_ldbc_adj_graph_test_app bfs_ldbc_adj_graph_test.cpp) + target_link_libraries(bfs_ldbc_csr_graph_test_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(bfs_ldbc_adj_graph_test_app PRIVATE "-ldl" stdc++fs) + + add_test( bfs_ldbc_csr_graph_test_app bfs_ldbc_csr_graph_test_app ) + add_test( bfs_ldbc_adj_graph_test_app bfs_ldbc_adj_graph_test_app ) +endif() \ No newline at end of file diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test.cpp b/test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test.cpp new file mode 100644 index 00000000..f22bc1ff --- /dev/null +++ b/test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test.cpp @@ -0,0 +1,30 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_ldbc__adj_graph_test.cpp + * @brief Test for bfs of social network graph in adj list format + * @todo + */ +#include +#include "bfs_ldbc_graph_test.h" + +int main( void ){ + bfs_ldbc_graph_test(); + + return 0; +} diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test.cpp b/test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test.cpp new file mode 100644 index 00000000..99c6f276 --- /dev/null +++ b/test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test.cpp @@ -0,0 +1,29 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_ldbc_csr_graph_test.cpp + * @brief Test for bfs of social network graph in csr list format + * @todo + */ +#include +#include "bfs_ldbc_graph_test.h" + +int main( void ){ + bfs_ldbc_graph_test(); + return 0; +} diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h new file mode 100644 index 00000000..88a203ca --- /dev/null +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -0,0 +1,81 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_ldbc_graph_test.cpp + * @brief Test methods for bfs on social network graph + * @todo + */ + +#include +#include +#include + +void print_header(morphstore::Graph::storageFormat format) { + std::string storageFormat; + + switch (format) + { + case morphstore::Graph::storageFormat::csr: + storageFormat = "CSR"; + break; + case morphstore::Graph::storageFormat::adjacencylist: + storageFormat = "Adjacency-List"; + break; + }; + + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Storage-Test: LDBC " << storageFormat << " Storage Format *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; +} + +template +void bfs_ldbc_graph_test (void) { + + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + + std::string sourceDir = ""; + std::string targetDir = ""; + + if (sourceDir.empty()) { + throw std::invalid_argument("Where are the ldbc files??"); + } + + if (targetDir.empty()) { + throw std::invalid_argument("Degree count has to be saved somewhere"); + } + + std::unique_ptr graph = std::make_unique(); + print_header(graph->getStorageFormat()); + + // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) + std::unique_ptr ldbcImport = std::make_unique(sourceDir); + + + // generate vertices & edges from LDBC files and insert into graph structure + ldbcImport->import(*graph); + + // some statistics (DEBUG) + std::cout << "Some statistics" << std::endl; + graph->statistics(); + + auto bfs = std::make_unique(graph); + + bfs->do_measurements(10000, targetDir + "bfs_" + graph->get_storage_format_string()); +} \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index ae9cd330..fba7b388 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -16,27 +16,15 @@ **********************************************************************************************/ /** - * @file ldbc_graph_adjacency.cpp - * @brief Test for generating social network graph in adj. list format + BFS measurements + * @file ldbc_graph_test.cpp + * @brief Test for generating social network graph in a given graph format * @todo */ #include #include -void print_header(morphstore::Graph::storageFormat format) { - std::string storageFormat; - - switch (format) - { - case morphstore::Graph::storageFormat::csr: - storageFormat = "CSR"; - break; - case morphstore::Graph::storageFormat::adjacencylist: - storageFormat = "Adjacency-List"; - break; - }; - +void print_header(std::string storageFormat) { std::cout << "\n"; std::cout << "**********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: LDBC " << storageFormat << " Storage Format *" << std::endl; @@ -61,7 +49,10 @@ void ldbcGraphFormatTest (void) { } std::unique_ptr graph = std::make_unique(); - print_header(graph->getStorageFormat()); + + std::string storageFormat = graph->get_storage_format_string(); + + print_header(storageFormat); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) std::unique_ptr ldbcImport = std::make_unique(sourceDir); @@ -71,7 +62,7 @@ void ldbcGraphFormatTest (void) { ldbcImport->import(*graph); // measure degree distribution and write to file (file path as parameter): - graph->measure_degree_count(targetDir + "adj_graph_degree_count_SF10.csv"); + graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); // some statistics (DEBUG) std::cout << "Some statistics" << std::endl; From 7302cf7205efd93981dc4ea5b3878c602b9694cf Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 30 Mar 2020 15:00:03 +0200 Subject: [PATCH 103/216] Add bfs tests for a simple graph * Fix csr neighbour Ids by transforming edgeIds to targetIds * add asserts to assure right allocations for csr * remove "storageFormat" enum for a string representation --- .../storage/graph/formats/adjacencylist.h | 30 ++++---- include/core/storage/graph/formats/csr.h | 50 ++++++++------ include/core/storage/graph/graph.h | 19 +---- test/CMakeLists.txt | 1 + .../graph/ldbc/bfs_ldbc_graph_test.h | 22 ++---- .../operators/graph/simple/CMakeLists.txt | 12 ++++ .../simple/bfs_simple_adj_graph_test.cpp | 30 ++++++++ .../simple/bfs_simple_csr_graph_test.cpp | 29 ++++++++ .../graph/simple/bfs_simple_graph_test.h | 69 +++++++++++++++++++ .../core/storage/graph/ldbc/ldbc_graph_test.h | 2 +- .../storage/graph/simple/simple_graph_test.h | 16 +---- 11 files changed, 197 insertions(+), 83 deletions(-) create mode 100644 test/core/operators/graph/simple/CMakeLists.txt create mode 100644 test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp create mode 100644 test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp create mode 100644 test/core/operators/graph/simple/bfs_simple_graph_test.h diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 05fabd70..1d3ebc67 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -28,6 +28,7 @@ #include "../vertex/vertex.h" #include +#include namespace morphstore{ @@ -37,8 +38,8 @@ namespace morphstore{ std::unordered_map>> adjacencylistPerVertex; public: - storageFormat getStorageFormat() const override { - return adjacencylist; + std::string get_storage_format() const override { + return "Adjacency_List"; } // function: to set graph allocations @@ -77,7 +78,7 @@ namespace morphstore{ adjacencylistPerVertex[sourceId] = adjacencyList; } - for(const auto& edge : edgesToAdd) { + for(const auto edge : edgesToAdd) { edges[edge.getId()] = std::make_shared(edge); if(exist_vertexId(edge.getTargetId())) { adjacencyList->push_back(edge.getId()); @@ -94,11 +95,12 @@ namespace morphstore{ // get number of neighbors of vertex with id uint64_t get_out_degree(uint64_t id) override { - if (adjacencylistPerVertex.find(id) == adjacencylistPerVertex.end()) { + auto entry = adjacencylistPerVertex.find(id); + if (entry == adjacencylistPerVertex.end()) { return 0; } else { - return adjacencylistPerVertex[id]->size(); + return entry->second->size(); } } @@ -106,8 +108,13 @@ namespace morphstore{ std::vector get_neighbors_ids(uint64_t id) override { std::vector targetVertexIds = std::vector(); - for(auto const edgeId: *adjacencylistPerVertex[id]) { - targetVertexIds.push_back(edges[edgeId]->getTargetId()); + auto entry = adjacencylistPerVertex.find(id); + + if (entry != adjacencylistPerVertex.end()) { + for(uint64_t const edgeId: *(entry->second)) { + assert(edges.find(edgeId) != edges.end()); + targetVertexIds.push_back(edges[edgeId]->getTargetId()); + } } return targetVertexIds; @@ -166,14 +173,7 @@ namespace morphstore{ } else { for (const auto edgeId : *adjacencylistPerVertex[id]) { - auto edge = edges[edgeId]; - std::cout << " Edge-ID: " << edge->getId() - << " Type: " << get_edgeType_by_number(edge->getType()) - << " Source-ID: " << edge->getSourceId() - << " Target-ID: " << edge->getTargetId() - << " Property: { "; - edge->print_properties(); - std::cout << std::endl << " }" << std::endl; + print_edge_by_id(edgeId); } } } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 4567c06f..4f293a30 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -27,6 +27,7 @@ #include "../graph.h" #include "../vertex/vertex.h" #include +#include namespace morphstore{ @@ -42,8 +43,8 @@ namespace morphstore{ public: - storageFormat getStorageFormat() const override { - return csr; + std::string get_storage_format() const override { + return "CSR"; } // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays @@ -69,6 +70,7 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { + assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); uint64_t offset = offset_array[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); @@ -111,35 +113,27 @@ namespace morphstore{ return degree; } - // for debugging: - void print_neighbors_of_vertex(uint64_t id) override{ - std::cout << "Neighbours for Vertex with id " << id << std::endl; - uint64_t offset = offset_array[id]; - uint64_t numberEdges = get_out_degree(id); - - for(uint64_t i = offset; i < offset+numberEdges; ++i){ - uint64_t edgeId = edgeId_array[i]; - std::cout << "Source-ID: " << edges[edgeId]->getSourceId() - << " - Target-ID: " << edges[edgeId]->getTargetId() - << " Property: { "; - edges[i]->print_properties(); - std::cout << std::endl - << " }" << std::endl; - } - } - // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { - std::vector neighbors; + std::vector neighbourEdgeIds; uint64_t offset = offset_array[id]; uint64_t numberEdges = get_out_degree(id); // avoiding out of bounds ... if( offset < getExpectedEdgeCount()){ - neighbors.insert(neighbors.end(), edgeId_array+offset, edgeId_array+offset+numberEdges); + neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_array+offset, edgeId_array+offset+numberEdges); } - return neighbors; + std::vector targetVertexIds; + + // resolving each edgeId + for (auto edgeId: neighbourEdgeIds) + { + assert(edges.find(edgeId) != edges.end()); + targetVertexIds.push_back(edges[edgeId]->getTargetId()); + } + + return targetVertexIds; } // get size of storage format: @@ -187,6 +181,18 @@ namespace morphstore{ return index_data_size; } + + // for debugging: + void print_neighbors_of_vertex(uint64_t id) override{ + std::cout << "Neighbours for Vertex with id " << id << std::endl; + uint64_t offset = offset_array[id]; + uint64_t numberEdges = get_out_degree(id); + + for(uint64_t i = offset; i < offset+numberEdges; ++i){ + uint64_t edgeId = edgeId_array[i]; + print_edge_by_id(edgeId); + } + } }; } #endif //MORPHSTORE_CSR_H diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 9d04e07c..379f59c1 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -36,6 +36,7 @@ #include #include #include +#include namespace morphstore{ @@ -59,21 +60,6 @@ namespace morphstore{ return currentMaxVertexId++; } public: - - enum storageFormat {csr, adjacencylist }; - - const std::string get_storage_format_string() { - switch (this->getStorageFormat()) { - case csr: - return "CSR"; - case adjacencylist: - return "Adjacency_List"; - default: - return ""; - } - return ""; - } - // -------------------- Setters & Getters -------------------- const std::map &getVertexTypeDictionary() const { @@ -109,6 +95,7 @@ namespace morphstore{ } uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { + assert(expectedVertexCount > getVertexCount()); std::shared_ptr v = std::make_shared(getNextVertexId(), type, props); vertices[v->getID()] = v; return v->getID(); @@ -199,7 +186,7 @@ namespace morphstore{ // -------------------- pure virtual functions -------------------- - virtual storageFormat getStorageFormat() const = 0; + virtual std::string get_storage_format() const = 0; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 64da1226..ba42a897 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory( core/utils ) add_subdirectory( core/storage/graph/simple ) add_subdirectory( core/storage/graph/ldbc ) +add_subdirectory( core/operators/graph/simple ) add_subdirectory( core/operators/graph/ldbc ) add_subdirectory(vector) diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index 88a203ca..d5922855 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -25,22 +25,11 @@ #include #include -void print_header(morphstore::Graph::storageFormat format) { - std::string storageFormat; - - switch (format) - { - case morphstore::Graph::storageFormat::csr: - storageFormat = "CSR"; - break; - case morphstore::Graph::storageFormat::adjacencylist: - storageFormat = "Adjacency-List"; - break; - }; +void print_header(std::string storageFormat) { std::cout << "\n"; std::cout << "**********************************************************" << std::endl; - std::cout << "* MorphStore-Storage-Test: LDBC " << storageFormat << " Storage Format *" << std::endl; + std::cout << "* MorphStore-Operator-Test: LDBC " << storageFormat << " BFS Test *" << std::endl; std::cout << "**********************************************************" << std::endl; std::cout << "\n"; } @@ -62,7 +51,9 @@ void bfs_ldbc_graph_test (void) { } std::unique_ptr graph = std::make_unique(); - print_header(graph->getStorageFormat()); + std::string storageFormat = graph->get_storage_format(); + + print_header(storageFormat); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) std::unique_ptr ldbcImport = std::make_unique(sourceDir); @@ -77,5 +68,6 @@ void bfs_ldbc_graph_test (void) { auto bfs = std::make_unique(graph); - bfs->do_measurements(10000, targetDir + "bfs_" + graph->get_storage_format_string()); + assert(bfs->do_BFS(0) == 229144); + //bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); } \ No newline at end of file diff --git a/test/core/operators/graph/simple/CMakeLists.txt b/test/core/operators/graph/simple/CMakeLists.txt new file mode 100644 index 00000000..81a6bd93 --- /dev/null +++ b/test/core/operators/graph/simple/CMakeLists.txt @@ -0,0 +1,12 @@ +if ( CTEST_ALL OR CTEST_OPERATORS ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/simple/bfs_simple_csr_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/simple/bfs_simple_adj_graph_test_app ) + + add_executable( bfs_simple_csr_graph_test_app bfs_simple_csr_graph_test.cpp) + add_executable( bfs_simple_adj_graph_test_app bfs_simple_adj_graph_test.cpp) + target_link_libraries(bfs_simple_csr_graph_test_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(bfs_simple_adj_graph_test_app PRIVATE "-ldl" stdc++fs) + + add_test( bfs_simple_csr_graph_test_app bfs_simple_csr_graph_test_app ) + add_test( bfs_simple_adj_graph_test_app bfs_simple_adj_graph_test_app ) +endif() \ No newline at end of file diff --git a/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp b/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp new file mode 100644 index 00000000..50248273 --- /dev/null +++ b/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp @@ -0,0 +1,30 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_simple__adj_graph_test.cpp + * @brief Test for bfs of social network graph in adj list format + * @todo + */ +#include +#include "bfs_simple_graph_test.h" + +int main( void ){ + bfs_simple_graph_test(); + + return 0; +} diff --git a/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp b/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp new file mode 100644 index 00000000..e324eb58 --- /dev/null +++ b/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp @@ -0,0 +1,29 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_simple_csr_graph_test.cpp + * @brief Test for bfs of social network graph in csr list format + * @todo + */ +#include +#include "bfs_simple_graph_test.h" + +int main( void ){ + bfs_simple_graph_test(); + return 0; +} diff --git a/test/core/operators/graph/simple/bfs_simple_graph_test.h b/test/core/operators/graph/simple/bfs_simple_graph_test.h new file mode 100644 index 00000000..1f3036aa --- /dev/null +++ b/test/core/operators/graph/simple/bfs_simple_graph_test.h @@ -0,0 +1,69 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_simple_graph_test.cpp + * @brief Test methods for bfs on social network graph + * @todo + */ + +#include +#include +#include + +void print_header(std::string storageFormat) { + + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Operator-Test: Simple " << storageFormat << " BFS Test *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; +} + +template +void bfs_simple_graph_test (void) { + + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + + std::unique_ptr graph = std::make_unique(); + print_header(graph->get_storage_format()); + + graph->allocate_graph_structure(4, 4); + + std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; + std::map vertexTypeMap = {{0, "Person"}}; + graph->setEdgeTypeDictionary(edgeTypeMap); + graph->setVertexTypeDictionary(vertexTypeMap); + + uint64_t v1 = graph->add_vertex(0); + uint64_t v2 = graph->add_vertex(0); + uint64_t v3 = graph->add_vertex(0); + graph->add_vertex(0); + + graph->add_edges(v1, {morphstore::Edge(v1, v2, 1)}); + graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); + graph->add_edges(v3, {morphstore::Edge(v3, v2, 1)}); + // some statistics (DEBUG) + std::cout << "Some statistics" << std::endl; + graph->statistics(); + + auto bfs = std::make_unique(graph); + + //assert(graph->getVertexCount() == 4); + //assert(graph->getEdgeCount() == 3); + assert(bfs->do_BFS(v1) == 2); +} \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index fba7b388..e4e6edf8 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -50,7 +50,7 @@ void ldbcGraphFormatTest (void) { std::unique_ptr graph = std::make_unique(); - std::string storageFormat = graph->get_storage_format_string(); + std::string storageFormat = graph->get_storage_format(); print_header(storageFormat); diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 5d0ac4c4..6a52fb9a 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -23,19 +23,7 @@ #include #include -void print_header(morphstore::Graph::storageFormat format) { - std::string storageFormat; - - switch (format) - { - case morphstore::Graph::storageFormat::csr: - storageFormat = "CSR"; - break; - case morphstore::Graph::storageFormat::adjacencylist: - storageFormat = "Adjacency-List"; - break; - }; - +void print_header(std::string storageFormat) { std::cout << "\n"; std::cout << "**********************************************************" << std::endl; std::cout << "* MorphStore-Storage-Test: Simple " << storageFormat << " Storage Format *" << std::endl; @@ -48,7 +36,7 @@ void simpleGraphFormatTest (void) { static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); std::unique_ptr graph = std::make_unique(); - print_header(graph->getStorageFormat()); + print_header(graph->get_storage_format()); graph->allocate_graph_structure(3, 3); From 519bf595e82729bf403df4b6a1dbf03ce31bdb86 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 30 Mar 2020 18:25:19 +0200 Subject: [PATCH 104/216] Upgrade cpp version to 17 --- CMakeLists.txt | 2 +- include/core/storage/graph/ldbc_import.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12f3b5a5..18a8d0da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required( VERSION 3.10 ) project( MorphStore ) -set( CMAKE_CXX_STANDARD 14 ) +set( CMAKE_CXX_STANDARD 17 ) macro(morph_flag) add_definitions(${ARGN}) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index bcc39830..332cd24d 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -27,8 +27,8 @@ #include #include -// experimental/filesystem to read file directories -#include + +#include #include #include #include @@ -95,7 +95,7 @@ namespace morphstore{ // function which iterates through directory to receive file names (entire path) void insert_file_names(std::string dir) { - for (const auto &entry : std::experimental::filesystem::directory_iterator(dir)) { + for (const auto &entry : std::filesystem::directory_iterator(dir)) { // ignore files starting with a '.' (+ 1 as '/' is the first character otherwise) if (entry.path().string()[dir.size() + 1] == '.') { continue; @@ -115,7 +115,6 @@ namespace morphstore{ void differentiate(std::string path, std::string dir) { // if the string contains a '_' -> it's a relation file; otherwise a vertex file // if string contains word_word it is an edge files (vertex files only contain one word) - // todo: remove dir name to remain only the *.csv // a vertex file contains exactly one word and after that only numbers are allowed f.i. _0_0 std::regex vertexFileRegExp("^\\/([a-zA-Z]+\\_)([0-9_]*).csv$"); @@ -373,7 +372,6 @@ namespace morphstore{ if (!verticesPaths.empty()) { //this variable is used for the vertexTypeLookup-keys, starting by 0 - unsigned short int vertexTypeNumber = 0; // iterate through vector of vertex-addresses for (const auto &file : verticesPaths) { @@ -430,9 +428,6 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); - // insert vertexType-number with string into map - vertexTypeLookup.insert(std::make_pair( vertexTypeNumber, vertexType)); - ++vertexTypeNumber; } } From 1ea20d2887eb5347d061bb3b41438e42a742d2a8 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 30 Mar 2020 18:36:51 +0200 Subject: [PATCH 105/216] Fix edge type bug and only build vertex_type_lookup once and removing some redundant checks --- include/core/storage/graph/graph.h | 1 + include/core/storage/graph/ldbc_import.h | 220 +++++++++++------------ 2 files changed, 104 insertions(+), 117 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 379f59c1..e4b35ccf 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -67,6 +67,7 @@ namespace morphstore{ } void setVertexTypeDictionary(const std::map& ent) { + assert(ent.size() != 0); this->vertexTypeDictionary = ent; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 332cd24d..86add86b 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -111,9 +111,9 @@ namespace morphstore{ } } - // this function differentiates, whether the file is a vertex or relation and puts it into the specific vector + // this function differentiates, whether the file is a vertex or edge and puts it into the specific vector void differentiate(std::string path, std::string dir) { - // if the string contains a '_' -> it's a relation file; otherwise a vertex file + // if the string contains a '_' -> it's a edge file; otherwise a vertex file // if string contains word_word it is an edge files (vertex files only contain one word) // a vertex file contains exactly one word and after that only numbers are allowed f.i. _0_0 @@ -133,8 +133,6 @@ namespace morphstore{ std::cout << "(1/2) Generating LDBC-Vertices ..."; std::cout.flush(); - //this variable is used for the vertexTypeLookup-keys, starting by 0 - unsigned short int vertexTypeNumber = 0; // iterate through vector of vertex-addresses for (const auto &file : verticesPaths) @@ -143,6 +141,7 @@ namespace morphstore{ std::vector attributes; std::string vertexType = getEntityType(file); + int vertexTypeNumber = get_vertex_type_number(vertexType); char *buffer; @@ -160,10 +159,11 @@ namespace morphstore{ // calculate file size if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast(vertexFile.tellg()); vertexFile.clear(); - vertexFile.seekg(0, - std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + vertexFile.seekg(0, std::ios::beg); } // allocate memory @@ -209,8 +209,7 @@ namespace morphstore{ std::string ldbcID = row.substr(0, row.find(delimiter)); while ((next = row.find(delimiter, last)) != std::string::npos) { - properties.insert( - std::make_pair(attributes[attrIndex], row.substr(last, next - last))); + properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next - last))); last = next + 1; ++attrIndex; } @@ -234,33 +233,28 @@ namespace morphstore{ delete[] buffer; // free memory vertexFile.close(); - // insert vertexType-number with string into map - vertexTypeLookup.insert(std::make_pair(vertexTypeNumber, vertexType)); ++vertexTypeNumber; attributes.clear(); } - // graph gets full vertexType-list here: - graph.setVertexTypeDictionary(vertexTypeLookup); } - // function which returns true, if parameter is a vertexType in ldbc-files - bool is_vertexType(const std::string &vertexType) { + // function which returns the vertex_type_number if parameter is a vertexType in the ldbc-files. else -1 + int get_vertex_type_number(const std::string &vertexType) { // iterate through entities-map to look up for paramater for (auto const &entry : vertexTypeLookup) { if (entry.second == vertexType) { - return true; + return entry.first; } } - return false; + return -1; } - // function which returns true, if the relation already exist - bool exist_relation_name(const std::string &relation) { - // Todo: replace whole function by by entitiesLookup.find(relation) + // function which returns true, if the edge type already exist + bool exist_edge_type_name(const std::string &edge_type) { // iterate through edges-map to look up for paramater for (auto const &entry : edgeTypeLookup) { - if (entry.second == relation) { + if (entry.second == edge_type) { return true; } } @@ -300,19 +294,19 @@ namespace morphstore{ if (!edgesPaths.empty()) { - // iterate through vector of relation-addresses + // iterate through vector of edge-type file paths for (const auto &file : edgesPaths) { - std::string relation = getEntityType(file); + std::string edge_type = getEntityType(file); // TOdo: use regExp ([a-zA-Z]+)_([a-zA-Z]+)_([a-zA-Z]+) - std::string sourceVertexType = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); + std::string sourceVertexType = edge_type.substr(0, edge_type.find('_')); + edge_type.erase(0, edge_type.find('_') + 1); - std::string edgeType = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); + std::string edgeType = edge_type.substr(0, edge_type.find('_')); + edge_type.erase(0, edge_type.find('_') + 1); - std::string targetVertexType = relation; + std::string targetVertexType = edge_type; char *buffer; @@ -340,8 +334,8 @@ namespace morphstore{ edgeFile.read(buffer, fileSize); // read data as one big block bool firstLine = true; - // check from file name whether it's a relation file or multi value attribute file - if (is_vertexType(targetVertexType)) { + // check from file name whether it's a edge file or multi value attribute file + if (get_vertex_type_number(targetVertexType) != -1) { for (size_t i = 0; i < fileSize; ++i) { if (buffer[i] == '\n') { @@ -364,72 +358,62 @@ namespace morphstore{ return result; } - // get number of vertices from files + // get number of vertices from files and fill vertexTypeDictionary uint64_t get_total_number_vertices() { uint64_t result = 0; - if (!verticesPaths.empty()) { - - //this variable is used for the vertexTypeLookup-keys, starting by 0 - - // iterate through vector of vertex-addresses - for (const auto &file : verticesPaths) { - std::string vertexType = getEntityType(file); - - char *buffer; - - uint64_t fileSize = 0; - - std::string address = getDirectory() + file; - - std::ifstream vertexFile(address, std::ios::binary | + // iterate through vector of vertex-addresses + for (const auto &file : verticesPaths) { + char *buffer; + uint64_t fileSize = 0; + std::string address = getDirectory() + file; + std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening - if (!vertexFile) { - std::cerr << "Error, opening file. "; - exit(EXIT_FAILURE); - } - - // calculate file size - if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - vertexFile.clear(); - vertexFile.seekg(0, - std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - } - - // allocate memory - buffer = (char *) malloc(fileSize * sizeof(char)); - vertexFile.read(buffer, fileSize); // read data as one big block - size_t start = 0; - std::string delimiter = "|"; + if (!vertexFile) { + std::cerr << "Error, opening file. "; + exit(EXIT_FAILURE); + } - // read buffer and do the magic ... - for (size_t i = 0; i < fileSize; ++i) { - if (buffer[i] == '\n') { - // get a row into string form buffer with start- and end-point - std::string row(&buffer[start], &buffer[i]); + // calculate file size + if (vertexFile.is_open()) { + fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + vertexFile.clear(); + // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + vertexFile.seekg(0, std::ios::beg); + } - // remove unnecessary '\n' at the beginning of a string - if (row.find('\n') != std::string::npos) { - row.erase(0, 1); - } + // allocate memory + buffer = (char *)malloc(fileSize * sizeof(char)); + vertexFile.read(buffer, fileSize); // read data as one big block + size_t start = 0; + std::string delimiter = "|"; - // first line of *.csv contains the attributes -> write to attributes vector - if (start != 0) { - ++result; - } + // read buffer and do the magic ... + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') + { + // get a row into string form buffer with start- and end-point + std::string row(&buffer[start], &buffer[i]); - start = i; // set new starting point for buffer (otherwise it's concatenated) + // remove unnecessary '\n' at the beginning of a string + if (row.find('\n') != std::string::npos) { + row.erase(0, 1); } - } - - delete[] buffer; // free memory - vertexFile.close(); + // first line of *.csv contains the attributes -> write to attributes vector + if (start != 0) { + ++result; + } + // set new starting point for buffer (otherwise it's concatenated) + start = i; + } } + + delete[] buffer; // free memory + vertexFile.close(); } return result; } @@ -442,24 +426,21 @@ namespace morphstore{ std::cout << "(2/2) Generating LDBC-Edges ..."; std::cout.flush(); - //this variable is used for the relationLookup-keys, starting by 0 + //this variable is used for the edgeLookup-keys, starting by 0 unsigned short int edgeTypeNumber = 0; - bool isEdge = false; // flag which is used to differentiate for edge-lookup-entrys (to avoid e.g. email as an edge) // iterate through vector of vertex-addresses for (const auto &file : edgesPaths) { + // get the edge-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + // TODO: use regExp + std::string edge_type = getEntityType(file); + std::string sourceVertexType = edge_type.substr(0, edge_type.find('_')); + edge_type.erase(0, edge_type.find('_') + 1); - isEdge = false; + std::string edgeType = edge_type.substr(0, edge_type.find('_')); + edge_type.erase(0, edge_type.find('_') + 1); - // get the relation-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment - std::string relation = getEntityType(file); - std::string sourceVertexType = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); - - std::string edgeType = relation.substr(0, relation.find('_')); - relation.erase(0, relation.find('_') + 1); - - std::string targetVertexType = relation; + std::string targetVertexType = edge_type; char* buffer; @@ -488,8 +469,8 @@ namespace morphstore{ size_t start = 0; std::string delimiter = "|"; - // check from file name whether it's a relation file or multi value attribute file - if(!is_vertexType(targetVertexType)){ + // check from file name whether it's an edge file or multi value attribute file + if(get_vertex_type_number(targetVertexType) == -1) { // Multi-value-attributes: just take the last recently one std::string propertyKey; std::unordered_map multiValueAttr; @@ -526,10 +507,13 @@ namespace morphstore{ } } - // handling of relation-files ... + // handling of edge-files ... else{ - - isEdge = true; + // check if the name already exists + if (!exist_edge_type_name(edgeType)) { + ++edgeTypeNumber; + edgeTypeLookup.insert(std::make_pair(edgeTypeNumber, edgeType)); + } bool hasProperties = false; std::string propertyKey; @@ -551,10 +535,10 @@ namespace morphstore{ size_t count = 0; // first line of *.csv: Differentiate whether it's - // (1) relation without properties: e.g. Person.id|Person.id -> #delimiter = 1 - // (2) relation with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 + // (1) edge without properties: e.g. Person.id|Person.id -> #delimiter = 1 + // (2) edge with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 if(start == 0){ - // if there are 2 delimiter ('|') -> relation file with properties + // if there are 2 delimiter ('|') -> edge file with properties while ((next = row.find(delimiter, last)) != std::string::npos){ last = next + 1; ++count; @@ -563,7 +547,7 @@ namespace morphstore{ hasProperties = true; propertyKey = row.substr(last); } - }else{ + } else { // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property // get the system-(global) id's from local ids sourceVertexId = globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); @@ -574,7 +558,7 @@ namespace morphstore{ // WITHOUT properties: just from the first delimiter on targetVertexId = globalIdLookupMap.at({targetVertexType, row}); - // insert relation into vertexRealtionsLookup: + // insert edge into vertexRealtionsLookup: vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber)); }else{ // with properties means: toID is until the next delimiter, and then the value for the property @@ -582,7 +566,7 @@ namespace morphstore{ row.erase(0, row.find(delimiter) + delimiter.length()); value = row; - // insert relation into vertexEdgesLookup with its edge-property: + // insert edge into vertexEdgesLookup with its edge-property: vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber, {{propertyKey, value}})); } } @@ -592,19 +576,8 @@ namespace morphstore{ } delete[] buffer; // free memory edgeFile.close(); - - //check if the relation name is a relation (no multi value file) - if(isEdge){ - // check if the name already exists - if(!exist_relation_name(edgeType)){ - // insert relation-number with string into map - edgeTypeLookup.insert(std::make_pair( edgeTypeNumber, edgeType)); - ++edgeTypeNumber; - } - } - } - // graph gets full relation-list here: + // graph gets full edge-type-list here: graph.setEdgeTypeDictionary(edgeTypeLookup); } } @@ -632,6 +605,14 @@ namespace morphstore{ } } + void generate_vertex_type_lookup() { + uint64_t vertex_type_number = 0; + for(std::string vertex_file: verticesPaths) { + vertexTypeLookup.insert(std::make_pair(vertex_type_number, getEntityType(vertex_file))); + vertex_type_number++; + } + } + // MAIN IMPORT FUNCTION: see steps in comments void import(Graph& graph) { std::cout << "Importing LDBC-files into graph ... "; @@ -639,6 +620,11 @@ namespace morphstore{ // (1) get number vertices and number edges: uint64_t numberVertices = get_total_number_vertices(); + + // populate vertex_type_lookup for differentiating between edge and property files + generate_vertex_type_lookup(); + graph.setVertexTypeDictionary(vertexTypeLookup); + uint64_t numberEdges = get_total_number_edges(); // (2) allocate graph memory From 8822abc19b78b1875901c24e085476bd778fef56 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 30 Mar 2020 18:38:55 +0200 Subject: [PATCH 106/216] Add vscode to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c1de3bc9..c54d8736 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ recentMorphStoreProjectConf.log Dockerfile .DS_Store doc/doxygen/latex -doc/doxygen/html \ No newline at end of file +doc/doxygen/html +.vscode/ \ No newline at end of file From f3520d16373d5f0f761472aaea889b83204a364a Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 30 Mar 2020 19:31:11 +0200 Subject: [PATCH 107/216] Put vertex properties into seperate container thus remaining vertex objects are fixed sized now .. also removing duplicate code --- .../storage/graph/formats/adjacencylist.h | 47 +-------- include/core/storage/graph/formats/csr.h | 45 +-------- include/core/storage/graph/graph.h | 96 +++++++++++++++++-- include/core/storage/graph/vertex/vertex.h | 54 +++++------ 4 files changed, 116 insertions(+), 126 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 1d3ebc67..b3574403 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -44,21 +44,8 @@ namespace morphstore{ // function: to set graph allocations void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { - vertices.reserve(numberVertices); + Graph::allocate_graph_structure(numberVertices, numberEdges); adjacencylistPerVertex.reserve(numberVertices); - edges.reserve(numberEdges); - - this->expectedEdgeCount = numberEdges; - this->expectedVertexCount = numberVertices; - } - - // function to add a single property to vertex - void add_property_to_vertex(uint64_t id, const std::pair property) override { - if (exist_vertexId(id)) { - vertices[id]->add_property(property); - } else { - std::cout << "Vertex with ID " << id << " not found." << std::endl; - } } // adding a single edge to vertex: @@ -123,36 +110,8 @@ namespace morphstore{ // for measuring the size in bytes: std::pair get_size_of_graph() override { std::pair index_data_size; - size_t data_size = 0; - size_t index_size = 0; - - // lookup type dicts - index_size += 2 * sizeof(std::map); - for(auto& ent : vertexTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(ent.second.length()); - } - for(auto& rel : edgeTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(rel.second.length()); - } - - // container for indexes: - index_size += sizeof(std::unordered_map>); - for(auto& it : vertices){ - // index size of vertex: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += it.second->get_data_size_of_vertex(); - } - - index_size += sizeof(std::unordered_map>); - for(auto& it : edges){ - // index size of edge: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += it.second->size_in_bytes(); - } + + auto [index_size, data_size] = Graph::get_size_of_graph(); // adjacencyListPerVertex for(auto& it : adjacencylistPerVertex){ diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 4f293a30..d6999b24 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -49,11 +49,7 @@ namespace morphstore{ // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { - this->expectedVertexCount = numberVertices; - this->expectedEdgeCount = numberEdges; - - vertices.reserve(numberVertices); - edges.reserve(numberEdges); + Graph::allocate_graph_structure(numberVertices, numberEdges); offset_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); edgeId_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); @@ -88,15 +84,6 @@ namespace morphstore{ } } - // function to add a single property to vertex - void add_property_to_vertex(uint64_t id, const std::pair property) override { - if(exist_vertexId(id)){ - vertices[id]->add_property(property); - }else{ - std::cout << "Vertex with ID " << id << " not found./property_to_vertex" << std::endl; - } - } - // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) override { uint64_t offset = offset_array[id]; @@ -139,36 +126,8 @@ namespace morphstore{ // get size of storage format: std::pair get_size_of_graph() override { std::pair index_data_size; - size_t data_size = 0; - size_t index_size = 0; - // TODO: use Graph::get_size_of_graph() for vertices, edges, vertexTypeDictionary and edgeTypeDictionary - - // lookup dicts: entity dict + relation dict. - index_size += 2 * sizeof(std::map); - for(auto& ent : vertexTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(ent.second.length()); - } - for(auto& rel : edgeTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(rel.second.length()); - } - - // container for vertices: - index_size += sizeof(std::unordered_map>); - for(auto& it : vertices){ - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - data_size += it.second->get_data_size_of_vertex(); - } - // container for edges: - index_size += sizeof(std::unordered_map>); - for(auto& it : edges){ - // index size of edge: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += it.second->size_in_bytes(); - } + auto [index_size, data_size] = Graph::get_size_of_graph(); // pointer to arrays: index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index e4b35ccf..24aa7355 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -51,6 +51,12 @@ namespace morphstore{ std::unordered_map> vertices; std::unordered_map> edges; + // store outside of entity objects as they have a variable size and can be better compressed this way + // TODO: try other property storage formats than per node .. (triple-store or per property) + std::unordered_map> vertex_properties; + std::unordered_map> edge_properties; + + // Lookup for types: number to string std::map vertexTypeDictionary; std::map edgeTypeDictionary; @@ -97,9 +103,11 @@ namespace morphstore{ uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { assert(expectedVertexCount > getVertexCount()); - std::shared_ptr v = std::make_shared(getNextVertexId(), type, props); - vertices[v->getID()] = v; - return v->getID(); + std::shared_ptr v = std::make_shared(getNextVertexId(), type); + uint64_t id = v->getID(); + vertices[id] = v; + vertex_properties.insert(std::make_pair(id, props)); + return id; }; std::string get_vertexType_by_number(unsigned short int type){ @@ -136,8 +144,8 @@ namespace morphstore{ } // function which returns a pointer to vertex by id - std::shared_ptr get_vertex(uint64_t id){ - return vertices[id]; + VertexWithProperties get_vertex(uint64_t id){ + return VertexWithProperties(vertices[id], vertex_properties[id]); } // function which returns a pointer to vertex by id @@ -185,16 +193,83 @@ namespace morphstore{ fs.close(); } + void add_property_to_vertex(uint64_t id, const std::pair property) { + vertex_properties[id].insert(property); + }; + // -------------------- pure virtual functions -------------------- virtual std::string get_storage_format() const = 0; - virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) = 0; - virtual void add_property_to_vertex(uint64_t id, const std::pair property) = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; - virtual std::pair get_size_of_graph() = 0; + + virtual std::pair get_size_of_graph(){ + std::pair index_data_size; + size_t data_size = 0; + size_t index_size = 0; + + // lookup type dicts + index_size += 2 * sizeof(std::map); + for(auto& ent : vertexTypeDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(ent.second.length()); + } + for(auto& rel : edgeTypeDictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(rel.second.length()); + } + + // container for indexes: + index_size += sizeof(std::unordered_map>); + for(auto& it : vertices){ + // index size of vertex: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += it.second->get_data_size_of_vertex(); + } + + index_size += sizeof(std::unordered_map>); + for(auto& it : edges){ + // index size of edge: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += it.second->size_in_bytes(); + } + + // TODO: extra propertymappings class + + // node-properties: + index_size += sizeof(std::unordered_map>); + for(auto& property_mapping: vertex_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { + data_size += sizeof(char) * (property->first.length() + property->second.length()); + } + } + + // edge-properties: + index_size += sizeof(std::unordered_map>); + for(auto& property_mapping: edge_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { + data_size += sizeof(char) * (property->first.length() + property->second.length()); + } + } + + return index_data_size; + }; + + virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) { + this->expectedVertexCount = numberVertices; + this->expectedEdgeCount = numberEdges; + + vertices.reserve(numberVertices); + vertex_properties.reserve(numberVertices); + edges.reserve(numberEdges); + edge_properties.reserve(numberEdges); + }; // -------------------- debugging functions -------------------- @@ -215,7 +290,10 @@ namespace morphstore{ std::cout << "Type: \t" << get_vertexType_by_number(v->getType()) << std::endl; std::cout << "\n"; std::cout << "Properties: "; - v->print_properties(); + for (const auto entry : vertex_properties[id]) { + std::cout << "{" << entry.first << ": " << entry.second << "}"; + } + std::cout << "\n"; std::cout << "#Edges: " << this->get_out_degree(v->getID()); std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index b9f6470b..129bdc1d 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace morphstore{ @@ -39,17 +40,12 @@ namespace morphstore{ uint64_t id; // optional: type, properties unsigned short int type; - std::unordered_map properties; public: - - // ----------------- Setter & Getter ----------------- - - Vertex(uint64_t id, unsigned short int type, const std::unordered_map props){ + Vertex(uint64_t id, unsigned short int type){ this->id = id; this->type = type; - this->properties = props; } uint64_t getID(){ @@ -60,40 +56,38 @@ namespace morphstore{ return type; } - void setType(const unsigned short type) { - Vertex::type = type; - } - - const std::unordered_map &getProperties() const { - return properties; - } - - // function that adds a single property key-value pair to vertex - void add_property(const std::pair property){ - this->properties[property.first] = property.second;//std::move(property.second); - } - // get size of vertex in bytes: size_t get_data_size_of_vertex() { size_t size = 0; size += sizeof(uint64_t); // id size += sizeof(unsigned short int); // entity - // properties: - size += sizeof(std::unordered_map); - for(std::unordered_map::iterator property = properties.begin(); property != properties.end(); ++property){ - size += sizeof(char)*(property->first.length() + property->second.length()); - } return size; } + }; - // ----------------- DEBUGGING ----------------- - void print_properties() { - for (const auto entry : properties) { - std::cout << "{" << entry.first << ": " << entry.second << "}"; + // convinience class for returning whole vertices + class VertexWithProperties { + private: + std::shared_ptr vertex; + std::unordered_map properties; + public: + VertexWithProperties(std::shared_ptr vertex, const std::unordered_map properties) { + this->vertex = vertex; + this->properties = properties; + } + + uint64_t getID() { + return vertex->getID(); + } + + unsigned short getType() const { + return vertex->getType(); + } + + std::unordered_map getProperties() { + return properties; } - std::cout << "\n"; - } }; } From ef5826e4d2fa2c0018616a75b7d36717f356c777 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 31 Mar 2020 11:40:32 +0200 Subject: [PATCH 108/216] Put edge properties in a seperate container now stored together .. making the edge object fixed sized --- include/core/storage/graph/edge/edge.h | 47 ++++++++----------- include/core/storage/graph/graph.h | 14 ++++-- include/core/storage/graph/ldbc_import.h | 16 ++++++- .../storage/graph/simple/simple_graph_test.h | 7 ++- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 1ad1748e..b04e3b09 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -29,6 +29,8 @@ #include #include #include +#include + namespace morphstore{ class Edge{ @@ -38,19 +40,16 @@ namespace morphstore{ uint64_t sourceID, targetID, id; unsigned short int type; - std::unordered_map properties; - uint64_t getNextEdgeId() const { static uint64_t currentMaxEdgeId = 0; return currentMaxEdgeId++; } public: - Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type, const std::unordered_map properties = {}){ + Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type){ this->sourceID = sourceId; this->targetID = targetId; this->type = type; - this->properties = properties; this->id = getNextEdgeId(); } @@ -64,7 +63,6 @@ namespace morphstore{ this->sourceID = edge.sourceID; this->targetID = edge.targetID; this->type = edge.type; - this->properties = edge.properties; // return the existing object so we can chain this operator return *this; @@ -88,17 +86,6 @@ namespace morphstore{ return type; } - const std::unordered_map &getProperties() const { - return properties; - } - - void setProperty(const std::pair prop) { - // first check if there is any key value data, otherwise problems with segfaults - if(prop.first != "" && prop.second != ""){ - properties[prop.first] = prop.second; - } - } - // function for sorting algorithms in the ldbc-importer: // compare target-ids and return if it's "lower" (we need the sorting for the CSR) bool operator<(const Edge& e) const{ @@ -110,23 +97,27 @@ namespace morphstore{ size_t size = 0; size += sizeof(uint64_t) * 2; // source- and target-id size += sizeof(unsigned short int); // relation - - // properties: - size += sizeof(std::unordered_map); - for(auto property = properties.begin(); property != properties.end(); ++property){ - size += sizeof(char)*(property->first.length() + property->second.length()); - } return size; } + }; + class EdgeWithProperties { + private: + std::shared_ptr edge; + std::unordered_map properties; + public: + EdgeWithProperties(std::shared_ptr edge, const std::unordered_map properties) { + this->edge = edge; + this->properties = properties; + } - // ----------------- DEBUGGING ----------------- - void print_properties() { - std::cout << std::endl; - for (const auto entry : properties) { - std::cout << " {" << entry.first << ": " << entry.second << "}" << std::endl; + std::shared_ptr getEdge() { + return edge; + } + + std::unordered_map getProperties() { + return properties; } - } }; } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 24aa7355..cfd74c7c 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -149,8 +149,8 @@ namespace morphstore{ } // function which returns a pointer to vertex by id - std::shared_ptr get_edge(uint64_t id){ - return edges[id]; + EdgeWithProperties get_edge(uint64_t id){ + return EdgeWithProperties(edges[id], edge_properties[id]); } // function to return a list of pair < vertex id, degree > DESC: @@ -197,6 +197,10 @@ namespace morphstore{ vertex_properties[id].insert(property); }; + void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { + edge_properties[id] = properties; + }; + // -------------------- pure virtual functions -------------------- virtual std::string get_storage_format() const = 0; @@ -279,7 +283,9 @@ namespace morphstore{ virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getVertexCount() << std::endl; + std::cout << "Number of vertices with properties:" << vertex_properties.size() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; + std::cout << "Number of edges with properties:" << edge_properties.size() << std::endl; std::cout << "--------------------------------------------" << std::endl; } @@ -308,7 +314,9 @@ namespace morphstore{ std::cout << "Type: \t" << get_edgeType_by_number(edge->getType()) << std::endl; std::cout << "\n"; std::cout << "Properties: "; - edge->print_properties(); + for (const auto entry : edge_properties[id]) { + std::cout << "{" << entry.first << ": " << entry.second << "}"; + } std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 86add86b..39e21502 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -63,6 +63,7 @@ namespace morphstore{ // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) std::unordered_map> vertexEdgesLookup; + std::unordered_map> edgeProperties; public: @@ -283,6 +284,7 @@ namespace morphstore{ edgeTypeLookup.clear(); vertexTypeLookup.clear(); edgesPaths.clear(); + edgeProperties.clear(); verticesPaths.clear(); vertexEdgesLookup.clear(); } @@ -567,7 +569,10 @@ namespace morphstore{ value = row; // insert edge into vertexEdgesLookup with its edge-property: - vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber, {{propertyKey, value}})); + auto edge = morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber); + vertexEdgesLookup[sourceVertexId].push_back(edge); + // assuming all properties of an edge are defined in the same file + edgeProperties[edge.getId()] = {{propertyKey, value}}; } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -600,8 +605,15 @@ namespace morphstore{ uint64_t graphSize = graph.getVertexCount(); for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ + auto edges = vertexEdgesLookup[vertexID]; // add edge data: - graph.add_edges(vertexID, vertexEdgesLookup[vertexID]); + graph.add_edges(vertexID, edges); + for(auto edge: edges) { + auto entry = edgeProperties.find(edge.getId()); + if (entry != edgeProperties.end()) { + graph.add_properties_to_edge(entry->first, entry->second); + } + } } } diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 6a52fb9a..8d72f052 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -49,7 +49,10 @@ void simpleGraphFormatTest (void) { uint64_t v2 = graph->add_vertex(0); uint64_t v3 = graph->add_vertex(0); - graph->add_edges(v1, {morphstore::Edge(v1, v2, 1, {{"rating", "42"}, {"description", "has the answer to everything"}})}); + auto e1 = morphstore::Edge(v1, v2, 1); + + graph->add_edges(v1, {e1}); + graph->add_properties_to_edge(e1.getId(), {{"rating", "42"}, {"description", "has the answer to everything"}}); graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); // (DEBUG) @@ -61,7 +64,7 @@ void simpleGraphFormatTest (void) { assert(graph->getVertexCount() == 3); assert(graph->getEdgeCount() == 3); - assert((int)graph->get_edge(0)->getProperties().size() == 2); + assert((int)graph->get_edge(e1.getId()).getProperties().size() == 2); assert(graph->get_out_degree(v3) == 0); assert(graph->get_out_degree(v1) == 1); assert(graph->get_out_degree(v2) == 2); From e28ba8380da0fd5411de74657c7351a661b4e543 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 31 Mar 2020 18:37:36 +0200 Subject: [PATCH 109/216] Allow properties of different types by using std::variant. currently std::string and uint64_t --- include/core/storage/graph/edge/edge.h | 8 +- include/core/storage/graph/graph.h | 33 +++-- include/core/storage/graph/ldbc_import.h | 50 ++++--- include/core/storage/graph/ldbc_schema.h | 124 ++++++++++++++++++ include/core/storage/graph/property_type.h | 48 +++++++ include/core/storage/graph/vertex/vertex.h | 8 +- .../storage/graph/simple/simple_graph_test.h | 12 +- 7 files changed, 242 insertions(+), 41 deletions(-) create mode 100644 include/core/storage/graph/ldbc_schema.h create mode 100644 include/core/storage/graph/property_type.h diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index b04e3b09..6b05d567 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -24,6 +24,8 @@ #ifndef MORPHSTORE_EDGE_H #define MORPHSTORE_EDGE_H +#include "../property_type.h" + #include #include #include @@ -104,9 +106,9 @@ namespace morphstore{ class EdgeWithProperties { private: std::shared_ptr edge; - std::unordered_map properties; + std::unordered_map properties; public: - EdgeWithProperties(std::shared_ptr edge, const std::unordered_map properties) { + EdgeWithProperties(std::shared_ptr edge, const std::unordered_map properties) { this->edge = edge; this->properties = properties; } @@ -115,7 +117,7 @@ namespace morphstore{ return edge; } - std::unordered_map getProperties() { + std::unordered_map getProperties() { return properties; } }; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index cfd74c7c..0118ca72 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -26,6 +26,7 @@ #include "vertex/vertex.h" #include "edge/edge.h" +#include "property_type.h" #include #include @@ -53,8 +54,8 @@ namespace morphstore{ // store outside of entity objects as they have a variable size and can be better compressed this way // TODO: try other property storage formats than per node .. (triple-store or per property) - std::unordered_map> vertex_properties; - std::unordered_map> edge_properties; + std::unordered_map> vertex_properties; + std::unordered_map> edge_properties; // Lookup for types: number to string @@ -101,12 +102,14 @@ namespace morphstore{ return edges.size(); } - uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { + uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { assert(expectedVertexCount > getVertexCount()); std::shared_ptr v = std::make_shared(getNextVertexId(), type); uint64_t id = v->getID(); vertices[id] = v; - vertex_properties.insert(std::make_pair(id, props)); + if (!props.empty()) { + vertex_properties.insert(std::make_pair(id, props)); + } return id; }; @@ -193,11 +196,11 @@ namespace morphstore{ fs.close(); } - void add_property_to_vertex(uint64_t id, const std::pair property) { + void add_property_to_vertex(uint64_t id, const std::pair property) { vertex_properties[id].insert(property); }; - void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { + void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { edge_properties[id] = properties; }; @@ -248,8 +251,8 @@ namespace morphstore{ index_size += sizeof(std::unordered_map>); for(auto& property_mapping: vertex_properties) { index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { - data_size += sizeof(char) * (property->first.length() + property->second.length()); + for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { + data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); } } @@ -257,8 +260,8 @@ namespace morphstore{ index_size += sizeof(std::unordered_map>); for(auto& property_mapping: edge_properties) { index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { - data_size += sizeof(char) * (property->first.length() + property->second.length()); + for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { + data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); } } @@ -297,7 +300,10 @@ namespace morphstore{ std::cout << "\n"; std::cout << "Properties: "; for (const auto entry : vertex_properties[id]) { - std::cout << "{" << entry.first << ": " << entry.second << "}"; + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; } std::cout << "\n"; std::cout << "#Edges: " << this->get_out_degree(v->getID()); @@ -315,7 +321,10 @@ namespace morphstore{ std::cout << "\n"; std::cout << "Properties: "; for (const auto entry : edge_properties[id]) { - std::cout << "{" << entry.first << ": " << entry.second << "}"; + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; } std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 39e21502..544d744b 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -18,15 +18,14 @@ /** * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph in CSR or AdjList - * @todo + * @todo support for array properties (for simplicity only last one take currently) */ #ifndef MORPHSTORE_LDBC_IMPORT_H #define MORPHSTORE_LDBC_IMPORT_H -#include -#include - +#include +#include "ldbc_schema.h" #include #include @@ -36,6 +35,13 @@ #include #include #include +#include +#include +#include +#include + + + // hash function used to hash a pair of any kind using XOR (for verticesMap) struct hash_pair { @@ -63,11 +69,12 @@ namespace morphstore{ // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) std::unordered_map> vertexEdgesLookup; - std::unordered_map> edgeProperties; + std::unordered_map> edgeProperties; public: - // Constructor: needs the address of the csv files + // Constructor: needs the address of the csv files !!! both static and dynamic in one folder + // LDBCImport(const std::string &dir) { directory = dir; insert_file_names(directory); @@ -139,7 +146,7 @@ namespace morphstore{ for (const auto &file : verticesPaths) { // data structure for attributes of entity, e.g. taglass -> id, name, url - std::vector attributes; + std::vector> attributes; std::string vertexType = getEntityType(file); int vertexTypeNumber = get_vertex_type_number(vertexType); @@ -193,29 +200,37 @@ namespace morphstore{ // first line of *.csv contains the attributes -> write to attributes vector if (start == 0) { + std::string property_key; + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector while ((next = row.find(delimiter, last)) != std::string::npos) { - attributes.push_back(row.substr(last, next - last)); + property_key = row.substr(last, next - last); + attributes.push_back(std::make_pair(property_key, get_data_type(vertexType, property_key))); last = next + 1; } // last attribute - attributes.push_back(row.substr(last)); + property_key = row.substr(last); + attributes.push_back(std::make_pair(property_key, get_data_type(vertexType, property_key))); } else { // actual data: - std::unordered_map properties; + std::unordered_map properties; size_t attrIndex = 0; std::string ldbcID = row.substr(0, row.find(delimiter)); while ((next = row.find(delimiter, last)) != std::string::npos) - { - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last, next - last))); + { + auto key_to_datatype = attributes[attrIndex]; + property_type property_value = convert_property_value(row.substr(last, next - last), key_to_datatype.second); + properties.insert(std::make_pair(key_to_datatype.first, property_value)); last = next + 1; ++attrIndex; } // last attribute - properties.insert(std::make_pair(attributes[attrIndex], row.substr(last))); + auto key_to_datatype = attributes[attrIndex]; + property_type propertyValue = convert_property_value(row.substr(last), key_to_datatype.second); + properties.insert(std::make_pair(key_to_datatype.first, propertyValue)); //----------------------------------------------------- // create vertex and insert into graph with properties @@ -475,9 +490,10 @@ namespace morphstore{ if(get_vertex_type_number(targetVertexType) == -1) { // Multi-value-attributes: just take the last recently one std::string propertyKey; - std::unordered_map multiValueAttr; + Ldbc_Data_Type data_type; + std::unordered_map multiValueAttr; uint64_t systemID; - std::string value; + property_type value; for(size_t i = 0; i < fileSize; ++i){ if(buffer[i] == '\n'){ @@ -492,10 +508,12 @@ namespace morphstore{ // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' if(start == 0){ propertyKey = row.substr(row.find(delimiter) + 1); + data_type = get_data_type(sourceVertexType ,propertyKey); + }else{ // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) systemID = globalIdLookupMap[{sourceVertexType, row.substr(0, row.find(delimiter))}]; - value = row.substr(row.find(delimiter) + 1); + value = convert_property_value(row.substr(row.find(delimiter) + 1), data_type); multiValueAttr[systemID] = std::move(value); } diff --git a/include/core/storage/graph/ldbc_schema.h b/include/core/storage/graph/ldbc_schema.h new file mode 100644 index 00000000..c33b86e3 --- /dev/null +++ b/include/core/storage/graph/ldbc_schema.h @@ -0,0 +1,124 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file lbc_schema.h + * @brief Schema of the LDBC graph + * @todo +*/ + +#ifndef MORPHSTORE_LDBC_SCHEMA_H +#define MORPHSTORE_LDBC_SCHEMA_H + +#include "property_type.h" + +#include +#include +#include + +namespace morphstore{ + + enum class Ldbc_Data_Type {LONG_STRING, STRING, TEXT, INT_32, ID, DATE_TIME, DATE}; + + // static not included -> f.i. hasTag edge seen as property tag.id + static const std::map> ldbc_dynamic_schema{ + { + // vertices + {"person", { + {"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"firstName", Ldbc_Data_Type::STRING}, + {"lastName", Ldbc_Data_Type::STRING}, + {"gender", Ldbc_Data_Type::STRING}, + {"birthday", Ldbc_Data_Type::DATE}, + // !TODO actually an array of emails + {"email", Ldbc_Data_Type::LONG_STRING}, + // !TODO actually an array of languages + + // (and not currently filled as csv header contains "language") + //{"speaks", Ldbc_Data_Type::STRING}, + // TODO actually values for "speaks" array + {"language", Ldbc_Data_Type::STRING}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}}}, + {"forum", { + {"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"title", Ldbc_Data_Type::LONG_STRING}}}, + {"post", { + {"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}, + {"length", Ldbc_Data_Type::INT_32}, + // TODO: extra nullable type for the following 3: like TEXT? + {"content", Ldbc_Data_Type::TEXT}, + {"language", Ldbc_Data_Type::STRING}, + {"imageFile", Ldbc_Data_Type::STRING}}}, + {"comment", { + {"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}, + {"content", Ldbc_Data_Type::TEXT}, + {"length", Ldbc_Data_Type::INT_32}, + }}, + // edges + {"likes", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, + {"hasMember", {{"joinDate", Ldbc_Data_Type::DATE_TIME}}}, + {"hasModerator", {}}, + {"hasCreator", {}}, + {"containerOf", {}}, + {"replyOf", {}}, + {"knows", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, + + }}; + + Ldbc_Data_Type get_data_type(std::string entity_type, std::string property_key) { + auto perEntity = ldbc_dynamic_schema.find(entity_type); + if (perEntity != ldbc_dynamic_schema.end()) { + auto propertiesMap = perEntity->second; + auto propertyEntry = propertiesMap.find(property_key); + if (propertyEntry != propertiesMap.end()) { + return propertyEntry->second; + } + } + + // ldbc id is saved as an extra property as morphstore::graph generates new ones + // static part of social network not included thus saved as property (!!wrongly!!) + std::regex id_reg_exp("(Tag|Place)+\\.id\\s*"); + if(property_key == "id" || std::regex_match(property_key, id_reg_exp)) return Ldbc_Data_Type::ID; + + std::cout << "Could not find a data type for " << entity_type << " " << property_key; + return Ldbc_Data_Type::STRING; + } + + property_type convert_property_value(std::string value, Ldbc_Data_Type type) { + property_type converted_value; + + switch(type) { + case Ldbc_Data_Type::INT_32: + converted_value = std::stoi(value); + break; + case Ldbc_Data_Type::ID: + converted_value = std::stoull(value); + break; + default: + converted_value = value; + }; + + return converted_value; + } +} + +#endif //MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file diff --git a/include/core/storage/graph/property_type.h b/include/core/storage/graph/property_type.h new file mode 100644 index 00000000..7f133cd9 --- /dev/null +++ b/include/core/storage/graph/property_type.h @@ -0,0 +1,48 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file property_type.h + * @brief variant of supported data types as a property + * @todo +*/ + +#ifndef MORPHSTORE_PROPERTY_TYPE_H +#define MORPHSTORE_PROPERTY_TYPE_H + +#include +#include + +namespace morphstore{ + // only to used if properties are stored per node or triple store + // TODO: handle date and datetime properties and maybe text + using property_type = std::variant; + + struct PropertyValueVisitor { + void operator()(const std::string &s) const { + std::cout << "(string) " << s; + } + void operator()(uint64_t i) const + { + std::cout << "(uint_64t) " << i; + } + }; + +} + + +#endif //MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 129bdc1d..72d89cfe 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_VERTEX_H #define MORPHSTORE_VERTEX_H -#include "../edge/edge.h" +#include "../property_type.h" #include #include @@ -70,9 +70,9 @@ namespace morphstore{ class VertexWithProperties { private: std::shared_ptr vertex; - std::unordered_map properties; + std::unordered_map properties; public: - VertexWithProperties(std::shared_ptr vertex, const std::unordered_map properties) { + VertexWithProperties(std::shared_ptr vertex, const std::unordered_map properties) { this->vertex = vertex; this->properties = properties; } @@ -85,7 +85,7 @@ namespace morphstore{ return vertex->getType(); } - std::unordered_map getProperties() { + std::unordered_map getProperties() { return properties; } }; diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 8d72f052..d8503ade 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -52,15 +52,15 @@ void simpleGraphFormatTest (void) { auto e1 = morphstore::Edge(v1, v2, 1); graph->add_edges(v1, {e1}); - graph->add_properties_to_edge(e1.getId(), {{"rating", "42"}, {"description", "has the answer to everything"}}); + graph->add_properties_to_edge(e1.getId(), {{"rating", 42}, {"description", "has the answer to everything"}}); graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); // (DEBUG) - /*graph->statistics(); - graph->print_edge_by_id(0); - graph->print_neighbors_of_vertex(v1); - graph->print_neighbors_of_vertex(v2); - graph->print_neighbors_of_vertex(v3);*/ + graph->statistics(); + graph->print_edge_by_id(0); + graph->print_neighbors_of_vertex(v1); + graph->print_neighbors_of_vertex(v2); + graph->print_neighbors_of_vertex(v3); assert(graph->getVertexCount() == 3); assert(graph->getEdgeCount() == 3); From 30af372b194bc4c73024f18ecc6c1f34b55f0677 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 31 Mar 2020 19:56:06 +0200 Subject: [PATCH 110/216] Include static part of ldbc dataset - given directory now is expected to have 2 sub directories ("static" and "dynamic") --- include/core/storage/graph/ldbc_import.h | 89 +++++++++++++----------- include/core/storage/graph/ldbc_schema.h | 39 ++++++++--- 2 files changed, 78 insertions(+), 50 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 544d744b..7746a0d7 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -59,9 +59,9 @@ namespace morphstore{ class LDBCImport { private: - std::string directory; - std::vector verticesPaths; - std::vector edgesPaths; + std::filesystem::path base_directory; + std::vector verticesPaths; + std::vector edgesPaths; std::map vertexTypeLookup; std::map edgeTypeLookup; // data structure for lookup local ids with vertexType to global system id: (vertexType, ldbc_id) -> global id @@ -72,24 +72,24 @@ namespace morphstore{ std::unordered_map> edgeProperties; public: - - // Constructor: needs the address of the csv files !!! both static and dynamic in one folder - // + // directory including a static/ and dynamic/ directory like in /ldbc_snb_datagen/social_network/ LDBCImport(const std::string &dir) { - directory = dir; - insert_file_names(directory); + base_directory = dir; + insert_file_names(); } std::string getDirectory() const { - return directory; + return base_directory; } // get the vertex or edge type based on the fileName - std::string getEntityType(std::string fileName) { + std::string getEntityType(std::filesystem::path filePath) { // last [a-zA-Z] to remove ending _ std::regex typeRegExp("[a-zA-Z_]+[a-zA-Z]"); std::smatch match; + std::string fileName = filePath.filename().string(); + if(std::regex_search(fileName, match, typeRegExp)) { //std::cout << "EntityType: " << match[0] << std::endl; //std::cout.flush(); @@ -101,15 +101,22 @@ namespace morphstore{ } - // function which iterates through directory to receive file names (entire path) - void insert_file_names(std::string dir) { - for (const auto &entry : std::filesystem::directory_iterator(dir)) { - // ignore files starting with a '.' (+ 1 as '/' is the first character otherwise) - if (entry.path().string()[dir.size() + 1] == '.') { - continue; - } else { - // insert file path to vertices or edges vector - differentiate(entry.path().string(), dir); + // function which iterates through the base_directory to receive file names (entire path) + void insert_file_names() { + + std::filesystem::path dynamic_data_dir (base_directory / "dynamic"); + std::filesystem::path static_data_dir (base_directory / "static"); + std::vector dirs{dynamic_data_dir, static_data_dir}; + + for(const auto dir: dirs) { + for (const auto &entry : std::filesystem::directory_iterator(dir)) { + // ignore files starting with a '.' (+ 1 as '/' is the first character otherwise) + if (entry.path().string()[dir.u8string().length() + 1] == '.') { + continue; + } else { + // insert file path to vertices or edges vector + differentiate(entry.path().string()); + } } } @@ -120,18 +127,18 @@ namespace morphstore{ } // this function differentiates, whether the file is a vertex or edge and puts it into the specific vector - void differentiate(std::string path, std::string dir) { + void differentiate(std::filesystem::path path) { // if the string contains a '_' -> it's a edge file; otherwise a vertex file // if string contains word_word it is an edge files (vertex files only contain one word) - // a vertex file contains exactly one word and after that only numbers are allowed f.i. _0_0 - std::regex vertexFileRegExp("^\\/([a-zA-Z]+\\_)([0-9_]*).csv$"); - std::string fileName = path.substr(dir.size()); + // a vertex file name contains exactly one word and after that only numbers are allowed f.i. _0_0 + // .*\\/ for path marks the directory path + std::regex vertexFileRegExp("^(.*\\/)([a-zA-Z]+\\_)([0-9_]*).csv$"); - if (std::regex_match(fileName, vertexFileRegExp)) { - verticesPaths.push_back(fileName); + if (std::regex_match(path.u8string(), vertexFileRegExp)) { + verticesPaths.push_back(path); } else { - edgesPaths.push_back(fileName); + edgesPaths.push_back(path); } } @@ -141,7 +148,6 @@ namespace morphstore{ std::cout << "(1/2) Generating LDBC-Vertices ..."; std::cout.flush(); - // iterate through vector of vertex-addresses for (const auto &file : verticesPaths) { @@ -155,7 +161,7 @@ namespace morphstore{ uint64_t fileSize = 0; - std::string address = getDirectory() + file; + std::string address = file; std::ifstream vertexFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening @@ -201,17 +207,25 @@ namespace morphstore{ if (start == 0) { std::string property_key; - + Ldbc_Data_Type data_type; // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector while ((next = row.find(delimiter, last)) != std::string::npos) { property_key = row.substr(last, next - last); - attributes.push_back(std::make_pair(property_key, get_data_type(vertexType, property_key))); + data_type = get_data_type(vertexType, property_key); + if (data_type == Ldbc_Data_Type::ERROR) { + throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + " could not be found in schema"); + } + attributes.push_back(std::make_pair(property_key, data_type)); last = next + 1; } // last attribute property_key = row.substr(last); - attributes.push_back(std::make_pair(property_key, get_data_type(vertexType, property_key))); + data_type = get_data_type(vertexType, property_key); + if (data_type == Ldbc_Data_Type::ERROR) { + throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + " could not be found in schema"); + } + attributes.push_back(std::make_pair(property_key, data_type)); } else { @@ -329,9 +343,7 @@ namespace morphstore{ uint64_t fileSize = 0; - std::string address = getDirectory() + file; - - std::ifstream edgeFile(address, std::ios::binary | + std::ifstream edgeFile(file, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!edgeFile) { @@ -384,8 +396,7 @@ namespace morphstore{ for (const auto &file : verticesPaths) { char *buffer; uint64_t fileSize = 0; - std::string address = getDirectory() + file; - std::ifstream vertexFile(address, std::ios::binary | + std::ifstream vertexFile(file, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!vertexFile) { @@ -463,9 +474,7 @@ namespace morphstore{ uint64_t fileSize = 0; - std::string address = getDirectory() + file; - - std::ifstream edgeFile(address, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream edgeFile(file, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!edgeFile) { std::cerr << "Error, opening file. "; @@ -509,6 +518,8 @@ namespace morphstore{ if(start == 0){ propertyKey = row.substr(row.find(delimiter) + 1); data_type = get_data_type(sourceVertexType ,propertyKey); + if (data_type == Ldbc_Data_Type::ERROR) + throw std::invalid_argument(file.string() + ":" + edgeType + ":" + propertyKey + " could not be found in schema"); }else{ // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) diff --git a/include/core/storage/graph/ldbc_schema.h b/include/core/storage/graph/ldbc_schema.h index c33b86e3..3e92c2fd 100644 --- a/include/core/storage/graph/ldbc_schema.h +++ b/include/core/storage/graph/ldbc_schema.h @@ -32,10 +32,10 @@ namespace morphstore{ - enum class Ldbc_Data_Type {LONG_STRING, STRING, TEXT, INT_32, ID, DATE_TIME, DATE}; + enum class Ldbc_Data_Type {LONG_STRING, STRING, TEXT, INT_32, ID, DATE_TIME, DATE, ERROR}; // static not included -> f.i. hasTag edge seen as property tag.id - static const std::map> ldbc_dynamic_schema{ + static const std::map> ldbc_schema { { // vertices {"person", { @@ -71,22 +71,40 @@ namespace morphstore{ {"browserUsed", Ldbc_Data_Type::STRING}, {"locationIP", Ldbc_Data_Type::STRING}, {"content", Ldbc_Data_Type::TEXT}, - {"length", Ldbc_Data_Type::INT_32}, - }}, + {"length", Ldbc_Data_Type::INT_32}}}, + {"tagclass", { + {"name", Ldbc_Data_Type::LONG_STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}}}, + {"tag", { + {"name", Ldbc_Data_Type::LONG_STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}}}, + {"place", { + {"name", Ldbc_Data_Type::LONG_STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}, + {"type", Ldbc_Data_Type::STRING}}}, + {"organisation", { + {"name", Ldbc_Data_Type::LONG_STRING}, + {"type", Ldbc_Data_Type::STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}}}, // edges {"likes", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, {"hasMember", {{"joinDate", Ldbc_Data_Type::DATE_TIME}}}, {"hasModerator", {}}, {"hasCreator", {}}, + {"hasTag", {}}, {"containerOf", {}}, {"replyOf", {}}, + {"isSubclassOf", {}}, + {"isPartOf", {}}, + {"isLocatedIn", {}}, + {"studyAt", {{"classYear", Ldbc_Data_Type::INT_32}}}, + {"workAt", {{"workFrom", Ldbc_Data_Type::INT_32}}}, {"knows", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, - }}; Ldbc_Data_Type get_data_type(std::string entity_type, std::string property_key) { - auto perEntity = ldbc_dynamic_schema.find(entity_type); - if (perEntity != ldbc_dynamic_schema.end()) { + auto perEntity = ldbc_schema.find(entity_type); + if (perEntity != ldbc_schema.end()) { auto propertiesMap = perEntity->second; auto propertyEntry = propertiesMap.find(property_key); if (propertyEntry != propertiesMap.end()) { @@ -96,11 +114,10 @@ namespace morphstore{ // ldbc id is saved as an extra property as morphstore::graph generates new ones // static part of social network not included thus saved as property (!!wrongly!!) - std::regex id_reg_exp("(Tag|Place)+\\.id\\s*"); - if(property_key == "id" || std::regex_match(property_key, id_reg_exp)) return Ldbc_Data_Type::ID; + if(property_key == "id") return Ldbc_Data_Type::ID; - std::cout << "Could not find a data type for " << entity_type << " " << property_key; - return Ldbc_Data_Type::STRING; + //std::cout << "Could not find a data type for " << entity_type << " " << property_key; + return Ldbc_Data_Type::ERROR; } property_type convert_property_value(std::string value, Ldbc_Data_Type type) { From 07c193ed593d2eddf7d5bcc37abe50ad47d57571 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 31 Mar 2020 22:34:38 +0200 Subject: [PATCH 111/216] Add stub for vertex_storage benchmark --- src/CMakeLists.txt | 2 + src/microbenchmarks/graph/CMakeLists.txt | 7 ++ .../graph/vertex_storage_benchmark.cpp | 66 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 src/microbenchmarks/graph/CMakeLists.txt create mode 100644 src/microbenchmarks/graph/vertex_storage_benchmark.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b92fa98b..cfbc2244 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,8 @@ add_subdirectory( calibration ) add_subdirectory( examples ) add_subdirectory( microbenchmarks ) +add_subdirectory( microbenchmarks/graph ) + # There might be automatically generated subdirectories for the Star Schema # Benchmark (SSB), possibly with different scale factors. The following lines # add all of them. diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt new file mode 100644 index 00000000..662fc43d --- /dev/null +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -0,0 +1,7 @@ +if ( BUILD_ALL OR BUILD_MICROBMS ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/vertex_storage_benchmark_app ) + + add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) + + add_test( vertex_storage_benchmark vertex_storage_benchmark_app ) +endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp new file mode 100644 index 00000000..04a89c83 --- /dev/null +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -0,0 +1,66 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertex_storage_benchmark.cpp + * @brief A little mirco benchmark of the vertex storage (hashmap vs vector>). + */ + +#include +#include +#include + + + +int main(void) { + using namespace morphstore; + + for(int vertex_count=10000; vertex_count < 100000000; vertex_count = vertex_count*10) { + std::cout << "Testing graph with vertex count of:" << vertex_count << std::endl; + std::unique_ptr graph = std::make_unique(); + graph->allocate_graph_structure(vertex_count, 0); + for(int i=0; i < vertex_count; i++) { + graph->add_vertex(i); + } + + auto start = std::chrono::high_resolution_clock::now(); + + // iterate + for(int i=0; i < vertex_count; i++) { + graph->get_vertex(i); + } + + auto stop = std::chrono::high_resolution_clock::now(); + auto iteration_duration = std::chrono::duration_cast(stop - start).count(); + std::cout << "Iteration time: " << iteration_duration << "ms" << std::endl; + // random access + std::random_device rd; + std::uniform_int_distribution dist(0, vertex_count); + + start = std::chrono::high_resolution_clock::now(); + + for(int i=0; i < 10000; i++) { + graph->get_vertex(dist(rd)); + } + + stop = std::chrono::high_resolution_clock::now(); + auto random_access_duration = std::chrono::duration_cast(stop - start).count(); + std::cout << "Random access of 10000 vertices: " << random_access_duration << "ms" << std::endl; + } + + return 0; +} From 31f7761e14ff7a7d19e5bb4a23ce61729938ba68 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 3 Apr 2020 20:20:54 +0200 Subject: [PATCH 112/216] Add loading time and use an avg of 5 for the vertex_storage benchmark Add loading time to benchmark --- src/microbenchmarks/graph/CMakeLists.txt | 2 - .../graph/vertex_storage_benchmark.cpp | 60 +++++++++++++------ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index 662fc43d..8324ae0d 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -2,6 +2,4 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/vertex_storage_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) - - add_test( vertex_storage_benchmark vertex_storage_benchmark_app ) endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 04a89c83..1c94c857 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -25,41 +25,67 @@ #include +typedef std::chrono::high_resolution_clock highResClock; +using namespace morphstore; + +int64_t getDuration(std::chrono::time_point start) { + auto stop = highResClock::now(); + return std::chrono::duration_cast(stop - start).count(); +} int main(void) { - using namespace morphstore; + // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function + + int number_of_executions = 5; + + std::cout << "Test vertex storage structure (avg of 5 for full_iterate and random access) times in μs" << std::endl; + std::cout << "vertex_count | loading time | full_iterate | 10^4 random access" << std::endl; for(int vertex_count=10000; vertex_count < 100000000; vertex_count = vertex_count*10) { - std::cout << "Testing graph with vertex count of:" << vertex_count << std::endl; + int64_t duration = 0; + + std::cout << vertex_count << " | "; std::unique_ptr graph = std::make_unique(); graph->allocate_graph_structure(vertex_count, 0); + auto start = highResClock::now(); for(int i=0; i < vertex_count; i++) { graph->add_vertex(i); } - auto start = std::chrono::high_resolution_clock::now(); + std::cout << getDuration(start) << " | "; + + duration = 0; - // iterate - for(int i=0; i < vertex_count; i++) { - graph->get_vertex(i); + for(int exec=0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + // iterate + for(int i=0; i < vertex_count; i++) { + graph->get_vertex(i); + } + duration += getDuration(start); } - auto stop = std::chrono::high_resolution_clock::now(); - auto iteration_duration = std::chrono::duration_cast(stop - start).count(); - std::cout << "Iteration time: " << iteration_duration << "ms" << std::endl; + std::cout << duration / number_of_executions << " | "; + + // random access - std::random_device rd; - std::uniform_int_distribution dist(0, vertex_count); - start = std::chrono::high_resolution_clock::now(); + duration = 0; + + for(int exec=0; exec < number_of_executions; exec++) { + std::random_device rd; + std::uniform_int_distribution dist(0, vertex_count - 1); + + auto start = highResClock::now(); + + for(int i=0; i < 10000; i++) { + graph->get_vertex(dist(rd)); + } - for(int i=0; i < 10000; i++) { - graph->get_vertex(dist(rd)); + duration += getDuration(start); } - stop = std::chrono::high_resolution_clock::now(); - auto random_access_duration = std::chrono::duration_cast(stop - start).count(); - std::cout << "Random access of 10000 vertices: " << random_access_duration << "ms" << std::endl; + std::cout << duration / number_of_executions << std::endl; } return 0; From b997b521443dce4e0cef0735dc89f68bf57d50d8 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 3 Apr 2020 19:58:46 +0200 Subject: [PATCH 113/216] Replace hashmap with vector of vectors for storing vertices --- include/core/storage/graph/graph.h | 101 ++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 30 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 0118ca72..fec7150b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -27,6 +27,7 @@ #include "vertex/vertex.h" #include "edge/edge.h" #include "property_type.h" +#include "core/utils/math.h" #include #include @@ -48,9 +49,11 @@ namespace morphstore{ uint64_t expectedVertexCount; uint64_t expectedEdgeCount; + mutable uint64_t currentMaxVertexId = 0; + static const inline uint64_t vertex_vector_size = 4096; // Data-structure for Vertex-Properties - std::unordered_map> vertices; - std::unordered_map> edges; + std::vector>> vertices; + std::unordered_map> edges; // store outside of entity objects as they have a variable size and can be better compressed this way // TODO: try other property storage formats than per node .. (triple-store or per property) @@ -62,10 +65,47 @@ namespace morphstore{ std::map vertexTypeDictionary; std::map edgeTypeDictionary; + + uint64_t get_vertex_vector_number(uint64_t vertex_id) { + return vertex_id / vertex_vector_size; + } + const std::shared_ptr get_vertex_at(uint64_t id) { + uint64_t pos_in_vector = id % vertex_vector_size; + assert (pos_in_vector < vertices.at(get_vertex_vector_number(id)).size()); + + return vertices.at(get_vertex_vector_number(id)).at(pos_in_vector); + } + + void insert_vertex(std::shared_ptr v) { + auto vector_number = get_vertex_vector_number(v->getID()); + assert (vector_number < vertices.size()); + + return vertices.at(vector_number).push_back(v); + } + + // function to check if the vertex-ID is present or not (exists) + bool exist_vertexId(const uint64_t id){ + // ! assumes no deletions + auto vector_pos = get_vertex_vector_number(id); + if (vector_pos < vertices.size()) { + if ((id % vertex_vector_size) < vertices.at(vector_pos).size()) return true; + } + + return false ; + } + + // function to check if the edge-ID is present or not (exists) + bool exist_edgeId(const uint64_t id){ + if(edges.find(id) == edges.end()){ + return false; + } + return true; + } + uint64_t getNextVertexId() const { - static uint64_t currentMaxVertexId = 0; return currentMaxVertexId++; } + public: // -------------------- Setters & Getters -------------------- @@ -91,7 +131,11 @@ namespace morphstore{ } uint64_t getVertexCount() const { - return vertices.size(); + uint64_t count = 0; + for(uint32_t i = 0; i < vertices.size(); i++) { + count += vertices.at(i).size(); + } + return count; } uint64_t getExpectedEdgeCount() const { @@ -106,7 +150,7 @@ namespace morphstore{ assert(expectedVertexCount > getVertexCount()); std::shared_ptr v = std::make_shared(getNextVertexId(), type); uint64_t id = v->getID(); - vertices[id] = v; + insert_vertex(v); if (!props.empty()) { vertex_properties.insert(std::make_pair(id, props)); } @@ -130,25 +174,9 @@ namespace morphstore{ } } - // function to check if the vertex-ID is present or not (exists) - bool exist_vertexId(const uint64_t id){ - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } - - // function to check if the edge-ID is present or not (exists) - bool exist_edgeId(const uint64_t id){ - if(edges.find(id) == edges.end()){ - return false; - } - return true; - } - // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id){ - return VertexWithProperties(vertices[id], vertex_properties[id]); + return VertexWithProperties(get_vertex_at(id), vertex_properties[id]); } // function which returns a pointer to vertex by id @@ -229,12 +257,16 @@ namespace morphstore{ } // container for indexes: - index_size += sizeof(std::unordered_map>); - for(auto& it : vertices){ - // index size of vertex: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += it.second->get_data_size_of_vertex(); + index_size += sizeof(std::vector>>); + + for (auto vertex_vector: vertices) { + index_size += sizeof(std::vector>); + for(auto& vector : vertex_vector){ + // index size of vertex: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); + // data size: + data_size += vector->get_data_size_of_vertex(); + } } index_size += sizeof(std::unordered_map>); @@ -272,7 +304,16 @@ namespace morphstore{ this->expectedVertexCount = numberVertices; this->expectedEdgeCount = numberEdges; - vertices.reserve(numberVertices); + auto vertex_vector_count = round_up_div(numberVertices, vertex_vector_size); + + vertices.reserve(vertex_vector_count); + + for(uint64_t i = 0; i < vertex_vector_count; i++) { + auto vertex_vector = std::vector>(); + vertex_vector.reserve(vertex_vector_size / Vertex::get_data_size_of_vertex()); + vertices.push_back(vertex_vector); + } + vertex_properties.reserve(numberVertices); edges.reserve(numberEdges); edge_properties.reserve(numberEdges); @@ -294,7 +335,7 @@ namespace morphstore{ void print_vertex_by_id(uint64_t id) { std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; - std::shared_ptr v = vertices[id]; + std::shared_ptr v = get_vertex_at(id); std::cout << "Vertex-ID: \t" << v->getID() << std::endl; std::cout << "Type: \t" << get_vertexType_by_number(v->getType()) << std::endl; std::cout << "\n"; From 1dc4b5dab63df9534d74960d072962114e9634fe Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 6 Apr 2020 16:34:53 +0200 Subject: [PATCH 114/216] Fix a memory leak and also throw on invalid edges at CSR * ! Free allocated arrays in CSR * Uniform handling of edges between unloaded vertices * Edge has a toString function --- include/core/storage/graph/edge/edge.h | 4 +++ .../storage/graph/formats/adjacencylist.h | 36 +++++++++---------- include/core/storage/graph/formats/csr.h | 12 +++++++ 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 6b05d567..d7e626ac 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -101,6 +101,10 @@ namespace morphstore{ size += sizeof(unsigned short int); // relation return size; } + + std::string to_string() const { + return "(id:" + std::to_string(this->id) + " ," + std::to_string(this->sourceID) + "->" + std::to_string(this->targetID) + ")"; + } }; class EdgeWithProperties { diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index b3574403..b84c01e5 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -56,26 +56,26 @@ namespace morphstore{ // function that adds multiple edges (list of neighbors) at once to vertex void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { - if (exist_vertexId(sourceId)) { - std::shared_ptr> adjacencyList; - if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { - adjacencyList = adjacencylistPerVertex[sourceId]; - } else { - adjacencyList = std::make_shared>(); - adjacencylistPerVertex[sourceId] = adjacencyList; - } + if (!vertices.exist_vertex(sourceId)) { + throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); + } - for(const auto edge : edgesToAdd) { - edges[edge.getId()] = std::make_shared(edge); - if(exist_vertexId(edge.getTargetId())) { - adjacencyList->push_back(edge.getId()); - } - else { - std::cout << "Target-Vertex with ID " << edge.getTargetId() << " not found." << std::endl; - } - } + std::shared_ptr> adjacencyList; + if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { + adjacencyList = adjacencylistPerVertex[sourceId]; } else { - std::cout << "Source-Vertex with ID " << sourceId << " not found." << std::endl; + adjacencyList = std::make_shared>(); + adjacencylistPerVertex[sourceId] = adjacencyList; + } + + for(const auto edge : edgesToAdd) { + edges[edge.getId()] = std::make_shared(edge); + if(vertices.exist_vertex(edge.getTargetId())) { + adjacencyList->push_back(edge.getId()); + } + else { + throw std::runtime_error("Target not found :" + edge.to_string()); + } } } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index d6999b24..000168c0 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -43,6 +43,11 @@ namespace morphstore{ public: + ~CSR() { + free(offset_array); + free(edgeId_array); + } + std::string get_storage_format() const override { return "CSR"; } @@ -70,9 +75,16 @@ namespace morphstore{ uint64_t offset = offset_array[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); + if (!vertices.exist_vertex(sourceID)) { + throw std::runtime_error("Source-id not found " + std::to_string(sourceID)); + } + // fill the arrays for(const auto& edge : edgesToAdd){ std::shared_ptr ePtr = std::make_shared(edge); + if(!vertices.exist_vertex(edge.getTargetId())) { + throw std::runtime_error("Target not found " + edge.to_string()); + } edges[ePtr->getId()] = ePtr; edgeId_array[offset] = ePtr->getId(); ++offset; From d2ce6efd32b4b16098b984b39cff1f11078f8d8e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 6 Apr 2020 22:14:45 +0200 Subject: [PATCH 115/216] Move vertex storage logic inside separate "vertices_container" class * smaller graph header * easy to switch between different data strutures for storing vertices --- include/core/storage/graph/edge/edge.h | 4 +- .../storage/graph/formats/adjacencylist.h | 4 +- include/core/storage/graph/formats/csr.h | 4 +- include/core/storage/graph/graph.h | 152 ++++-------------- include/core/storage/graph/ldbc_import.h | 3 +- include/core/storage/graph/vertex/vertex.h | 31 +++- .../storage/graph/vertex/vertices_container.h | 140 ++++++++++++++++ .../graph/vertex/vertices_hashmap_container.h | 81 ++++++++++ .../vertex/vertices_vectorvector_container.h | 131 +++++++++++++++ .../graph/simple/bfs_simple_graph_test.h | 2 +- .../storage/graph/simple/simple_graph_test.h | 2 +- 11 files changed, 413 insertions(+), 141 deletions(-) create mode 100644 include/core/storage/graph/vertex/vertices_container.h create mode 100644 include/core/storage/graph/vertex/vertices_hashmap_container.h create mode 100644 include/core/storage/graph/vertex/vertices_vectorvector_container.h diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index d7e626ac..1c763ef1 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -95,10 +95,10 @@ namespace morphstore{ } // get size of edge object in bytes: - size_t size_in_bytes() const{ + static size_t size_in_bytes() { size_t size = 0; size += sizeof(uint64_t) * 2; // source- and target-id - size += sizeof(unsigned short int); // relation + size += sizeof(unsigned short int); // type return size; } diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index b84c01e5..cac001a3 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -56,7 +56,7 @@ namespace morphstore{ // function that adds multiple edges (list of neighbors) at once to vertex void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { - if (!vertices.exist_vertex(sourceId)) { + if (!vertices.exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } @@ -70,7 +70,7 @@ namespace morphstore{ for(const auto edge : edgesToAdd) { edges[edge.getId()] = std::make_shared(edge); - if(vertices.exist_vertex(edge.getTargetId())) { + if(vertices.exists_vertex(edge.getTargetId())) { adjacencyList->push_back(edge.getId()); } else { diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 000168c0..7827c137 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -75,14 +75,14 @@ namespace morphstore{ uint64_t offset = offset_array[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); - if (!vertices.exist_vertex(sourceID)) { + if (!vertices.exists_vertex(sourceID)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceID)); } // fill the arrays for(const auto& edge : edgesToAdd){ std::shared_ptr ePtr = std::make_shared(edge); - if(!vertices.exist_vertex(edge.getTargetId())) { + if(!vertices.exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found " + edge.to_string()); } edges[ePtr->getId()] = ePtr; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index fec7150b..8cc90744 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -25,13 +25,15 @@ #define MORPHSTORE_GRAPH_H #include "vertex/vertex.h" +//#include "vertex/vertices_hashmap_container.h" +#include "vertex/vertices_vectorvector_container.h" +//#include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" #include "property_type.h" -#include "core/utils/math.h" #include -#include #include +#include #include #include #include @@ -50,50 +52,19 @@ namespace morphstore{ uint64_t expectedEdgeCount; mutable uint64_t currentMaxVertexId = 0; - static const inline uint64_t vertex_vector_size = 4096; - // Data-structure for Vertex-Properties - std::vector>> vertices; + + // ! currently need to change to right container (abstract seems not to be possible due to pure virtual functions) + VerticesVectorVectorContainer vertices; + std::unordered_map> edges; - // store outside of entity objects as they have a variable size and can be better compressed this way - // TODO: try other property storage formats than per node .. (triple-store or per property) - std::unordered_map> vertex_properties; std::unordered_map> edge_properties; // Lookup for types: number to string - std::map vertexTypeDictionary; std::map edgeTypeDictionary; - uint64_t get_vertex_vector_number(uint64_t vertex_id) { - return vertex_id / vertex_vector_size; - } - const std::shared_ptr get_vertex_at(uint64_t id) { - uint64_t pos_in_vector = id % vertex_vector_size; - assert (pos_in_vector < vertices.at(get_vertex_vector_number(id)).size()); - - return vertices.at(get_vertex_vector_number(id)).at(pos_in_vector); - } - - void insert_vertex(std::shared_ptr v) { - auto vector_number = get_vertex_vector_number(v->getID()); - assert (vector_number < vertices.size()); - - return vertices.at(vector_number).push_back(v); - } - - // function to check if the vertex-ID is present or not (exists) - bool exist_vertexId(const uint64_t id){ - // ! assumes no deletions - auto vector_pos = get_vertex_vector_number(id); - if (vector_pos < vertices.size()) { - if ((id % vertex_vector_size) < vertices.at(vector_pos).size()) return true; - } - - return false ; - } - // function to check if the edge-ID is present or not (exists) bool exist_edgeId(const uint64_t id){ if(edges.find(id) == edges.end()){ @@ -101,7 +72,8 @@ namespace morphstore{ } return true; } - + + // TODO: put this into vertex container? uint64_t getNextVertexId() const { return currentMaxVertexId++; } @@ -109,13 +81,9 @@ namespace morphstore{ public: // -------------------- Setters & Getters -------------------- - const std::map &getVertexTypeDictionary() const { - return vertexTypeDictionary; - } - - void setVertexTypeDictionary(const std::map& ent) { - assert(ent.size() != 0); - this->vertexTypeDictionary = ent; + void set_vertex_type_dictionary(const std::map& types) { + assert(types.size() != 0); + this->vertices.set_vertex_type_dictionary(types); } const std::map &getRelationDictionary() const { @@ -131,11 +99,7 @@ namespace morphstore{ } uint64_t getVertexCount() const { - uint64_t count = 0; - for(uint32_t i = 0; i < vertices.size(); i++) { - count += vertices.at(i).size(); - } - return count; + return vertices.vertex_count(); } uint64_t getExpectedEdgeCount() const { @@ -148,23 +112,11 @@ namespace morphstore{ uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { assert(expectedVertexCount > getVertexCount()); - std::shared_ptr v = std::make_shared(getNextVertexId(), type); - uint64_t id = v->getID(); - insert_vertex(v); - if (!props.empty()) { - vertex_properties.insert(std::make_pair(id, props)); - } - return id; + Vertex v = Vertex(getNextVertexId(), type); + vertices.add_vertex(v, props); + return v.getID(); }; - std::string get_vertexType_by_number(unsigned short int type){ - if(vertexTypeDictionary.find( type ) != vertexTypeDictionary.end()){ - return vertexTypeDictionary.at(type); - }else{ - return "No Matching of type-number in the database! For type " + std::to_string(type); - } - } - std::string get_edgeType_by_number(unsigned short int type){ if(edgeTypeDictionary.find( type ) != edgeTypeDictionary.end()){ return edgeTypeDictionary.at(type); @@ -176,7 +128,7 @@ namespace morphstore{ // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id){ - return VertexWithProperties(get_vertex_at(id), vertex_properties[id]); + return vertices.get_vertex(id); } // function which returns a pointer to vertex by id @@ -225,7 +177,7 @@ namespace morphstore{ } void add_property_to_vertex(uint64_t id, const std::pair property) { - vertex_properties[id].insert(property); + vertices.add_property_to_vertex(id, property); }; void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { @@ -241,16 +193,10 @@ namespace morphstore{ virtual std::vector get_neighbors_ids(uint64_t id) = 0; virtual std::pair get_size_of_graph(){ - std::pair index_data_size; - size_t data_size = 0; - size_t index_size = 0; + // including vertices + its properties + its type dict + auto [index_size, data_size] = vertices.get_size(); // lookup type dicts - index_size += 2 * sizeof(std::map); - for(auto& ent : vertexTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(ent.second.length()); - } for(auto& rel : edgeTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(rel.second.length()); @@ -259,16 +205,6 @@ namespace morphstore{ // container for indexes: index_size += sizeof(std::vector>>); - for (auto vertex_vector: vertices) { - index_size += sizeof(std::vector>); - for(auto& vector : vertex_vector){ - // index size of vertex: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += vector->get_data_size_of_vertex(); - } - } - index_size += sizeof(std::unordered_map>); for(auto& it : edges){ // index size of edge: size of id and sizeof pointer @@ -278,16 +214,6 @@ namespace morphstore{ } // TODO: extra propertymappings class - - // node-properties: - index_size += sizeof(std::unordered_map>); - for(auto& property_mapping: vertex_properties) { - index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { - data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); - } - } - // edge-properties: index_size += sizeof(std::unordered_map>); for(auto& property_mapping: edge_properties) { @@ -297,24 +223,15 @@ namespace morphstore{ } } - return index_data_size; + return std::make_pair(index_size, data_size); }; virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) { this->expectedVertexCount = numberVertices; this->expectedEdgeCount = numberEdges; - auto vertex_vector_count = round_up_div(numberVertices, vertex_vector_size); + vertices.allocate(numberVertices); - vertices.reserve(vertex_vector_count); - - for(uint64_t i = 0; i < vertex_vector_count; i++) { - auto vertex_vector = std::vector>(); - vertex_vector.reserve(vertex_vector_size / Vertex::get_data_size_of_vertex()); - vertices.push_back(vertex_vector); - } - - vertex_properties.reserve(numberVertices); edges.reserve(numberEdges); edge_properties.reserve(numberEdges); }; @@ -327,27 +244,16 @@ namespace morphstore{ virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getVertexCount() << std::endl; - std::cout << "Number of vertices with properties:" << vertex_properties.size() << std::endl; + std::cout << "Number of vertices with properties:" << vertices.vertices_with_properties_count() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edge_properties.size() << std::endl; std::cout << "--------------------------------------------" << std::endl; } void print_vertex_by_id(uint64_t id) { - std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; - std::shared_ptr v = get_vertex_at(id); - std::cout << "Vertex-ID: \t" << v->getID() << std::endl; - std::cout << "Type: \t" << get_vertexType_by_number(v->getType()) << std::endl; - std::cout << "\n"; - std::cout << "Properties: "; - for (const auto entry : vertex_properties[id]) { - auto value = entry.second; - std::cout << "{" << entry.first << ": "; - std::visit(PropertyValueVisitor{}, value); - std::cout << "}"; - } + vertices.print_vertex_by_id(id); std::cout << "\n"; - std::cout << "#Edges: " << this->get_out_degree(v->getID()); + std::cout << "#Edges: " << this->get_out_degree(id); std::cout << "\n"; std::cout << "-----------------------------------------------" << std::endl; } @@ -372,11 +278,7 @@ namespace morphstore{ } void print_type_dicts(){ - std::cout << "VertexType-Dict: " << std::endl; - for(auto const& entry : vertexTypeDictionary){ - std::cout << entry.first << " -> " << entry.second << std::endl; - } - std::cout << "\n"; + vertices.print_type_dict(); std::cout << "EdgeType-Dict: " << std::endl; for(auto const& rel : edgeTypeDictionary){ diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 7746a0d7..880052b9 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -616,6 +616,7 @@ namespace morphstore{ } } + // TODO: is this function really needed? // function for sorting the vertexEdgesLookup ASC (needed in CSR) // sorting for every vertex its vector list with target-ids ASC void sort_VertexEdgesLookup(){ @@ -664,7 +665,7 @@ namespace morphstore{ // populate vertex_type_lookup for differentiating between edge and property files generate_vertex_type_lookup(); - graph.setVertexTypeDictionary(vertexTypeLookup); + graph.set_vertex_type_dictionary(vertexTypeLookup); uint64_t numberEdges = get_total_number_edges(); diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 72d89cfe..d2387f14 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -41,14 +41,17 @@ namespace morphstore{ // optional: type, properties unsigned short int type; - public: + // default constr. needed for VertexWithProperties(Vertex vertex, const std::unordered_map properties) + // otherwise compiler won't accept + Vertex(){} + Vertex(uint64_t id, unsigned short int type){ this->id = id; this->type = type; } - uint64_t getID(){ + uint64_t getID() const { return id; } @@ -56,8 +59,22 @@ namespace morphstore{ return type; } + // this is needed when using VerticesVectorArrayContainer when doing vertex_array[offset] = vertex + Vertex& operator= (const Vertex &vertex){ + // self-assignment guard + if (this == &vertex) + return *this; + + // do the copy + this->id = vertex.id; + this->type = vertex.type; + + // return the existing object so we can chain this operator + return *this; + } + // get size of vertex in bytes: - size_t get_data_size_of_vertex() { + static size_t get_data_size_of_vertex() { size_t size = 0; size += sizeof(uint64_t); // id size += sizeof(unsigned short int); // entity @@ -69,20 +86,20 @@ namespace morphstore{ // convinience class for returning whole vertices class VertexWithProperties { private: - std::shared_ptr vertex; + Vertex vertex; std::unordered_map properties; public: - VertexWithProperties(std::shared_ptr vertex, const std::unordered_map properties) { + VertexWithProperties(Vertex vertex, const std::unordered_map properties) { this->vertex = vertex; this->properties = properties; } uint64_t getID() { - return vertex->getID(); + return vertex.getID(); } unsigned short getType() const { - return vertex->getType(); + return vertex.getType(); } std::unordered_map getProperties() { diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h new file mode 100644 index 00000000..27dbfe76 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -0,0 +1,140 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices_container.h + * @brief abstract class for storing vertices + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_CONTAINER_H +#define MORPHSTORE_VERTICES_CONTAINER_H + +#include "vertex.h" +#include "../property_type.h" + +#include +#include +#include +#include + +namespace morphstore{ + + class VerticesContainer { + protected: + std::map vertex_type_dictionary; + + // TODO: try other property storage formats than per node .. (triple-store or per property) + std::unordered_map> vertex_properties; + + virtual Vertex get_vertex_without_properties(uint64_t id) = 0; + + std::string get_vertex_type(unsigned short int type) const { + if (vertex_type_dictionary.find(type) != vertex_type_dictionary.end()) { + return vertex_type_dictionary.at(type); + } + else { + return "No Matching of type-number in the database! For type " + std::to_string(type); + } + } + + public: + + virtual void insert_vertex(Vertex v) = 0; + virtual bool exists_vertex(const uint64_t id) const = 0; + virtual uint64_t vertex_count() const = 0; + + + virtual void allocate(uint64_t numberVertices) { + vertex_properties.reserve(numberVertices); + } + + void add_vertex(Vertex v, const std::unordered_map properties = {}) { + insert_vertex(v); + if (!properties.empty()) { + vertex_properties.insert(std::make_pair(v.getID(), properties)); + } + } + + void add_property_to_vertex(uint64_t id, const std::pair property) { + assert(exists_vertex(id)); + vertex_properties[id].insert(property); + }; + + void set_vertex_type_dictionary(const std::map& types) { + assert(types.size() != 0); + this->vertex_type_dictionary = types; + } + + + const VertexWithProperties get_vertex(uint64_t id) { + assert(exists_vertex(id)); + return VertexWithProperties(get_vertex_without_properties(id), vertex_properties[id]); + } + + uint64_t vertices_with_properties_count() { + return vertex_properties.size(); + } + + virtual std::pair get_size(){ + size_t data_size = 0; + size_t index_size = 0; + + // lookup type dicts + index_size += 2 * sizeof(std::map); + for(auto& ent : vertex_type_dictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(ent.second.length()); + } + + // vertex-properties: + index_size += sizeof(std::unordered_map>); + for (auto &property_mapping : vertex_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (auto property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { + data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); + } + } + + return std::make_pair(index_size, data_size); + } + + void print_type_dict(){ + std::cout << "VertexType-Dict: " << std::endl; + for (auto const &entry : vertex_type_dictionary) { + std::cout << entry.first << " -> " << entry.second << std::endl; + } + } + + void print_vertex_by_id(const uint64_t id) { + std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; + VertexWithProperties v = get_vertex(id); + std::cout << "Vertex-ID: \t" << v.getID() << std::endl; + std::cout << "Type: \t" << get_vertex_type(v.getType()) << std::endl; + std::cout << "\n"; + std::cout << "Properties: "; + for (const auto entry : v.getProperties()) { + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; + } + } + }; +} + +#endif //MORPHSTORE_VERTICES_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h new file mode 100644 index 00000000..760df3d0 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -0,0 +1,81 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices__hashmap_container.h + * @brief storing vertices using a hashmap + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H +#define MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H + +#include "vertex.h" +#include "vertices_container.h" + +#include +#include + +namespace morphstore{ + + class VerticesHashMapContainer : public VerticesContainer{ + protected: + std::unordered_map> vertices; + + Vertex get_vertex_without_properties(uint64_t id) override{ + return *vertices[id]; + } + + public: + void allocate(const uint64_t numberVertices) override { + VerticesContainer::allocate(numberVertices); + this->vertices.reserve(numberVertices); + } + + void insert_vertex(const Vertex v) override { + vertices[v.getID()] = std::make_unique(v); + } + + bool exists_vertex(const uint64_t id) const override { + if(vertices.find(id) == vertices.end()){ + return false; + } + return true; + } + + uint64_t vertex_count() const { + return vertices.size(); + } + + std::pair get_size() override { + auto [index_size, data_size] = VerticesContainer::get_size(); + + // container for indexes: + index_size += sizeof(std::unordered_map>); + for (auto &it : vertices) { + // index size of vertex: size of id and sizeof pointer + index_size += sizeof(uint64_t) + sizeof(std::unique_ptr); + // data size: + data_size += it.second->get_data_size_of_vertex(); + } + + return std::make_pair(index_size, data_size); + } + }; +} + +#endif //MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_container.h new file mode 100644 index 00000000..f6294641 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_vectorvector_container.h @@ -0,0 +1,131 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices__vectorvector_container.h + * @brief storing vertices using a vector of vectors + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H +#define MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H + +#include "vertex.h" +#include "vertices_container.h" + +#include +#include +#include +#include + +namespace morphstore{ + + using vertex_vector_ptr = std::shared_ptr>>; + + class VerticesVectorVectorContainer : public VerticesContainer{ + protected: + std::vector vertices; + uint64_t number_of_vertices = 0; + vertex_vector_ptr current_vector; + static const inline uint64_t vertex_vector_size = 4096; + static const inline uint64_t vertices_per_vector = vertex_vector_size / Vertex::get_data_size_of_vertex(); + + vertex_vector_ptr allocate_vertex_array() { + auto vertex_vector = std::make_shared>>(); + vertex_vector->reserve(vertex_vector_size / Vertex::get_data_size_of_vertex()); + vertices.push_back(vertex_vector); + + //std::cout << " Added a page" << std::endl; + //std::cout.flush(); + + return vertex_vector; + } + + inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { + return vertex_id / vertex_vector_size; + } + + inline uint64_t get_pos_in_vector(uint64_t vertex_id) const { + return vertex_id % vertices_per_vector; + } + + + Vertex get_vertex_without_properties(uint64_t id) override { + uint64_t vector_number = get_vertex_vector_number(id); + uint64_t pos_in_vector = get_pos_in_vector(id); + + /*std::cout << " id: " << id + << " vectors_number: " << vector_number + << " pos in vector: " << pos_in_vector + << " max_pos_in_vector: " << pos_in_vector + << " max_pos_in_vector: " << vertices_per_vector + << " number of vectors: " << vertices.size() << std::endl; + std::cout.flush(); + */ + + assert (vector_number <= vertices.size()); + assert (pos_in_vector < vertices_per_vector); + + return *vertices.at(vector_number)->at(pos_in_vector); + } + + + public: + void allocate(const uint64_t numberVertices) { + VerticesContainer::allocate(numberVertices); + current_vector = allocate_vertex_array(); + } + + void insert_vertex(Vertex v) { + // equals current array is full + if (current_vector->size() == vertices_per_vector) { + current_vector = allocate_vertex_array(); + } + + current_vector->push_back(std::make_shared(v)); + number_of_vertices++;; + } + + bool exists_vertex(const uint64_t id) const override { + // !assumes no deletion (should be replaced when an id-index exists) + return number_of_vertices > id; + } + + uint64_t vertex_count() const override { + return number_of_vertices; + } + + std::pair get_size() override { + auto [index_size, data_size] = VerticesContainer::get_size(); + + // vector_count, current vertex_vector + index_size += 2 * sizeof(uint64_t); + + index_size += sizeof(std::vector); + index_size += vertices.size() * sizeof(vertex_vector_ptr); + + for(auto vector: vertices) { + index_size += vector->size() * sizeof(std::shared_ptr); + data_size += vector->size() * Vertex::get_data_size_of_vertex(); + } + + return std::make_pair(index_size, data_size); + } + }; +} + +#endif //MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H \ No newline at end of file diff --git a/test/core/operators/graph/simple/bfs_simple_graph_test.h b/test/core/operators/graph/simple/bfs_simple_graph_test.h index 1f3036aa..66bf9ccb 100644 --- a/test/core/operators/graph/simple/bfs_simple_graph_test.h +++ b/test/core/operators/graph/simple/bfs_simple_graph_test.h @@ -47,7 +47,7 @@ void bfs_simple_graph_test (void) { std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; std::map vertexTypeMap = {{0, "Person"}}; graph->setEdgeTypeDictionary(edgeTypeMap); - graph->setVertexTypeDictionary(vertexTypeMap); + graph->set_vertex_type_dictionary(vertexTypeMap); uint64_t v1 = graph->add_vertex(0); uint64_t v2 = graph->add_vertex(0); diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index d8503ade..0438b1f2 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -43,7 +43,7 @@ void simpleGraphFormatTest (void) { std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; std::map vertexTypeMap = {{0, "Person"}}; graph->setEdgeTypeDictionary(edgeTypeMap); - graph->setVertexTypeDictionary(vertexTypeMap); + graph->set_vertex_type_dictionary(vertexTypeMap); uint64_t v1 = graph->add_vertex(0, {{"age", "12"}}); uint64_t v2 = graph->add_vertex(0); From b9043e4d1bfd3634882d46d5cfbe14f8a1480620 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 6 Apr 2020 22:31:24 +0200 Subject: [PATCH 116/216] Add link to ldbc schema --- include/core/storage/graph/ldbc_schema.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/ldbc_schema.h b/include/core/storage/graph/ldbc_schema.h index 3e92c2fd..b6b79544 100644 --- a/include/core/storage/graph/ldbc_schema.h +++ b/include/core/storage/graph/ldbc_schema.h @@ -17,7 +17,7 @@ /** * @file lbc_schema.h - * @brief Schema of the LDBC graph + * @brief Schema of the LDBC graph based on https://raw.githubusercontent.com/ldbc/ldbc_snb_docs/dev/figures/schema-comfortable.png * @todo */ From 4c520ddcbb6492befa163e64c30f6db55908f816 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 7 Apr 2020 13:38:27 +0200 Subject: [PATCH 117/216] Add variant of vertices container using arrays instead of vectors --- include/core/storage/graph/graph.h | 6 +- .../vertex/vertices_vectorarray_container.h | 130 ++++++++++++++++++ .../vertex/vertices_vectorvector_container.h | 4 +- 3 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 include/core/storage/graph/vertex/vertices_vectorarray_container.h diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 8cc90744..9bd46a46 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -25,9 +25,9 @@ #define MORPHSTORE_GRAPH_H #include "vertex/vertex.h" -//#include "vertex/vertices_hashmap_container.h" +#include "vertex/vertices_hashmap_container.h" #include "vertex/vertices_vectorvector_container.h" -//#include "vertex/vertices_vectorarray_container.h" +#include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" #include "property_type.h" @@ -54,7 +54,7 @@ namespace morphstore{ mutable uint64_t currentMaxVertexId = 0; // ! currently need to change to right container (abstract seems not to be possible due to pure virtual functions) - VerticesVectorVectorContainer vertices; + VerticesVectorArrayContainer vertices; std::unordered_map> edges; diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h new file mode 100644 index 00000000..4921f3b4 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -0,0 +1,130 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices__vectorarray_container.h + * @brief storing vertices using a vector of arrays + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H +#define MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H + +#include "vertex.h" +#include "vertices_container.h" + +#include +#include + +namespace morphstore{ + + class VerticesVectorArrayContainer : public VerticesContainer{ + protected: + std::vector vertices; + + static const inline uint64_t vertex_array_size = 4096; + static const inline uint64_t vertices_per_array = vertex_array_size / sizeof(Vertex); + + uint64_t number_of_vertices = 0; + Vertex* current_array; + uint64_t current_array_offset = 0; + + + Vertex* allocate_vertex_array() { + auto array_pointer = (Vertex *) std::aligned_alloc( + sizeof(Vertex), + vertices_per_array * sizeof(Vertex)); + + vertices.push_back(array_pointer); + //std::cout << " Added a page" << std::endl; + //std::cout.flush(); + return array_pointer; + } + + inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { + return vertex_id / vertex_array_size; + } + + inline uint64_t get_pos_in_array(uint64_t vertex_id) const { + return vertex_id % vertices_per_array; + } + + Vertex get_vertex_without_properties(uint64_t id) override { + uint64_t array_number = get_vertex_vector_number(id); + uint64_t pos_in_array = get_pos_in_array(id); + + assert (pos_in_array < vertices_per_array); + assert (array_number < vertices.size()); + + return vertices.at(array_number)[pos_in_array]; + } + + public: + // TODO: make array_size based on constructor + //VerticesVectorArrayContainer(array_size) + + void allocate(const uint64_t numberVertices) { + VerticesContainer::allocate(numberVertices); + current_array = allocate_vertex_array(); + } + + void insert_vertex(Vertex v) { + // equals current array is full + if (current_array_offset == vertices_per_array) { + current_array = allocate_vertex_array(); + current_array_offset = 0; + } + + current_array[current_array_offset] = v; + current_array_offset++; + number_of_vertices++; + } + + bool exists_vertex(const uint64_t id) const override { + // assumes no deletion! + return number_of_vertices > id; + } + + uint64_t vertex_count() const override { + return number_of_vertices; + } + + std::pair get_size() override { + auto [index_size, data_size] = VerticesContainer::get_size(); + + // vector count, current_array_offset + index_size += 2 * sizeof(uint64_t); + // current_array + index_size += sizeof(Vertex*); + index_size += sizeof(std::vector); + index_size += vertices.size() * sizeof(Vertex*); + // allocated memory for vertices + data_size += vertices.size() * Vertex::get_data_size_of_vertex() * vertices_per_array; + + return std::make_pair(index_size, data_size); + } + + ~VerticesVectorArrayContainer() { + //std::cout << "freeing vertex pages"; + for (auto array_pointer : this->vertices) { + free(array_pointer); + } + } + }; +} + +#endif //MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_container.h index f6294641..e65e8abd 100644 --- a/include/core/storage/graph/vertex/vertices_vectorvector_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorvector_container.h @@ -42,11 +42,11 @@ namespace morphstore{ uint64_t number_of_vertices = 0; vertex_vector_ptr current_vector; static const inline uint64_t vertex_vector_size = 4096; - static const inline uint64_t vertices_per_vector = vertex_vector_size / Vertex::get_data_size_of_vertex(); + static const inline uint64_t vertices_per_vector = vertex_vector_size / sizeof(Vertex); vertex_vector_ptr allocate_vertex_array() { auto vertex_vector = std::make_shared>>(); - vertex_vector->reserve(vertex_vector_size / Vertex::get_data_size_of_vertex()); + vertex_vector->reserve(vertex_vector_size / sizeof(Vertex)); vertices.push_back(vertex_vector); //std::cout << " Added a page" << std::endl; From cc282442961502902fd14bd8447a24f61c576406 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 7 Apr 2020 14:40:38 +0200 Subject: [PATCH 118/216] Use std::optional in ldbc_importer instead of misusing -1 for non-value --- include/core/storage/graph/ldbc_import.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 880052b9..edef6a68 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -39,7 +39,7 @@ #include #include #include - +#include @@ -155,7 +155,7 @@ namespace morphstore{ std::vector> attributes; std::string vertexType = getEntityType(file); - int vertexTypeNumber = get_vertex_type_number(vertexType); + int vertexTypeNumber = get_vertex_type_number(vertexType).value(); char *buffer; @@ -268,8 +268,8 @@ namespace morphstore{ } } - // function which returns the vertex_type_number if parameter is a vertexType in the ldbc-files. else -1 - int get_vertex_type_number(const std::string &vertexType) { + // function which returns the vertex_type_number (has no value if non existing) + std::optional get_vertex_type_number(const std::string &vertexType) { // iterate through entities-map to look up for paramater for (auto const &entry : vertexTypeLookup) { if (entry.second == vertexType) { @@ -277,7 +277,7 @@ namespace morphstore{ } } - return -1; + return {}; } // function which returns true, if the edge type already exist @@ -364,7 +364,7 @@ namespace morphstore{ bool firstLine = true; // check from file name whether it's a edge file or multi value attribute file - if (get_vertex_type_number(targetVertexType) != -1) { + if (get_vertex_type_number(targetVertexType).has_value()) { for (size_t i = 0; i < fileSize; ++i) { if (buffer[i] == '\n') { @@ -496,7 +496,7 @@ namespace morphstore{ std::string delimiter = "|"; // check from file name whether it's an edge file or multi value attribute file - if(get_vertex_type_number(targetVertexType) == -1) { + if(!get_vertex_type_number(targetVertexType).has_value()) { // Multi-value-attributes: just take the last recently one std::string propertyKey; Ldbc_Data_Type data_type; From 7eba8b9bbf9ca465ebddd6d96d0d84c2f1aceb37 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 7 Apr 2020 15:22:48 +0200 Subject: [PATCH 119/216] Correct get_size() functions --- .../storage/graph/formats/adjacencylist.h | 17 ++++++------- include/core/storage/graph/formats/csr.h | 17 ++++++------- include/core/storage/graph/graph.h | 25 ++++++++----------- .../storage/graph/vertex/vertices_container.h | 14 +++++------ .../graph/vertex/vertices_hashmap_container.h | 16 ++++++------ .../vertex/vertices_vectorarray_container.h | 2 +- .../vertex/vertices_vectorvector_container.h | 2 +- 7 files changed, 42 insertions(+), 51 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index cac001a3..60dd6a8c 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -108,20 +108,19 @@ namespace morphstore{ } // for measuring the size in bytes: - std::pair get_size_of_graph() override { - std::pair index_data_size; - + std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); // adjacencyListPerVertex - for(auto& it : adjacencylistPerVertex){ - // data size: - data_size += sizeof(it); - } + index_size += sizeof(std::unordered_map>>); + index_size += adjacencylistPerVertex.size() * (sizeof(uint64_t) + sizeof(std::shared_ptr>)); - index_data_size = {index_size, data_size}; + for(const auto& iterator : adjacencylistPerVertex){ + // might be wrong in case of compression + data_size += sizeof(uint64_t) * iterator.second->size(); + } - return index_data_size; + return {index_size, data_size}; } // for debugging: print neighbors a vertex diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 7827c137..36f4d024 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -136,21 +136,20 @@ namespace morphstore{ } // get size of storage format: - std::pair get_size_of_graph() override { - std::pair index_data_size; + std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); - + // might be only valid for the uncompressed case // pointer to arrays: index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); - // edges array values: - for(uint64_t i = 0; i < getExpectedEdgeCount(); i++){ - index_size += sizeof(uint64_t); // node_array with offsets - } - index_data_size = {index_size, data_size}; + // edgeId array values: + index_size += getExpectedEdgeCount() * sizeof(uint64_t); + + // offset array values: + index_size += getExpectedVertexCount() * sizeof(uint64_t); - return index_data_size; + return {index_size, data_size}; } // for debugging: diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 9bd46a46..4c220e95 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -192,34 +192,29 @@ namespace morphstore{ virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; - virtual std::pair get_size_of_graph(){ + virtual std::pair get_size_of_graph() const { // including vertices + its properties + its type dict auto [index_size, data_size] = vertices.get_size(); // lookup type dicts - for(auto& rel : edgeTypeDictionary){ + for(const auto& rel : edgeTypeDictionary){ index_size += sizeof(unsigned short int); index_size += sizeof(char)*(rel.second.length()); } - // container for indexes: - index_size += sizeof(std::vector>>); - - index_size += sizeof(std::unordered_map>); - for(auto& it : edges){ - // index size of edge: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::shared_ptr); - // data size: - data_size += it.second->size_in_bytes(); - } + index_size += sizeof(std::unordered_map>); + // index size of edge: size of id and sizeof pointer + index_size += edges.size() * (sizeof(uint64_t) + sizeof(std::shared_ptr)); + data_size += edges.size() * Edge::size_in_bytes(); // TODO: extra propertymappings class // edge-properties: index_size += sizeof(std::unordered_map>); - for(auto& property_mapping: edge_properties) { + for(const auto& property_mapping: edge_properties) { index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (std::unordered_map::iterator property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { - data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); + // properties of a single edge + for (const auto& property : property_mapping.second) { + data_size += sizeof(char) * property.first.length() + sizeof(property.second); } } diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 27dbfe76..93da5012 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -90,27 +90,27 @@ namespace morphstore{ return vertex_properties.size(); } - virtual std::pair get_size(){ + virtual std::pair get_size() const { size_t data_size = 0; size_t index_size = 0; // lookup type dicts index_size += 2 * sizeof(std::map); - for(auto& ent : vertex_type_dictionary){ + for(auto& type_mapping : vertex_type_dictionary){ index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(ent.second.length()); + index_size += sizeof(char)*(type_mapping.second.length()); } // vertex-properties: index_size += sizeof(std::unordered_map>); - for (auto &property_mapping : vertex_properties) { + for (const auto &property_mapping : vertex_properties) { index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (auto property = property_mapping.second.begin(); property != property_mapping.second.end(); ++property) { - data_size += sizeof(char) * (property->first.length() + sizeof(property->second)); + for (const auto &property : property_mapping.second) { + data_size += sizeof(char) * property.first.length() + sizeof(property.second); } } - return std::make_pair(index_size, data_size); + return {index_size, data_size}; } void print_type_dict(){ diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index 760df3d0..461a237a 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -61,19 +61,17 @@ namespace morphstore{ return vertices.size(); } - std::pair get_size() override { + std::pair get_size() const override { auto [index_size, data_size] = VerticesContainer::get_size(); // container for indexes: - index_size += sizeof(std::unordered_map>); - for (auto &it : vertices) { - // index size of vertex: size of id and sizeof pointer - index_size += sizeof(uint64_t) + sizeof(std::unique_ptr); - // data size: - data_size += it.second->get_data_size_of_vertex(); - } + index_size += sizeof(std::unordered_map>); + // index size of vertex: size of id and sizeof pointer + index_size += vertices.size() * (sizeof(uint64_t) + sizeof(std::unique_ptr)); + data_size += vertices.size() * Vertex::get_data_size_of_vertex(); + - return std::make_pair(index_size, data_size); + return {index_size, data_size}; } }; } diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 4921f3b4..1dfd4bde 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -103,7 +103,7 @@ namespace morphstore{ return number_of_vertices; } - std::pair get_size() override { + std::pair get_size() const override { auto [index_size, data_size] = VerticesContainer::get_size(); // vector count, current_array_offset diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_container.h index e65e8abd..cfd361c2 100644 --- a/include/core/storage/graph/vertex/vertices_vectorvector_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorvector_container.h @@ -109,7 +109,7 @@ namespace morphstore{ return number_of_vertices; } - std::pair get_size() override { + std::pair get_size() const override { auto [index_size, data_size] = VerticesContainer::get_size(); // vector_count, current vertex_vector From 967b3f8d17364fdc2f75c53713b8080e5ffa2151 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 10 Apr 2020 18:51:17 +0200 Subject: [PATCH 120/216] Choose vertices container type at construction time * also adding 2 more containers (non pointer versions for hash-map and vector of pointer to vectors) --- .../storage/graph/formats/adjacencylist.h | 6 +- include/core/storage/graph/formats/csr.h | 5 +- include/core/storage/graph/graph.h | 57 +++++--- .../storage/graph/vertex/vertices_container.h | 4 +- .../graph/vertex/vertices_hashmap_container.h | 14 +- .../vertex/vertices_hashmap_ptr_container.h | 83 +++++++++++ .../vertex/vertices_vectorarray_container.h | 8 +- .../vertex/vertices_vectorvector_container.h | 20 +-- .../vertices_vectorvector_ptr_container.h | 136 ++++++++++++++++++ .../graph/vertex_storage_benchmark.cpp | 87 ++++++----- 10 files changed, 350 insertions(+), 70 deletions(-) create mode 100644 include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h create mode 100644 include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 60dd6a8c..0fa6d360 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -38,6 +38,8 @@ namespace morphstore{ std::unordered_map>> adjacencylistPerVertex; public: + AdjacencyList(VerticesContainerType vertices_container_type = VectorArrayContainer) : Graph(vertices_container_type) {} + std::string get_storage_format() const override { return "Adjacency_List"; } @@ -56,7 +58,7 @@ namespace morphstore{ // function that adds multiple edges (list of neighbors) at once to vertex void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { - if (!vertices.exists_vertex(sourceId)) { + if (!vertices->exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } @@ -70,7 +72,7 @@ namespace morphstore{ for(const auto edge : edgesToAdd) { edges[edge.getId()] = std::make_shared(edge); - if(vertices.exists_vertex(edge.getTargetId())) { + if(vertices->exists_vertex(edge.getTargetId())) { adjacencyList->push_back(edge.getId()); } else { diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 36f4d024..383da7cd 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -42,6 +42,7 @@ namespace morphstore{ uint64_t* edgeId_array = nullptr; public: + CSR(VerticesContainerType vertices_container_type = VectorArrayContainer) : Graph(vertices_container_type) {} ~CSR() { free(offset_array); @@ -75,14 +76,14 @@ namespace morphstore{ uint64_t offset = offset_array[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); - if (!vertices.exists_vertex(sourceID)) { + if (!vertices->exists_vertex(sourceID)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceID)); } // fill the arrays for(const auto& edge : edgesToAdd){ std::shared_ptr ePtr = std::make_shared(edge); - if(!vertices.exists_vertex(edge.getTargetId())) { + if(!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found " + edge.to_string()); } edges[ePtr->getId()] = ePtr; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 4c220e95..fca88054 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -26,7 +26,9 @@ #include "vertex/vertex.h" #include "vertex/vertices_hashmap_container.h" +#include "vertex/vertices_hashmap_ptr_container.h" #include "vertex/vertices_vectorvector_container.h" +#include "vertex/vertices_vectorvector_ptr_container.h" #include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" #include "property_type.h" @@ -53,8 +55,7 @@ namespace morphstore{ mutable uint64_t currentMaxVertexId = 0; - // ! currently need to change to right container (abstract seems not to be possible due to pure virtual functions) - VerticesVectorArrayContainer vertices; + std::unique_ptr vertices; std::unordered_map> edges; @@ -67,10 +68,10 @@ namespace morphstore{ // function to check if the edge-ID is present or not (exists) bool exist_edgeId(const uint64_t id){ - if(edges.find(id) == edges.end()){ - return false; - } - return true; + if (edges.find(id) == edges.end()) { + return false; + } + return true; } // TODO: put this into vertex container? @@ -81,9 +82,33 @@ namespace morphstore{ public: // -------------------- Setters & Getters -------------------- + Graph(VerticesContainerType vertices_container_type = VectorArrayContainer) { + switch (vertices_container_type) { + case VerticesContainerType::VectorArrayContainer: + vertices = std::make_unique(); + break; + case VerticesContainerType::VectorVectorContainer: + vertices = std::make_unique(); + break; + case VerticesContainerType::VectorVectorPtrContainer: + vertices = std::make_unique(); + break; + case VerticesContainerType::HashMapContainer: + vertices = std::make_unique(); + break; + case VerticesContainerType::HashMapPtrContainer: + vertices = std::make_unique(); + break; + } + } + + std::string vertices_container_description() { + return vertices->container_description(); + } + void set_vertex_type_dictionary(const std::map& types) { assert(types.size() != 0); - this->vertices.set_vertex_type_dictionary(types); + this->vertices->set_vertex_type_dictionary(types); } const std::map &getRelationDictionary() const { @@ -99,7 +124,7 @@ namespace morphstore{ } uint64_t getVertexCount() const { - return vertices.vertex_count(); + return vertices->vertex_count(); } uint64_t getExpectedEdgeCount() const { @@ -113,7 +138,7 @@ namespace morphstore{ uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { assert(expectedVertexCount > getVertexCount()); Vertex v = Vertex(getNextVertexId(), type); - vertices.add_vertex(v, props); + vertices->add_vertex(v, props); return v.getID(); }; @@ -128,7 +153,7 @@ namespace morphstore{ // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id){ - return vertices.get_vertex(id); + return vertices->get_vertex(id); } // function which returns a pointer to vertex by id @@ -177,7 +202,7 @@ namespace morphstore{ } void add_property_to_vertex(uint64_t id, const std::pair property) { - vertices.add_property_to_vertex(id, property); + vertices->add_property_to_vertex(id, property); }; void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { @@ -194,7 +219,7 @@ namespace morphstore{ virtual std::pair get_size_of_graph() const { // including vertices + its properties + its type dict - auto [index_size, data_size] = vertices.get_size(); + auto [index_size, data_size] = vertices->get_size(); // lookup type dicts for(const auto& rel : edgeTypeDictionary){ @@ -225,7 +250,7 @@ namespace morphstore{ this->expectedVertexCount = numberVertices; this->expectedEdgeCount = numberEdges; - vertices.allocate(numberVertices); + vertices->allocate(numberVertices); edges.reserve(numberEdges); edge_properties.reserve(numberEdges); @@ -239,14 +264,14 @@ namespace morphstore{ virtual void statistics(){ std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getVertexCount() << std::endl; - std::cout << "Number of vertices with properties:" << vertices.vertices_with_properties_count() << std::endl; + std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edge_properties.size() << std::endl; std::cout << "--------------------------------------------" << std::endl; } void print_vertex_by_id(uint64_t id) { - vertices.print_vertex_by_id(id); + vertices->print_vertex_by_id(id); std::cout << "\n"; std::cout << "#Edges: " << this->get_out_degree(id); std::cout << "\n"; @@ -273,7 +298,7 @@ namespace morphstore{ } void print_type_dicts(){ - vertices.print_type_dict(); + vertices->print_type_dict(); std::cout << "EdgeType-Dict: " << std::endl; for(auto const& rel : edgeTypeDictionary){ diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 93da5012..d0a95965 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -33,7 +33,7 @@ #include namespace morphstore{ - + enum VerticesContainerType {HashMapContainer, HashMapPtrContainer, VectorVectorContainer, VectorVectorPtrContainer, VectorArrayContainer}; class VerticesContainer { protected: std::map vertex_type_dictionary; @@ -53,7 +53,7 @@ namespace morphstore{ } public: - + virtual std::string container_description() const = 0; virtual void insert_vertex(Vertex v) = 0; virtual bool exists_vertex(const uint64_t id) const = 0; virtual uint64_t vertex_count() const = 0; diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index 461a237a..736e69b6 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -34,20 +34,24 @@ namespace morphstore{ class VerticesHashMapContainer : public VerticesContainer{ protected: - std::unordered_map> vertices; + std::unordered_map vertices; Vertex get_vertex_without_properties(uint64_t id) override{ - return *vertices[id]; + return vertices[id]; } public: + std::string container_description() const override { + return "unordered_map"; + } + void allocate(const uint64_t numberVertices) override { VerticesContainer::allocate(numberVertices); this->vertices.reserve(numberVertices); } void insert_vertex(const Vertex v) override { - vertices[v.getID()] = std::make_unique(v); + vertices[v.getID()] = v; } bool exists_vertex(const uint64_t id) const override { @@ -65,9 +69,9 @@ namespace morphstore{ auto [index_size, data_size] = VerticesContainer::get_size(); // container for indexes: - index_size += sizeof(std::unordered_map>); + index_size += sizeof(std::unordered_map); // index size of vertex: size of id and sizeof pointer - index_size += vertices.size() * (sizeof(uint64_t) + sizeof(std::unique_ptr)); + index_size += vertices.size() * sizeof(uint64_t); data_size += vertices.size() * Vertex::get_data_size_of_vertex(); diff --git a/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h b/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h new file mode 100644 index 00000000..dd57fdb3 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h @@ -0,0 +1,83 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices__hashmap_container.h + * @brief storing vertices using a hashmap + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H +#define MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H + +#include "vertex.h" +#include "vertices_container.h" + +#include +#include + +namespace morphstore{ + + class VerticesHashMapPtrContainer : public VerticesContainer{ + protected: + std::unordered_map> vertices; + + Vertex get_vertex_without_properties(uint64_t id) override{ + return *vertices[id]; + } + + public: + std::string container_description() const override { + return "unordered_map>"; + } + + void allocate(const uint64_t numberVertices) override { + VerticesContainer::allocate(numberVertices); + this->vertices.reserve(numberVertices); + } + + void insert_vertex(const Vertex v) override { + vertices[v.getID()] = std::make_unique(v); + } + + bool exists_vertex(const uint64_t id) const override { + if(vertices.find(id) == vertices.end()){ + return false; + } + return true; + } + + uint64_t vertex_count() const { + return vertices.size(); + } + + std::pair get_size() const override { + auto [index_size, data_size] = VerticesContainer::get_size(); + + // container for indexes: + index_size += sizeof(std::unordered_map>); + // index size of vertex: size of id and sizeof pointer + index_size += vertices.size() * (sizeof(uint64_t) + sizeof(std::unique_ptr)); + data_size += vertices.size() * Vertex::get_data_size_of_vertex(); + + + return {index_size, data_size}; + } + }; +} + +#endif //MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 1dfd4bde..3267aab3 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -67,8 +67,8 @@ namespace morphstore{ uint64_t array_number = get_vertex_vector_number(id); uint64_t pos_in_array = get_pos_in_array(id); - assert (pos_in_array < vertices_per_array); - assert (array_number < vertices.size()); + //assert (pos_in_array < vertices_per_array); + //assert (array_number < vertices.size()); return vertices.at(array_number)[pos_in_array]; } @@ -77,6 +77,10 @@ namespace morphstore{ // TODO: make array_size based on constructor //VerticesVectorArrayContainer(array_size) + std::string container_description() const override { + return "vector"; + } + void allocate(const uint64_t numberVertices) { VerticesContainer::allocate(numberVertices); current_array = allocate_vertex_array(); diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_container.h index cfd361c2..2645c413 100644 --- a/include/core/storage/graph/vertex/vertices_vectorvector_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorvector_container.h @@ -34,7 +34,7 @@ namespace morphstore{ - using vertex_vector_ptr = std::shared_ptr>>; + using vertex_vector_ptr = std::shared_ptr>; class VerticesVectorVectorContainer : public VerticesContainer{ protected: @@ -45,8 +45,8 @@ namespace morphstore{ static const inline uint64_t vertices_per_vector = vertex_vector_size / sizeof(Vertex); vertex_vector_ptr allocate_vertex_array() { - auto vertex_vector = std::make_shared>>(); - vertex_vector->reserve(vertex_vector_size / sizeof(Vertex)); + auto vertex_vector = std::make_shared>(); + vertex_vector->reserve(vertices_per_vector); vertices.push_back(vertex_vector); //std::cout << " Added a page" << std::endl; @@ -77,14 +77,19 @@ namespace morphstore{ std::cout.flush(); */ - assert (vector_number <= vertices.size()); - assert (pos_in_vector < vertices_per_vector); + //assert (vector_number <= vertices.size()); + //assert (pos_in_vector < vertices_per_vector); - return *vertices.at(vector_number)->at(pos_in_vector); + return vertices.at(vector_number)->at(pos_in_vector); } public: + + std::string container_description() const override { + return "vector>>"; + } + void allocate(const uint64_t numberVertices) { VerticesContainer::allocate(numberVertices); current_vector = allocate_vertex_array(); @@ -96,7 +101,7 @@ namespace morphstore{ current_vector = allocate_vertex_array(); } - current_vector->push_back(std::make_shared(v)); + current_vector->push_back(v); number_of_vertices++;; } @@ -119,7 +124,6 @@ namespace morphstore{ index_size += vertices.size() * sizeof(vertex_vector_ptr); for(auto vector: vertices) { - index_size += vector->size() * sizeof(std::shared_ptr); data_size += vector->size() * Vertex::get_data_size_of_vertex(); } diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h new file mode 100644 index 00000000..d12cf650 --- /dev/null +++ b/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h @@ -0,0 +1,136 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file vertices__vectorvector_ptr_container.h + * @brief storing vertices using a vector of vectors + * @todo +*/ + +#ifndef MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H +#define MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H + +#include "vertex.h" +#include "vertices_container.h" + +#include +#include +#include +#include + +namespace morphstore{ + + using vertex_vector_ptr_ptr = std::shared_ptr>>; + + class VerticesVectorVectorPtrContainer : public VerticesContainer{ + protected: + std::vector vertices; + uint64_t number_of_vertices = 0; + vertex_vector_ptr_ptr current_vector; + static const inline uint64_t vertex_vector_size = 4096; + static const inline uint64_t vertices_per_vector = vertex_vector_size / sizeof(std::shared_ptr); + + vertex_vector_ptr_ptr allocate_vertex_array() { + auto vertex_vector = std::make_shared>>(); + vertex_vector->reserve(vertices_per_vector); + vertices.push_back(vertex_vector); + + //std::cout << " Added a page" << std::endl; + //std::cout.flush(); + + return vertex_vector; + } + + inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { + return vertex_id / vertex_vector_size; + } + + inline uint64_t get_pos_in_vector(uint64_t vertex_id) const { + return vertex_id % vertices_per_vector; + } + + + Vertex get_vertex_without_properties(uint64_t id) override { + uint64_t vector_number = get_vertex_vector_number(id); + uint64_t pos_in_vector = get_pos_in_vector(id); + + /*std::cout << " id: " << id + << " vectors_number: " << vector_number + << " pos in vector: " << pos_in_vector + << " max_pos_in_vector: " << pos_in_vector + << " max_pos_in_vector: " << vertices_per_vector + << " number of vectors: " << vertices.size() << std::endl; + std::cout.flush(); + */ + + //assert (vector_number <= vertices.size()); + // assert (pos_in_vector < vertices_per_vector); + + return *vertices.at(vector_number)->at(pos_in_vector); + } + + + public: + + std::string container_description() const override { + return "vector>>>"; + } + + void allocate(const uint64_t numberVertices) { + VerticesContainer::allocate(numberVertices); + current_vector = allocate_vertex_array(); + } + + void insert_vertex(Vertex v) { + // equals current array is full + if (current_vector->size() == vertices_per_vector) { + current_vector = allocate_vertex_array(); + } + + current_vector->push_back(std::make_shared(v)); + number_of_vertices++;; + } + + bool exists_vertex(const uint64_t id) const override { + // !assumes no deletion (should be replaced when an id-index exists) + return number_of_vertices > id; + } + + uint64_t vertex_count() const override { + return number_of_vertices; + } + + std::pair get_size() const override { + auto [index_size, data_size] = VerticesContainer::get_size(); + + // vector_count, current vertex_vector + index_size += 2 * sizeof(uint64_t); + + index_size += sizeof(std::vector); + index_size += vertices.size() * sizeof(vertex_vector_ptr_ptr); + + for(auto vector: vertices) { + index_size += vector->size() * sizeof(std::shared_ptr); + data_size += vector->size() * Vertex::get_data_size_of_vertex(); + } + + return std::make_pair(index_size, data_size); + } + }; +} + +#endif //MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H \ No newline at end of file diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 1c94c857..08775da2 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -21,71 +21,92 @@ */ #include +#include #include #include +#include typedef std::chrono::high_resolution_clock highResClock; using namespace morphstore; -int64_t getDuration(std::chrono::time_point start) { +int64_t get_duration(std::chrono::time_point start) { auto stop = highResClock::now(); return std::chrono::duration_cast(stop - start).count(); } +int64_t get_median(std::vector values) { + assert(values.size() > 0); + std::nth_element(values.begin(), values.begin() + values.size()/2, values.end()); + return values[values.size()/2]; +} + int main(void) { // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function int number_of_executions = 5; - std::cout << "Test vertex storage structure (avg of 5 for full_iterate and random access) times in μs" << std::endl; - std::cout << "vertex_count | loading time | full_iterate | 10^4 random access" << std::endl; + std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access) times in μs" << std::endl; + std::cout << "Container type | vertex_count | loading time | full_iterate | 10^4 random access" << std::endl; + + std::vector storage_types = { + VerticesContainerType::HashMapContainer, + VerticesContainerType::HashMapPtrContainer, + VerticesContainerType::VectorArrayContainer, + VerticesContainerType::VectorVectorPtrContainer, + VerticesContainerType::VectorVectorContainer}; - for(int vertex_count=10000; vertex_count < 100000000; vertex_count = vertex_count*10) { - int64_t duration = 0; - - std::cout << vertex_count << " | "; - std::unique_ptr graph = std::make_unique(); + for (auto storage_type : storage_types) { + for (int vertex_count = 10000; vertex_count < 100000000; + vertex_count = vertex_count * 10) { + std::unique_ptr graph = std::make_unique(storage_type); graph->allocate_graph_structure(vertex_count, 0); + + std::string measurement_entry = + graph->vertices_container_description() + " | "; + measurement_entry += std::to_string(vertex_count) + " | "; + auto start = highResClock::now(); - for(int i=0; i < vertex_count; i++) { - graph->add_vertex(i); + for (int i = 0; i < vertex_count; i++) { + graph->add_vertex(i); } - std::cout << getDuration(start) << " | "; - - duration = 0; - - for(int exec=0; exec < number_of_executions; exec++) { - auto start = highResClock::now(); - // iterate - for(int i=0; i < vertex_count; i++) { - graph->get_vertex(i); - } - duration += getDuration(start); - } + measurement_entry += std::to_string(get_duration(start)) + " | "; + + std::vector durations; - std::cout << duration / number_of_executions << " | "; + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + // iterate + for (int i = 0; i < vertex_count; i++) { + graph->get_vertex(i); + } + durations.push_back(get_duration(start)); + } + measurement_entry += std::to_string(get_median(durations)) + " | "; // random access - duration = 0; + durations.clear(); - for(int exec=0; exec < number_of_executions; exec++) { - std::random_device rd; - std::uniform_int_distribution dist(0, vertex_count - 1); + for (int exec = 0; exec < number_of_executions; exec++) { + std::random_device rd; + std::uniform_int_distribution dist(0, vertex_count - 1); - auto start = highResClock::now(); + auto start = highResClock::now(); - for(int i=0; i < 10000; i++) { - graph->get_vertex(dist(rd)); - } + for (int i = 0; i < 10000; i++) { + graph->get_vertex(dist(rd)); + } - duration += getDuration(start); + durations.push_back(get_duration(start)); } - std::cout << duration / number_of_executions << std::endl; + measurement_entry += std::to_string(get_median(durations)); + + std::cout << measurement_entry << std::endl; + } } return 0; From af15891d7dcefe506f31b1f4330bb9efaf81d9f5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sat, 11 Apr 2020 16:34:38 +0200 Subject: [PATCH 121/216] Extend vertices container benchmark * more measurements * same random access for each vertices container --- .../graph/vertex_storage_benchmark.cpp | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 08775da2..91d3fc66 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -46,8 +46,8 @@ int main(void) { int number_of_executions = 5; - std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access) times in μs" << std::endl; - std::cout << "Container type | vertex_count | loading time | full_iterate | 10^4 random access" << std::endl; + std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; + std::cout << "Container type | vertex_count | loading time in μs | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; std::vector storage_types = { VerticesContainerType::HashMapContainer, @@ -56,9 +56,17 @@ int main(void) { VerticesContainerType::VectorVectorPtrContainer, VerticesContainerType::VectorVectorContainer}; - for (auto storage_type : storage_types) { - for (int vertex_count = 10000; vertex_count < 100000000; - vertex_count = vertex_count * 10) { + std::vector vertex_counts = {10000, 100000, 1000000, 2000000, 5000000, 10000000, 15000000}; + + for (int vertex_count: vertex_counts) { + std::random_device rd; + std::uniform_int_distribution dist(0, vertex_count - 1); + std::vector random_accesses; + for (int i = 0; i < vertex_count; i++) { + random_accesses.push_back(dist(rd)); + } + + for (auto storage_type : storage_types) { std::unique_ptr graph = std::make_unique(storage_type); graph->allocate_graph_structure(vertex_count, 0); @@ -91,13 +99,10 @@ int main(void) { durations.clear(); for (int exec = 0; exec < number_of_executions; exec++) { - std::random_device rd; - std::uniform_int_distribution dist(0, vertex_count - 1); - auto start = highResClock::now(); - for (int i = 0; i < 10000; i++) { - graph->get_vertex(dist(rd)); + for (int random_pos : random_accesses) { + graph->get_vertex(random_pos); } durations.push_back(get_duration(start)); From 257007503f373795bee9bb642e9ba243bc5e8045 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 13 Apr 2020 19:23:08 +0200 Subject: [PATCH 122/216] Provide a default init for property data_type as otherwise could not be compiled in release mode --- include/core/storage/graph/ldbc_import.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index edef6a68..632d0c39 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -499,7 +499,7 @@ namespace morphstore{ if(!get_vertex_type_number(targetVertexType).has_value()) { // Multi-value-attributes: just take the last recently one std::string propertyKey; - Ldbc_Data_Type data_type; + Ldbc_Data_Type data_type = Ldbc_Data_Type::STRING; std::unordered_map multiValueAttr; uint64_t systemID; property_type value; From 00395a87b7868531bc57bf0b0db1601799aac10b Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 14 Apr 2020 12:19:16 +0200 Subject: [PATCH 123/216] Add NDEBUG flag in order to skip asserts in release mode --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18a8d0da..3864fd73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ IF(CMAKE_BUILD_TYPE MATCHES Debug) message(STATUS "MorphStore is configured in DEBUG mode.") ELSEIF(CMAKE_BUILD_TYPE MATCHES Release) morph_flag(-O2) + morph_flag(-DNDEBUG) message(STATUS "MorphStore is configured in RELEASE mode.") ELSEIF(CMAKE_BUILD_TYPE MATCHES HighPerf) morph_flag(-O3) From 708bc46f7020afcee014b39d45f3df7125552bde Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 12:33:13 +0200 Subject: [PATCH 124/216] Fix bfs test as it did not include static part of the graph --- test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index d5922855..0f5ff7eb 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -38,10 +38,11 @@ template void bfs_ldbc_graph_test (void) { static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - + std::string sourceDir = ""; std::string targetDir = ""; + if (sourceDir.empty()) { throw std::invalid_argument("Where are the ldbc files??"); } @@ -67,7 +68,7 @@ void bfs_ldbc_graph_test (void) { graph->statistics(); auto bfs = std::make_unique(graph); - - assert(bfs->do_BFS(0) == 229144); + // for scale factor 1 and including static as well as dynamic part of the graph + std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; //bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); } \ No newline at end of file From fd88b415b5164380ff840ee281b9792f5ee93dbc Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 14 Apr 2020 14:33:10 +0200 Subject: [PATCH 125/216] Use free instead of delete[] as memory was allocated using malloc and not new --- include/core/storage/graph/ldbc_import.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 632d0c39..1efb6533 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -260,7 +260,7 @@ namespace morphstore{ } } - delete[] buffer; // free memory + free(buffer); // free memory vertexFile.close(); ++vertexTypeNumber; @@ -379,7 +379,7 @@ namespace morphstore{ } - delete[] buffer; // free memory + free(buffer); // free memory edgeFile.close(); } @@ -440,7 +440,7 @@ namespace morphstore{ } } - delete[] buffer; // free memory + free(buffer); // free memory vertexFile.close(); } return result; @@ -608,7 +608,7 @@ namespace morphstore{ } } } - delete[] buffer; // free memory + free(buffer); // free memory edgeFile.close(); } // graph gets full edge-type-list here: From 40f3727029289699d32cb0bbcdd2de61518e7765 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 14 Apr 2020 14:43:08 +0200 Subject: [PATCH 126/216] Add method stub for graph compression --- include/core/storage/graph/formats/adjacencylist.h | 4 ++++ include/core/storage/graph/formats/csr.h | 4 ++++ include/core/storage/graph/graph.h | 1 + 3 files changed, 9 insertions(+) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 0fa6d360..db3b9a0e 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -109,6 +109,10 @@ namespace morphstore{ return targetVertexIds; } + void compress() override { + std::cout << "Compressing graph format specific data structures"; + } + // for measuring the size in bytes: std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 383da7cd..459cfa56 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -136,6 +136,10 @@ namespace morphstore{ return targetVertexIds; } + void compress() override { + std::cout << "Compressing graph format specific data structures"; + } + // get size of storage format: std::pair get_size_of_graph() const override { diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index fca88054..442c633b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -214,6 +214,7 @@ namespace morphstore{ virtual std::string get_storage_format() const = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; + virtual void compress() = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; From e761a8870a01ab43d563b735feb2837cd4f11ec5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 14 Apr 2020 23:52:12 +0200 Subject: [PATCH 127/216] Use uncompressed columns in the CSR format --- include/core/storage/graph/formats/csr.h | 67 +++++++++++++----------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 459cfa56..db222f6a 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -26,6 +26,8 @@ #include "../graph.h" #include "../vertex/vertex.h" +#include + #include #include @@ -35,20 +37,15 @@ namespace morphstore{ private: /* graph topology: - * offset array: index is vertex-id; array cell contains offset in edgeId array - * edgeId array: contains edge id + * offset column: index is vertex-id; column entry contains offset in edgeId array + * edgeId column: contains edge id */ - uint64_t* offset_array = nullptr; - uint64_t* edgeId_array = nullptr; + column* offset_column; + column* edgeId_column; public: CSR(VerticesContainerType vertices_container_type = VectorArrayContainer) : Graph(vertices_container_type) {} - ~CSR() { - free(offset_array); - free(edgeId_array); - } - std::string get_storage_format() const override { return "CSR"; } @@ -57,11 +54,14 @@ namespace morphstore{ void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); - offset_array = (uint64_t*) malloc(numberVertices * sizeof(uint64_t)); - edgeId_array = (uint64_t*) malloc(numberEdges * sizeof(uint64_t)); + offset_column = column::create_global_column(numberVertices * sizeof(uint64_t)); + offset_column->set_count_values(numberVertices); + edgeId_column = column::create_global_column(numberEdges * sizeof(uint64_t)); + edgeId_column->set_count_values(numberEdges); // init node array: - offset_array[0] = 0; + uint64_t* offset_data = offset_column->get_data(); + offset_data[0] = 0; } // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy @@ -73,7 +73,8 @@ namespace morphstore{ // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); - uint64_t offset = offset_array[sourceID]; + uint64_t* offset_data = offset_column->get_data(); + uint64_t offset = offset_data[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); if (!vertices->exists_vertex(sourceID)) { @@ -81,31 +82,37 @@ namespace morphstore{ } // fill the arrays + // TODO: fill array using memcpy? (put edgeIds into vector as prerpare step) + uint64_t* edgeId_data = edgeId_column->get_data(); for(const auto& edge : edgesToAdd){ std::shared_ptr ePtr = std::make_shared(edge); if(!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found " + edge.to_string()); } edges[ePtr->getId()] = ePtr; - edgeId_array[offset] = ePtr->getId(); + + edgeId_data[offset] = ePtr->getId(); ++offset; } // to avoid buffer overflow: if(sourceID < getExpectedVertexCount()-1){ - offset_array[sourceID+1] = nextOffset; + offset_data[sourceID+1] = nextOffset; } } // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) override { - uint64_t offset = offset_array[id]; + uint64_t* offset_data = offset_column->get_data(); + uint64_t offset = offset_data[id]; // special case: last vertex id has no next offset uint64_t nextOffset; + + // todo: `getExpectedVertexCount()` could be replaced by `offset_column->get_count_values()` if(id == getExpectedVertexCount() -1){ nextOffset = getExpectedEdgeCount(); }else{ - nextOffset = offset_array[id+1]; + nextOffset = offset_data[id+1]; } if(offset == nextOffset) return 0; @@ -116,12 +123,15 @@ namespace morphstore{ // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { std::vector neighbourEdgeIds; - uint64_t offset = offset_array[id]; + uint64_t* offset_data = offset_column->get_data(); + uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); // avoiding out of bounds ... + // TODO: use assert here, as this is only out of bounds if the offset if( offset < getExpectedEdgeCount()){ - neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_array+offset, edgeId_array+offset+numberEdges); + uint64_t* edgeId_data = edgeId_column->get_data(); + neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_data+offset, edgeId_data+offset+numberEdges); } std::vector targetVertexIds; @@ -138,21 +148,16 @@ namespace morphstore{ void compress() override { std::cout << "Compressing graph format specific data structures"; + // TODO: need a way to change column format } // get size of storage format: std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); - // might be only valid for the uncompressed case - // pointer to arrays: - index_size += sizeof(uint64_t*) * 2 + sizeof(Edge*); - // edgeId array values: - index_size += getExpectedEdgeCount() * sizeof(uint64_t); - - // offset array values: - index_size += getExpectedVertexCount() * sizeof(uint64_t); + index_size += edgeId_column->get_size_used_byte(); + index_size += offset_column->get_size_used_byte(); return {index_size, data_size}; } @@ -160,11 +165,13 @@ namespace morphstore{ // for debugging: void print_neighbors_of_vertex(uint64_t id) override{ std::cout << "Neighbours for Vertex with id " << id << std::endl; - uint64_t offset = offset_array[id]; + uint64_t* offset_data = offset_column->get_data(); + uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); - + + uint64_t* edgeId_data = edgeId_column->get_data(); for(uint64_t i = offset; i < offset+numberEdges; ++i){ - uint64_t edgeId = edgeId_array[i]; + uint64_t edgeId = edgeId_data[i]; print_edge_by_id(edgeId); } } From 1df8e32f0686dfce33fb06a4b0fd0e1f4a120357 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 17 Apr 2020 17:16:23 +0200 Subject: [PATCH 128/216] Delete redundant vertices container classes only keeping one hashmap and one vector impl --- .../storage/graph/formats/adjacencylist.h | 2 +- include/core/storage/graph/formats/csr.h | 7 +- include/core/storage/graph/graph.h | 12 -- .../storage/graph/vertex/vertices_container.h | 2 +- .../vertex/vertices_hashmap_ptr_container.h | 83 ----------- .../vertex/vertices_vectorarray_container.h | 15 +- .../vertex/vertices_vectorvector_container.h | 135 ----------------- .../vertices_vectorvector_ptr_container.h | 136 ------------------ .../graph/vertex_storage_benchmark.cpp | 6 +- 9 files changed, 18 insertions(+), 380 deletions(-) delete mode 100644 include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h delete mode 100644 include/core/storage/graph/vertex/vertices_vectorvector_container.h delete mode 100644 include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index db3b9a0e..88c794b4 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -38,7 +38,7 @@ namespace morphstore{ std::unordered_map>> adjacencylistPerVertex; public: - AdjacencyList(VerticesContainerType vertices_container_type = VectorArrayContainer) : Graph(vertices_container_type) {} + AdjacencyList(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} std::string get_storage_format() const override { return "Adjacency_List"; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index db222f6a..93ce4ba6 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -44,7 +44,12 @@ namespace morphstore{ column* edgeId_column; public: - CSR(VerticesContainerType vertices_container_type = VectorArrayContainer) : Graph(vertices_container_type) {} + CSR(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} + + ~CSR() { + delete offset_column; + delete edgeId_column; + } std::string get_storage_format() const override { return "CSR"; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 442c633b..74a8a7ff 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -26,9 +26,6 @@ #include "vertex/vertex.h" #include "vertex/vertices_hashmap_container.h" -#include "vertex/vertices_hashmap_ptr_container.h" -#include "vertex/vertices_vectorvector_container.h" -#include "vertex/vertices_vectorvector_ptr_container.h" #include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" #include "property_type.h" @@ -87,18 +84,9 @@ namespace morphstore{ case VerticesContainerType::VectorArrayContainer: vertices = std::make_unique(); break; - case VerticesContainerType::VectorVectorContainer: - vertices = std::make_unique(); - break; - case VerticesContainerType::VectorVectorPtrContainer: - vertices = std::make_unique(); - break; case VerticesContainerType::HashMapContainer: vertices = std::make_unique(); break; - case VerticesContainerType::HashMapPtrContainer: - vertices = std::make_unique(); - break; } } diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index d0a95965..9270452f 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -33,7 +33,7 @@ #include namespace morphstore{ - enum VerticesContainerType {HashMapContainer, HashMapPtrContainer, VectorVectorContainer, VectorVectorPtrContainer, VectorArrayContainer}; + enum VerticesContainerType {HashMapContainer, VectorArrayContainer}; class VerticesContainer { protected: std::map vertex_type_dictionary; diff --git a/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h b/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h deleted file mode 100644 index dd57fdb3..00000000 --- a/include/core/storage/graph/vertex/vertices_hashmap_ptr_container.h +++ /dev/null @@ -1,83 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file vertices__hashmap_container.h - * @brief storing vertices using a hashmap - * @todo -*/ - -#ifndef MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H -#define MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H - -#include "vertex.h" -#include "vertices_container.h" - -#include -#include - -namespace morphstore{ - - class VerticesHashMapPtrContainer : public VerticesContainer{ - protected: - std::unordered_map> vertices; - - Vertex get_vertex_without_properties(uint64_t id) override{ - return *vertices[id]; - } - - public: - std::string container_description() const override { - return "unordered_map>"; - } - - void allocate(const uint64_t numberVertices) override { - VerticesContainer::allocate(numberVertices); - this->vertices.reserve(numberVertices); - } - - void insert_vertex(const Vertex v) override { - vertices[v.getID()] = std::make_unique(v); - } - - bool exists_vertex(const uint64_t id) const override { - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } - - uint64_t vertex_count() const { - return vertices.size(); - } - - std::pair get_size() const override { - auto [index_size, data_size] = VerticesContainer::get_size(); - - // container for indexes: - index_size += sizeof(std::unordered_map>); - // index size of vertex: size of id and sizeof pointer - index_size += vertices.size() * (sizeof(uint64_t) + sizeof(std::unique_ptr)); - data_size += vertices.size() * Vertex::get_data_size_of_vertex(); - - - return {index_size, data_size}; - } - }; -} - -#endif //MORPHSTORE_VERTICES_HASHMAP_PTR_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 3267aab3..640ebe97 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -77,6 +77,14 @@ namespace morphstore{ // TODO: make array_size based on constructor //VerticesVectorArrayContainer(array_size) + ~VerticesVectorArrayContainer() { + // TODO: find memory leak (destructor seems not to be called) + std::cout << "freeing vertex pages"; + for (auto array_pointer : this->vertices) { + free(array_pointer); + } + } + std::string container_description() const override { return "vector"; } @@ -121,13 +129,6 @@ namespace morphstore{ return std::make_pair(index_size, data_size); } - - ~VerticesVectorArrayContainer() { - //std::cout << "freeing vertex pages"; - for (auto array_pointer : this->vertices) { - free(array_pointer); - } - } }; } diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_container.h deleted file mode 100644 index 2645c413..00000000 --- a/include/core/storage/graph/vertex/vertices_vectorvector_container.h +++ /dev/null @@ -1,135 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file vertices__vectorvector_container.h - * @brief storing vertices using a vector of vectors - * @todo -*/ - -#ifndef MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H -#define MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H - -#include "vertex.h" -#include "vertices_container.h" - -#include -#include -#include -#include - -namespace morphstore{ - - using vertex_vector_ptr = std::shared_ptr>; - - class VerticesVectorVectorContainer : public VerticesContainer{ - protected: - std::vector vertices; - uint64_t number_of_vertices = 0; - vertex_vector_ptr current_vector; - static const inline uint64_t vertex_vector_size = 4096; - static const inline uint64_t vertices_per_vector = vertex_vector_size / sizeof(Vertex); - - vertex_vector_ptr allocate_vertex_array() { - auto vertex_vector = std::make_shared>(); - vertex_vector->reserve(vertices_per_vector); - vertices.push_back(vertex_vector); - - //std::cout << " Added a page" << std::endl; - //std::cout.flush(); - - return vertex_vector; - } - - inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { - return vertex_id / vertex_vector_size; - } - - inline uint64_t get_pos_in_vector(uint64_t vertex_id) const { - return vertex_id % vertices_per_vector; - } - - - Vertex get_vertex_without_properties(uint64_t id) override { - uint64_t vector_number = get_vertex_vector_number(id); - uint64_t pos_in_vector = get_pos_in_vector(id); - - /*std::cout << " id: " << id - << " vectors_number: " << vector_number - << " pos in vector: " << pos_in_vector - << " max_pos_in_vector: " << pos_in_vector - << " max_pos_in_vector: " << vertices_per_vector - << " number of vectors: " << vertices.size() << std::endl; - std::cout.flush(); - */ - - //assert (vector_number <= vertices.size()); - //assert (pos_in_vector < vertices_per_vector); - - return vertices.at(vector_number)->at(pos_in_vector); - } - - - public: - - std::string container_description() const override { - return "vector>>"; - } - - void allocate(const uint64_t numberVertices) { - VerticesContainer::allocate(numberVertices); - current_vector = allocate_vertex_array(); - } - - void insert_vertex(Vertex v) { - // equals current array is full - if (current_vector->size() == vertices_per_vector) { - current_vector = allocate_vertex_array(); - } - - current_vector->push_back(v); - number_of_vertices++;; - } - - bool exists_vertex(const uint64_t id) const override { - // !assumes no deletion (should be replaced when an id-index exists) - return number_of_vertices > id; - } - - uint64_t vertex_count() const override { - return number_of_vertices; - } - - std::pair get_size() const override { - auto [index_size, data_size] = VerticesContainer::get_size(); - - // vector_count, current vertex_vector - index_size += 2 * sizeof(uint64_t); - - index_size += sizeof(std::vector); - index_size += vertices.size() * sizeof(vertex_vector_ptr); - - for(auto vector: vertices) { - data_size += vector->size() * Vertex::get_data_size_of_vertex(); - } - - return std::make_pair(index_size, data_size); - } - }; -} - -#endif //MORPHSTORE_VERTICES_VECTORVECTOR_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h b/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h deleted file mode 100644 index d12cf650..00000000 --- a/include/core/storage/graph/vertex/vertices_vectorvector_ptr_container.h +++ /dev/null @@ -1,136 +0,0 @@ -/********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * - * * - * This file is part of MorphStore - a compression aware vectorized column store. * - * * - * This program is free software: you can redistribute it and/or modify it under the * - * terms of the GNU General Public License as published by the Free Software Foundation, * - * either version 3 of the License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * - * See the GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License along with this program. * - * If not, see . * - **********************************************************************************************/ - -/** - * @file vertices__vectorvector_ptr_container.h - * @brief storing vertices using a vector of vectors - * @todo -*/ - -#ifndef MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H -#define MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H - -#include "vertex.h" -#include "vertices_container.h" - -#include -#include -#include -#include - -namespace morphstore{ - - using vertex_vector_ptr_ptr = std::shared_ptr>>; - - class VerticesVectorVectorPtrContainer : public VerticesContainer{ - protected: - std::vector vertices; - uint64_t number_of_vertices = 0; - vertex_vector_ptr_ptr current_vector; - static const inline uint64_t vertex_vector_size = 4096; - static const inline uint64_t vertices_per_vector = vertex_vector_size / sizeof(std::shared_ptr); - - vertex_vector_ptr_ptr allocate_vertex_array() { - auto vertex_vector = std::make_shared>>(); - vertex_vector->reserve(vertices_per_vector); - vertices.push_back(vertex_vector); - - //std::cout << " Added a page" << std::endl; - //std::cout.flush(); - - return vertex_vector; - } - - inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { - return vertex_id / vertex_vector_size; - } - - inline uint64_t get_pos_in_vector(uint64_t vertex_id) const { - return vertex_id % vertices_per_vector; - } - - - Vertex get_vertex_without_properties(uint64_t id) override { - uint64_t vector_number = get_vertex_vector_number(id); - uint64_t pos_in_vector = get_pos_in_vector(id); - - /*std::cout << " id: " << id - << " vectors_number: " << vector_number - << " pos in vector: " << pos_in_vector - << " max_pos_in_vector: " << pos_in_vector - << " max_pos_in_vector: " << vertices_per_vector - << " number of vectors: " << vertices.size() << std::endl; - std::cout.flush(); - */ - - //assert (vector_number <= vertices.size()); - // assert (pos_in_vector < vertices_per_vector); - - return *vertices.at(vector_number)->at(pos_in_vector); - } - - - public: - - std::string container_description() const override { - return "vector>>>"; - } - - void allocate(const uint64_t numberVertices) { - VerticesContainer::allocate(numberVertices); - current_vector = allocate_vertex_array(); - } - - void insert_vertex(Vertex v) { - // equals current array is full - if (current_vector->size() == vertices_per_vector) { - current_vector = allocate_vertex_array(); - } - - current_vector->push_back(std::make_shared(v)); - number_of_vertices++;; - } - - bool exists_vertex(const uint64_t id) const override { - // !assumes no deletion (should be replaced when an id-index exists) - return number_of_vertices > id; - } - - uint64_t vertex_count() const override { - return number_of_vertices; - } - - std::pair get_size() const override { - auto [index_size, data_size] = VerticesContainer::get_size(); - - // vector_count, current vertex_vector - index_size += 2 * sizeof(uint64_t); - - index_size += sizeof(std::vector); - index_size += vertices.size() * sizeof(vertex_vector_ptr_ptr); - - for(auto vector: vertices) { - index_size += vector->size() * sizeof(std::shared_ptr); - data_size += vector->size() * Vertex::get_data_size_of_vertex(); - } - - return std::make_pair(index_size, data_size); - } - }; -} - -#endif //MORPHSTORE_VERTICES_VECTORVECTOR_PTR_CONTAINER_H \ No newline at end of file diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 91d3fc66..4a9fe226 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -51,10 +51,8 @@ int main(void) { std::vector storage_types = { VerticesContainerType::HashMapContainer, - VerticesContainerType::HashMapPtrContainer, - VerticesContainerType::VectorArrayContainer, - VerticesContainerType::VectorVectorPtrContainer, - VerticesContainerType::VectorVectorContainer}; + VerticesContainerType::VectorArrayContainer + }; std::vector vertex_counts = {10000, 100000, 1000000, 2000000, 5000000, 10000000, 15000000}; From 2c3dc0a170685bcd14cd27838f2420d0c4504236 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 17 Apr 2020 17:32:55 +0200 Subject: [PATCH 129/216] Use unique Pointers for csr columns --- include/core/storage/graph/formats/csr.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 93ce4ba6..bea76c41 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -40,17 +40,12 @@ namespace morphstore{ * offset column: index is vertex-id; column entry contains offset in edgeId array * edgeId column: contains edge id */ - column* offset_column; - column* edgeId_column; + std::unique_ptr> offset_column; + std::unique_ptr> edgeId_column; public: CSR(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} - ~CSR() { - delete offset_column; - delete edgeId_column; - } - std::string get_storage_format() const override { return "CSR"; } @@ -59,9 +54,9 @@ namespace morphstore{ void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); - offset_column = column::create_global_column(numberVertices * sizeof(uint64_t)); + offset_column = std::make_unique>(numberVertices * sizeof(uint64_t)); offset_column->set_count_values(numberVertices); - edgeId_column = column::create_global_column(numberEdges * sizeof(uint64_t)); + edgeId_column = std::make_unique>(numberEdges * sizeof(uint64_t)); edgeId_column->set_count_values(numberEdges); // init node array: From c136cadffee86e6b1b8c15a519dd58262f50c9f9 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 17 Apr 2020 18:19:58 +0200 Subject: [PATCH 130/216] Move vertex id gen inside of vertices container --- include/core/storage/graph/graph.h | 12 ++---------- .../core/storage/graph/vertex/vertices_container.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 74a8a7ff..713e3133 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -50,10 +50,9 @@ namespace morphstore{ uint64_t expectedVertexCount; uint64_t expectedEdgeCount; - mutable uint64_t currentMaxVertexId = 0; - std::unique_ptr vertices; + // Todo: use a EdgesContainer for edges and edge_properties (very similar to vertices Container) std::unordered_map> edges; std::unordered_map> edge_properties; @@ -70,11 +69,6 @@ namespace morphstore{ } return true; } - - // TODO: put this into vertex container? - uint64_t getNextVertexId() const { - return currentMaxVertexId++; - } public: // -------------------- Setters & Getters -------------------- @@ -125,9 +119,7 @@ namespace morphstore{ uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { assert(expectedVertexCount > getVertexCount()); - Vertex v = Vertex(getNextVertexId(), type); - vertices->add_vertex(v, props); - return v.getID(); + return vertices->add_vertex(type, props); }; std::string get_edgeType_by_number(unsigned short int type){ diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 9270452f..299d725d 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -36,6 +36,8 @@ namespace morphstore{ enum VerticesContainerType {HashMapContainer, VectorArrayContainer}; class VerticesContainer { protected: + uint64_t currentMaxVertexId = 0; + std::map vertex_type_dictionary; // TODO: try other property storage formats than per node .. (triple-store or per property) @@ -52,6 +54,10 @@ namespace morphstore{ } } + uint64_t getNextVertexId() { + return currentMaxVertexId++; + } + public: virtual std::string container_description() const = 0; virtual void insert_vertex(Vertex v) = 0; @@ -63,11 +69,14 @@ namespace morphstore{ vertex_properties.reserve(numberVertices); } - void add_vertex(Vertex v, const std::unordered_map properties = {}) { + uint64_t add_vertex(const unsigned short int type, const std::unordered_map properties = {}) { + Vertex v = Vertex(getNextVertexId(), type); insert_vertex(v); if (!properties.empty()) { vertex_properties.insert(std::make_pair(v.getID(), properties)); } + + return v.getID(); } void add_property_to_vertex(uint64_t id, const std::pair property) { From d7ab8e459027b389a1086e170dd3ace1e3c9545c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 17 Apr 2020 18:38:09 +0200 Subject: [PATCH 131/216] Add memory usage to vertices container benchmark --- .../graph/vertex/vertices_vectorarray_container.h | 2 +- src/microbenchmarks/graph/vertex_storage_benchmark.cpp | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 640ebe97..59ee4af6 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -127,7 +127,7 @@ namespace morphstore{ // allocated memory for vertices data_size += vertices.size() * Vertex::get_data_size_of_vertex() * vertices_per_array; - return std::make_pair(index_size, data_size); + return {index_size, data_size}; } }; } diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 4a9fe226..0879f850 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -47,7 +47,7 @@ int main(void) { int number_of_executions = 5; std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; - std::cout << "Container type | vertex_count | loading time in μs | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; + std::cout << "Container type | vertex_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; std::vector storage_types = { VerticesContainerType::HashMapContainer, @@ -76,11 +76,17 @@ int main(void) { for (int i = 0; i < vertex_count; i++) { graph->add_vertex(i); } - + // loading time measurement_entry += std::to_string(get_duration(start)) + " | "; + // size + auto [index_size, data_size] = graph->get_size_of_graph(); + measurement_entry += std::to_string(index_size + data_size) + " | "; + + std::vector durations; + // full iterate for (int exec = 0; exec < number_of_executions; exec++) { auto start = highResClock::now(); // iterate From 1c0251bc971b4561d469e25f1347e5a418e99877 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 20 Apr 2020 18:35:18 +0200 Subject: [PATCH 132/216] Init compression of csr columns --- .../storage/graph/formats/adjacencylist.h | 6 +- include/core/storage/graph/formats/csr.h | 77 +++++++++++++++++-- include/core/storage/graph/graph.h | 28 ++++++- .../storage/graph/vertex/vertices_container.h | 2 +- .../storage/graph/simple/simple_graph_test.h | 4 +- 5 files changed, 103 insertions(+), 14 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 88c794b4..9a2d0f49 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -109,8 +109,10 @@ namespace morphstore{ return targetVertexIds; } - void compress() override { - std::cout << "Compressing graph format specific data structures"; + void compress(GraphCompressionFormat target_format) override { + std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; + + //this->current_compression = target_format; } // for measuring the size in bytes: diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index bea76c41..26e598b6 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -27,6 +27,12 @@ #include "../graph.h" #include "../vertex/vertex.h" #include +#include +#include +#include +#include +#include +#include #include #include @@ -54,10 +60,13 @@ namespace morphstore{ void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); - offset_column = std::make_unique>(numberVertices * sizeof(uint64_t)); - offset_column->set_count_values(numberVertices); - edgeId_column = std::make_unique>(numberEdges * sizeof(uint64_t)); - edgeId_column->set_count_values(numberEdges); + const size_t offset_size = numberVertices * sizeof(uint64_t); + offset_column = std::make_unique>(offset_size); + offset_column->set_meta_data(numberVertices, offset_size); + + const size_t edge_ids_size = numberEdges * sizeof(uint64_t); + edgeId_column = std::make_unique>(edge_ids_size); + edgeId_column->set_meta_data(numberEdges, edge_ids_size); // init node array: uint64_t* offset_data = offset_column->get_data(); @@ -146,9 +155,53 @@ namespace morphstore{ return targetVertexIds; } - void compress() override { - std::cout << "Compressing graph format specific data structures"; - // TODO: need a way to change column format + void compress(GraphCompressionFormat target_format) override { + std::cout << "Compressing graph format specific data structures via " << to_string(target_format) << std::endl; + + + if (current_compression == target_format) { + std::cout << "Already in " << to_string(target_format); + return; + } + + // TODO: allow also other vector extensions (switch from safe_morph to morph) + // example layout: dynamic_vbp_f<512, 32, 8> + + using ve = vectorlib::scalar>; + column* inCol = offset_column.get(); + + switch (target_format) + { +/* case GraphCompressionFormat::DELTA: + auto column = morph>, uncompr_f>(inCol); + std::cout << " values: " << column->get_count_values() + << " size in bytes: " << column->get_size_used_byte() + << " ?compressed bytes : " << column->get_size_compr_byte() << std::endl; + break; + case GraphCompressionFormat::FOR: + auto column = morph>, uncompr_f>(inCol); + std::cout << " values: " << column->get_count_values() + << " size in bytes: " << column->get_size_used_byte() + << " ?compressed bytes : " << column->get_size_compr_byte() << std::endl; + break; */ + case GraphCompressionFormat::RLE: { + auto column = morph(inCol); + std::cout << " values: " << column->get_count_values() + << " size in bytes: " << column->get_size_used_byte() + << " compression ratio: " << inCol->get_size_used_byte() / (double) column->get_size_used_byte() << std::endl; + break; + } + default: + throw std::runtime_error("Could not compress yet"); + break; + } + + //auto column = safe_morph(inCol); + + + // TODO: save them .. and correctly operate on the compressed column + // TODO: use normal morph (as vector extension is ignored) + this->current_compression = target_format; } // get size of storage format: @@ -175,6 +228,16 @@ namespace morphstore{ print_edge_by_id(edgeId); } } + + std::string get_column_info(const column *column) { + return " values: " + std::to_string(column->get_count_values()) + " size in bytes: " + std::to_string(column->get_size_used_byte()); + } + + void statistics() override { + Graph::statistics(); + std::cout << "offset column: " << get_column_info(offset_column.get()) << std::endl; + std::cout << "edgeId column: " << get_column_info(edgeId_column.get()) << std::endl; + } }; } #endif //MORPHSTORE_CSR_H diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 713e3133..ee5143c3 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -43,10 +43,33 @@ namespace morphstore{ + enum class GraphCompressionFormat {DELTA, RLE, FOR, UNCOMPRESSED}; + + std::string to_string(GraphCompressionFormat format) { + std::string desc; + + switch (format) { + case GraphCompressionFormat::DELTA: + desc = "Delta"; + break; + case GraphCompressionFormat::UNCOMPRESSED: + desc = "Uncompressed"; + break; + case GraphCompressionFormat::RLE: + desc = "Runtime length"; + break; + case GraphCompressionFormat::FOR: + desc = "Frame of Reference"; + break; + } + return desc; + } class Graph{ protected: + GraphCompressionFormat current_compression = GraphCompressionFormat::UNCOMPRESSED; + uint64_t expectedVertexCount; uint64_t expectedEdgeCount; @@ -73,7 +96,7 @@ namespace morphstore{ public: // -------------------- Setters & Getters -------------------- - Graph(VerticesContainerType vertices_container_type = VectorArrayContainer) { + Graph(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) { switch (vertices_container_type) { case VerticesContainerType::VectorArrayContainer: vertices = std::make_unique(); @@ -194,7 +217,7 @@ namespace morphstore{ virtual std::string get_storage_format() const = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; - virtual void compress() = 0; + virtual void compress(GraphCompressionFormat target_format) = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; @@ -248,6 +271,7 @@ namespace morphstore{ std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edge_properties.size() << std::endl; + std::cout << "Compression Format:" << to_string(current_compression) << std::endl; std::cout << "--------------------------------------------" << std::endl; } diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 299d725d..159d1d72 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -33,7 +33,7 @@ #include namespace morphstore{ - enum VerticesContainerType {HashMapContainer, VectorArrayContainer}; + enum class VerticesContainerType {HashMapContainer, VectorArrayContainer}; class VerticesContainer { protected: uint64_t currentMaxVertexId = 0; diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 0438b1f2..217e40c2 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -58,9 +58,9 @@ void simpleGraphFormatTest (void) { // (DEBUG) graph->statistics(); graph->print_edge_by_id(0); - graph->print_neighbors_of_vertex(v1); + graph->compress(morphstore::GraphCompressionFormat::RLE); graph->print_neighbors_of_vertex(v2); - graph->print_neighbors_of_vertex(v3); + graph->statistics(); assert(graph->getVertexCount() == 3); assert(graph->getEdgeCount() == 3); From dfa247bd1c52ec3db2e55c45edd2565c44a49378 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 10:51:53 +0200 Subject: [PATCH 133/216] Add valid flag for vertices and edges enable array containers to delete without direct reallocation --- include/core/storage/graph/edge/edge.h | 13 ++++++++++++- include/core/storage/graph/vertex/vertex.h | 10 ++++++++++ .../graph/vertex/vertices_vectorarray_container.h | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 1c763ef1..064a8b8b 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -42,6 +42,9 @@ namespace morphstore{ uint64_t sourceID, targetID, id; unsigned short int type; + // delete flag + bool valid = false; + uint64_t getNextEdgeId() const { static uint64_t currentMaxEdgeId = 0; return currentMaxEdgeId++; @@ -53,6 +56,7 @@ namespace morphstore{ this->targetID = targetId; this->type = type; this->id = getNextEdgeId(); + this->valid = true; } // this is needed for csr when doing edge_array[offset] = edge... @@ -65,6 +69,8 @@ namespace morphstore{ this->sourceID = edge.sourceID; this->targetID = edge.targetID; this->type = edge.type; + this->id = edge.id; + this->valid = edge.valid; // return the existing object so we can chain this operator return *this; @@ -88,6 +94,10 @@ namespace morphstore{ return type; } + bool isValid() const { + return valid; + } + // function for sorting algorithms in the ldbc-importer: // compare target-ids and return if it's "lower" (we need the sorting for the CSR) bool operator<(const Edge& e) const{ @@ -97,8 +107,9 @@ namespace morphstore{ // get size of edge object in bytes: static size_t size_in_bytes() { size_t size = 0; - size += sizeof(uint64_t) * 2; // source- and target-id + size += sizeof(uint64_t) * 3; // id, source- and target-id size += sizeof(unsigned short int); // type + size += sizeof(bool); // valid flag return size; } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index d2387f14..4788b657 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -41,6 +41,9 @@ namespace morphstore{ // optional: type, properties unsigned short int type; + // delete flag + bool valid = false; + public: // default constr. needed for VertexWithProperties(Vertex vertex, const std::unordered_map properties) // otherwise compiler won't accept @@ -49,6 +52,7 @@ namespace morphstore{ Vertex(uint64_t id, unsigned short int type){ this->id = id; this->type = type; + this->valid = true; } uint64_t getID() const { @@ -59,6 +63,10 @@ namespace morphstore{ return type; } + bool isValid() const { + return valid; + } + // this is needed when using VerticesVectorArrayContainer when doing vertex_array[offset] = vertex Vertex& operator= (const Vertex &vertex){ // self-assignment guard @@ -68,6 +76,7 @@ namespace morphstore{ // do the copy this->id = vertex.id; this->type = vertex.type; + this->valid = vertex.valid; // return the existing object so we can chain this operator return *this; @@ -78,6 +87,7 @@ namespace morphstore{ size_t size = 0; size += sizeof(uint64_t); // id size += sizeof(unsigned short int); // entity + size += sizeof(bool); // valid flag return size; } diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 59ee4af6..b1f12db9 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -107,7 +107,7 @@ namespace morphstore{ } bool exists_vertex(const uint64_t id) const override { - // assumes no deletion! + // assumes no deletion! else retrieve vertrex at position and check isValid() return number_of_vertices > id; } From cd7a177174879c94455344b17b9f5364d2c1fdb8 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 10:52:35 +0200 Subject: [PATCH 134/216] Reserve pointers to vertex arrays --- .../core/storage/graph/vertex/vertices_vectorarray_container.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index b1f12db9..1f19e777 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -89,8 +89,9 @@ namespace morphstore{ return "vector"; } - void allocate(const uint64_t numberVertices) { + void allocate(const uint64_t numberVertices) override { VerticesContainer::allocate(numberVertices); + this->vertices.reserve(number_of_vertices / vertices_per_array); current_array = allocate_vertex_array(); } From 32638e332e3c699fddf2629d45802b5eb517c10f Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 17:25:56 +0200 Subject: [PATCH 135/216] Add EdgesContainer similar to VerticesContainer --- include/core/storage/graph/edge/edge.h | 17 +- .../core/storage/graph/edge/edges_container.h | 154 ++++++++++++++++++ .../graph/edge/edges_hashmap_container.h | 82 ++++++++++ .../graph/edge/edges_vectorarray_container.h | 141 ++++++++++++++++ 4 files changed, 389 insertions(+), 5 deletions(-) create mode 100644 include/core/storage/graph/edge/edges_container.h create mode 100644 include/core/storage/graph/edge/edges_hashmap_container.h create mode 100644 include/core/storage/graph/edge/edges_vectorarray_container.h diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 064a8b8b..df49062e 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -46,11 +46,16 @@ namespace morphstore{ bool valid = false; uint64_t getNextEdgeId() const { + // Todo: enable resetting maxEdgeId + // Ideal would be to pull id gen to graph.h but this requires rewriting Ldbc importer to use (edge property setting depends on it) static uint64_t currentMaxEdgeId = 0; return currentMaxEdgeId++; } public: + // default constr. needed for EdgeWithProperties constructor + Edge(){} + Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type){ this->sourceID = sourceId; this->targetID = targetId; @@ -94,7 +99,7 @@ namespace morphstore{ return type; } - bool isValid() const { + bool isValid() const { return valid; } @@ -114,21 +119,23 @@ namespace morphstore{ } std::string to_string() const { - return "(id:" + std::to_string(this->id) + " ," + std::to_string(this->sourceID) + "->" + std::to_string(this->targetID) + ")"; + return "(id:" + std::to_string(this->id) + " ," + + std::to_string(this->sourceID) + "->" + std::to_string(this->targetID) + " ," + + "valid: " + std::to_string(this->valid) + ")"; } }; class EdgeWithProperties { private: - std::shared_ptr edge; + Edge edge; std::unordered_map properties; public: - EdgeWithProperties(std::shared_ptr edge, const std::unordered_map properties) { + EdgeWithProperties(Edge edge, const std::unordered_map properties) { this->edge = edge; this->properties = properties; } - std::shared_ptr getEdge() { + Edge getEdge() { return edge; } diff --git a/include/core/storage/graph/edge/edges_container.h b/include/core/storage/graph/edge/edges_container.h new file mode 100644 index 00000000..5133c2b5 --- /dev/null +++ b/include/core/storage/graph/edge/edges_container.h @@ -0,0 +1,154 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file edges_container.h + * @brief abstract class for storing edges + * @todo an EntityContainer abstraction (reduce duplicated code) +*/ + +#ifndef MORPHSTORE_EDGES_CONTAINER_H +#define MORPHSTORE_EDGES_CONTAINER_H + +#include "edge.h" +#include "../property_type.h" + +#include +#include +#include +#include + +namespace morphstore{ + enum class EdgesContainerType {HashMapContainer, VectorArrayContainer}; + + class EdgesContainer { + protected: + uint64_t expected_edge_count = 0; + + std::map edge_type_dictionary; + + // TODO: try other property storage formats than per vertex .. (triple-store or per property) + std::unordered_map> edge_properties; + + std::string get_edge_type(unsigned short int type) const { + if (edge_type_dictionary.find(type) != edge_type_dictionary.end()) { + return edge_type_dictionary.at(type); + } + else { + return "No Matching of type-number in the database! For type " + std::to_string(type); + } + } + + public: + virtual std::string container_description() const = 0; + virtual void insert_edge(Edge e) = 0; + virtual Edge get_edge(uint64_t id) = 0; + virtual bool exists_edge(const uint64_t id) const = 0; + virtual uint64_t edge_count() const = 0; + + + virtual void allocate(uint64_t expected_edges) { + edge_properties.reserve(expected_edges); + expected_edge_count += expected_edges; + } + + void add_edge(Edge edge) { + insert_edge(edge); + } + + bool has_properties(uint64_t id){ + return edge_properties.find(id) != edge_properties.end(); + } + + void add_property_to_edge(uint64_t id, const std::pair property) { + assert(exists_edge(id)); + edge_properties[id].insert(property); + }; + + void set_edge_properties(uint64_t id, const std::unordered_map properties) { + assert(exists_edge(id)); + + if (has_properties(id)) { + std::cout << "Overwritting existing properties for :"; + print_edge_by_id(id); + std::cout << std::endl; + } + + edge_properties[id] = properties; + }; + + void set_edge_type_dictionary(const std::map& types) { + assert(types.size() != 0); + this->edge_type_dictionary = types; + } + + const EdgeWithProperties get_edge_with_properties(uint64_t id) { + assert(exists_edge(id)); + return EdgeWithProperties(get_edge(id), edge_properties[id]); + } + + uint64_t edges_with_properties_count() { + return edge_properties.size(); + } + + virtual std::pair get_size() const { + size_t data_size = 0; + size_t index_size = 0; + + // lookup type dicts + index_size += 2 * sizeof(std::map); + for(auto& type_mapping : edge_type_dictionary){ + index_size += sizeof(unsigned short int); + index_size += sizeof(char)*(type_mapping.second.length()); + } + + // edge-properties: + index_size += sizeof(std::unordered_map>); + for (const auto &property_mapping : edge_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (const auto &property : property_mapping.second) { + data_size += sizeof(char) * property.first.length() + sizeof(property.second); + } + } + + return {index_size, data_size}; + } + + void print_type_dict(){ + std::cout << "EdgeType-Dict: " << std::endl; + for (auto const &entry : edge_type_dictionary) { + std::cout << entry.first << " -> " << entry.second << std::endl; + } + } + + void print_edge_by_id(const uint64_t id) { + std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; + EdgeWithProperties e = get_edge_with_properties(id); + std::cout << e.getEdge().to_string() << std::endl; + std::cout << "Type: " << this->get_edge_type(e.getEdge().getType()) << std::endl; + std::cout << "Properties: "; + for (const auto entry : e.getProperties()) { + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; + } + } + }; +} + +#endif //MORPHSTORE_EDGES_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/edge/edges_hashmap_container.h b/include/core/storage/graph/edge/edges_hashmap_container.h new file mode 100644 index 00000000..e35f2f52 --- /dev/null +++ b/include/core/storage/graph/edge/edges_hashmap_container.h @@ -0,0 +1,82 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file edges__hashmap_container.h + * @brief storing edges using a hashmap + * @todo an EntityHashMapContainer abstraction (reduce duplicated code) +*/ + +#ifndef MORPHSTORE_EDGES_HASHMAP_CONTAINER_H +#define MORPHSTORE_EDGES_HASHMAP_CONTAINER_H + +#include "edge.h" +#include "edges_container.h" + +#include +#include + +namespace morphstore{ + + class EdgesHashMapContainer : public EdgesContainer{ + protected: + std::unordered_map edges; + public: + std::string container_description() const override { + return "unordered_map"; + } + + void allocate(const uint64_t expected_edges) override { + EdgesContainer::allocate(expected_edges); + this->edges.reserve(expected_edges); + } + + void insert_edge(const Edge e) override { + edges[e.getId()] = e; + } + + bool exists_edge(const uint64_t id) const override { + if(edges.find(id) == edges.end()){ + return false; + } + return true; + } + + Edge get_edge(uint64_t id) override { + return edges[id]; + } + + uint64_t edge_count() const { + return edges.size(); + } + + std::pair get_size() const override { + auto [index_size, data_size] = EdgesContainer::get_size(); + + // container for indexes: + index_size += sizeof(std::unordered_map); + // index size of edge: size of id and sizeof pointer + index_size += edges.size() * sizeof(uint64_t); + data_size += edges.size() * Edge::size_in_bytes(); + + + return {index_size, data_size}; + } + }; +} + +#endif //MORPHSTORE_EDGES_HASHMAP_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h new file mode 100644 index 00000000..d08e2375 --- /dev/null +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -0,0 +1,141 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file edges__vectorarray_container.h + * @brief storing edges using a vector of arrays + * @todo +*/ + +#ifndef MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H +#define MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H + +#include "edge.h" +#include "edges_container.h" + +#include +#include +#include + +namespace morphstore{ + // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! + // and using std::array as aligned_alloc did not set invalid flag to false (could be solveable) + class EdgesVectorArrayContainer : public EdgesContainer{ + protected: + static const inline uint64_t edge_array_size = 4096; + static const inline uint64_t edges_per_array = edge_array_size / sizeof(Edge); + + using edge_array = std::array; + std::vector edges; + + uint64_t number_of_edges = 0; + + + edge_array allocate_edge_array() { + edge_array array; + edges.push_back(array); + //std::cout << " Added a page" << std::endl; + //std::cout.flush(); + + return array; + } + + inline uint64_t get_edge_array_number(uint64_t edge_id) const { + return edge_id / edges_per_array; + } + + inline uint64_t get_pos_in_array(uint64_t edge_id) const { + return edge_id % edges_per_array; + } + + public: + std::string container_description() const override { + return "vector>"; + } + + void allocate(const uint64_t expected_edges) override { + EdgesContainer::allocate(expected_edges); + + auto array_count = std::ceil(expected_edges / (float) edges_per_array); + this->edges.reserve(array_count); + + for(int i = 0; i < array_count; i++) { + allocate_edge_array(); + } + } + + void insert_edge(Edge e) { + auto array_number = get_edge_array_number(e.getId()); + auto array_pos = get_pos_in_array(e.getId()); + + if (array_number >= edges.size()) { + throw std::runtime_error("Exceeded edge id limit: Edge id " + + std::to_string(e.getId()) + " > " + + std::to_string(edges_per_array * edges.size())); + } + + /* if (edges.at(array_number)[array_pos].isValid()) { + throw std::runtime_error("Delete existing edge before overwriting it: edge-id " + e.to_string()); + } */ + + edges.at(array_number)[array_pos] = e; + number_of_edges++; + } + + bool exists_edge(const uint64_t id) const override { + uint64_t array_number = get_edge_array_number(id); + uint64_t pos_in_array = get_pos_in_array(id); + + if (array_number >= edges.size()) + return false; + + std::cout << "edge_count " << edge_count() + << " id " << id + << " edge :" << edges.at(array_number).at(pos_in_array).to_string() << std::endl; + + return edges.at(array_number)[pos_in_array].isValid(); + } + + Edge get_edge(uint64_t id) override { + uint64_t array_number = get_edge_array_number(id); + uint64_t pos_in_array = get_pos_in_array(id); + + assert (array_number < edges.size()); + + return edges.at(array_number)[pos_in_array]; + } + + uint64_t edge_count() const override { + return number_of_edges; + } + + std::pair get_size() const override { + auto [index_size, data_size] = EdgesContainer::get_size(); + + // vector count, current_array_offset + index_size += 2 * sizeof(uint64_t); + + index_size += sizeof(std::vector); + // allocated memory for edges + data_size += edges.size() * sizeof(edge_array); + + return {index_size, data_size}; + } + }; +} + +#endif //MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H \ No newline at end of file From 65c2737c79239cd867e5e1c1051e268d74bbfc1e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 17:27:30 +0200 Subject: [PATCH 136/216] Use EdgesContainer in graph formats --- .../storage/graph/formats/adjacencylist.h | 19 ++- include/core/storage/graph/formats/csr.h | 18 +- include/core/storage/graph/graph.h | 157 ++++++------------ include/core/storage/graph/ldbc_import.h | 2 +- 4 files changed, 75 insertions(+), 121 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 9a2d0f49..540d0816 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -38,6 +38,9 @@ namespace morphstore{ std::unordered_map>> adjacencylistPerVertex; public: + AdjacencyList(EdgesContainerType edges_container_type) + : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} + AdjacencyList(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} std::string get_storage_format() const override { @@ -61,7 +64,7 @@ namespace morphstore{ if (!vertices->exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } - + // TODO: remove shared pointer? std::shared_ptr> adjacencyList; if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { adjacencyList = adjacencylistPerVertex[sourceId]; @@ -70,14 +73,12 @@ namespace morphstore{ adjacencylistPerVertex[sourceId] = adjacencyList; } - for(const auto edge : edgesToAdd) { - edges[edge.getId()] = std::make_shared(edge); - if(vertices->exists_vertex(edge.getTargetId())) { - adjacencyList->push_back(edge.getId()); - } - else { + for (const auto edge : edgesToAdd) { + if (!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found :" + edge.to_string()); } + edges->add_edge(edge); + adjacencyList->push_back(edge.getId()); } } @@ -101,8 +102,8 @@ namespace morphstore{ if (entry != adjacencylistPerVertex.end()) { for(uint64_t const edgeId: *(entry->second)) { - assert(edges.find(edgeId) != edges.end()); - targetVertexIds.push_back(edges[edgeId]->getTargetId()); + assert(edges->exists_edge(edgeId)); + targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); } } diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 26e598b6..01869980 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -50,7 +50,11 @@ namespace morphstore{ std::unique_ptr> edgeId_column; public: - CSR(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} + CSR(EdgesContainerType edges_container_type) + : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} + + CSR(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) + : Graph(vertices_container_type) {} std::string get_storage_format() const override { return "CSR"; @@ -91,16 +95,14 @@ namespace morphstore{ } // fill the arrays - // TODO: fill array using memcpy? (put edgeIds into vector as prerpare step) + // TODO: fill array using memcpy? (put edgeIds into vector as prepare step) uint64_t* edgeId_data = edgeId_column->get_data(); for(const auto& edge : edgesToAdd){ - std::shared_ptr ePtr = std::make_shared(edge); if(!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found " + edge.to_string()); } - edges[ePtr->getId()] = ePtr; - - edgeId_data[offset] = ePtr->getId(); + edgeId_data[offset] = edge.getId(); + edges->add_edge(edge); ++offset; } @@ -148,8 +150,8 @@ namespace morphstore{ // resolving each edgeId for (auto edgeId: neighbourEdgeIds) { - assert(edges.find(edgeId) != edges.end()); - targetVertexIds.push_back(edges[edgeId]->getTargetId()); + assert(edges->exists_edge(edgeId)); + targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); } return targetVertexIds; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index ee5143c3..d58a2bfc 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -28,6 +28,8 @@ #include "vertex/vertices_hashmap_container.h" #include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" +#include "edge/edges_hashmap_container.h" +#include "edge/edges_vectorarray_container.h" #include "property_type.h" #include @@ -74,54 +76,51 @@ namespace morphstore{ uint64_t expectedEdgeCount; std::unique_ptr vertices; + std::unique_ptr edges; - // Todo: use a EdgesContainer for edges and edge_properties (very similar to vertices Container) - std::unordered_map> edges; - - std::unordered_map> edge_properties; - - - // Lookup for types: number to string - std::map edgeTypeDictionary; - + public: + Graph(EdgesContainerType edges_container_type) + : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} + + Graph(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer, + EdgesContainerType edges_container_type = EdgesContainerType::VectorArrayContainer) { + switch (vertices_container_type) { + case VerticesContainerType::VectorArrayContainer: + vertices = std::make_unique(); + break; + case VerticesContainerType::HashMapContainer: + vertices = std::make_unique(); + break; + } - // function to check if the edge-ID is present or not (exists) - bool exist_edgeId(const uint64_t id){ - if (edges.find(id) == edges.end()) { - return false; - } - return true; + switch (edges_container_type) { + case EdgesContainerType::VectorArrayContainer: + edges = std::make_unique(); + break; + case EdgesContainerType::HashMapContainer: + edges = std::make_unique(); + break; + } } - public: - // -------------------- Setters & Getters -------------------- - - Graph(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) { - switch (vertices_container_type) { - case VerticesContainerType::VectorArrayContainer: - vertices = std::make_unique(); - break; - case VerticesContainerType::HashMapContainer: - vertices = std::make_unique(); - break; + std::string vertices_container_description() { + return vertices->container_description(); } - } - std::string vertices_container_description() { - return vertices->container_description(); + std::string edges_container_description() { + return edges->container_description(); } + // -------------------- Setters & Getters -------------------- + void set_vertex_type_dictionary(const std::map& types) { assert(types.size() != 0); this->vertices->set_vertex_type_dictionary(types); } - const std::map &getRelationDictionary() const { - return edgeTypeDictionary; - } - - void setEdgeTypeDictionary(const std::map& rel) { - this->edgeTypeDictionary = rel; + void setEdgeTypeDictionary(const std::map& types) { + assert(types.size() != 0); + this->edges->set_edge_type_dictionary(types); } uint64_t getExpectedVertexCount() const { @@ -137,31 +136,21 @@ namespace morphstore{ } uint64_t getEdgeCount() const { - return edges.size(); + return edges->edge_count(); } - uint64_t add_vertex(const unsigned short int type, const std::unordered_map props = {}) { - assert(expectedVertexCount > getVertexCount()); + uint64_t add_vertex(const unsigned short int type = 0, const std::unordered_map props = {}) { return vertices->add_vertex(type, props); }; - std::string get_edgeType_by_number(unsigned short int type){ - if(edgeTypeDictionary.find( type ) != edgeTypeDictionary.end()){ - return edgeTypeDictionary.at(type); - }else{ - print_type_dicts(); - return std::to_string(type) + " not found in edge-type dictionary"; - } - } - // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id){ return vertices->get_vertex(id); } - // function which returns a pointer to vertex by id + // function which returns a pointer to edge by id EdgeWithProperties get_edge(uint64_t id){ - return EdgeWithProperties(edges[id], edge_properties[id]); + return edges->get_edge_with_properties(id); } // function to return a list of pair < vertex id, degree > DESC: @@ -208,15 +197,15 @@ namespace morphstore{ vertices->add_property_to_vertex(id, property); }; - void add_properties_to_edge(uint64_t id, const std::unordered_map properties) { - edge_properties[id] = properties; + void set_edge_properties(uint64_t id, const std::unordered_map properties) { + edges->set_edge_properties(id, properties); }; // -------------------- pure virtual functions -------------------- virtual std::string get_storage_format() const = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; + virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; virtual void compress(GraphCompressionFormat target_format) = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; @@ -225,39 +214,20 @@ namespace morphstore{ // including vertices + its properties + its type dict auto [index_size, data_size] = vertices->get_size(); - // lookup type dicts - for(const auto& rel : edgeTypeDictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(rel.second.length()); - } - - index_size += sizeof(std::unordered_map>); - // index size of edge: size of id and sizeof pointer - index_size += edges.size() * (sizeof(uint64_t) + sizeof(std::shared_ptr)); - data_size += edges.size() * Edge::size_in_bytes(); - - // TODO: extra propertymappings class - // edge-properties: - index_size += sizeof(std::unordered_map>); - for(const auto& property_mapping: edge_properties) { - index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - // properties of a single edge - for (const auto& property : property_mapping.second) { - data_size += sizeof(char) * property.first.length() + sizeof(property.second); - } - } + // including edges + its properties + its type dict + auto edges_size = edges->get_size(); + index_size += edges_size.first; + data_size += edges_size.second; return std::make_pair(index_size, data_size); }; - virtual void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) { - this->expectedVertexCount = numberVertices; - this->expectedEdgeCount = numberEdges; - - vertices->allocate(numberVertices); - - edges.reserve(numberEdges); - edge_properties.reserve(numberEdges); + virtual void allocate_graph_structure(uint64_t expected_vertices, uint64_t expected_edges) { + this->expectedVertexCount = expected_vertices; + this->expectedEdgeCount = expected_edges; + + vertices->allocate(expected_vertices); + edges->allocate(expected_edges); }; // -------------------- debugging functions -------------------- @@ -270,7 +240,7 @@ namespace morphstore{ std::cout << "Number of vertices: " << getVertexCount() << std::endl; std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; - std::cout << "Number of edges with properties:" << edge_properties.size() << std::endl; + std::cout << "Number of edges with properties:" << edges->edges_with_properties_count() << std::endl; std::cout << "Compression Format:" << to_string(current_compression) << std::endl; std::cout << "--------------------------------------------" << std::endl; } @@ -284,31 +254,12 @@ namespace morphstore{ } void print_edge_by_id(uint64_t id) { - std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; - std::shared_ptr edge = edges[id]; - std::cout << "Edge-ID: \t" << edge->getId() << std::endl; - std::cout << "Source-ID: \t" << edge->getSourceId() << std::endl; - std::cout << "Target-ID: \t" << edge->getTargetId() << std::endl; - std::cout << "Type: \t" << get_edgeType_by_number(edge->getType()) << std::endl; - std::cout << "\n"; - std::cout << "Properties: "; - for (const auto entry : edge_properties[id]) { - auto value = entry.second; - std::cout << "{" << entry.first << ": "; - std::visit(PropertyValueVisitor{}, value); - std::cout << "}"; - } - std::cout << "\n"; - std::cout << "-----------------------------------------------" << std::endl; + edges->print_edge_by_id(id); } void print_type_dicts(){ vertices->print_type_dict(); - - std::cout << "EdgeType-Dict: " << std::endl; - for(auto const& rel : edgeTypeDictionary){ - std::cout << rel.first << " -> " << rel.second << std::endl; - } + edges->print_type_dict(); } }; diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/ldbc_import.h index 1efb6533..8f53de69 100644 --- a/include/core/storage/graph/ldbc_import.h +++ b/include/core/storage/graph/ldbc_import.h @@ -641,7 +641,7 @@ namespace morphstore{ for(auto edge: edges) { auto entry = edgeProperties.find(edge.getId()); if (entry != edgeProperties.end()) { - graph.add_properties_to_edge(entry->first, entry->second); + graph.set_edge_properties(entry->first, entry->second); } } } From e6afcd915a3489c67dc51f200709a7a1175b950b Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 17:28:34 +0200 Subject: [PATCH 137/216] WIP edges_container_benchmark currently failing due to edge id generation --- include/core/storage/graph/vertex/vertex.h | 2 +- .../storage/graph/vertex/vertices_container.h | 11 +- .../graph/vertex/vertices_hashmap_container.h | 6 +- .../graph/edge_storage_benchmark.cpp | 128 ++++++++++++++++++ 4 files changed, 138 insertions(+), 9 deletions(-) create mode 100644 src/microbenchmarks/graph/edge_storage_benchmark.cpp diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 4788b657..cb702592 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -47,7 +47,7 @@ namespace morphstore{ public: // default constr. needed for VertexWithProperties(Vertex vertex, const std::unordered_map properties) // otherwise compiler won't accept - Vertex(){} + Vertex() {}; Vertex(uint64_t id, unsigned short int type){ this->id = id; diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 159d1d72..91481c56 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -37,7 +37,7 @@ namespace morphstore{ class VerticesContainer { protected: uint64_t currentMaxVertexId = 0; - + uint64_t expected_vertex_count = 0; std::map vertex_type_dictionary; // TODO: try other property storage formats than per node .. (triple-store or per property) @@ -65,11 +65,13 @@ namespace morphstore{ virtual uint64_t vertex_count() const = 0; - virtual void allocate(uint64_t numberVertices) { - vertex_properties.reserve(numberVertices); + virtual void allocate(uint64_t expected_vertices) { + vertex_properties.reserve(expected_vertices); + expected_vertex_count += expected_vertices; } uint64_t add_vertex(const unsigned short int type, const std::unordered_map properties = {}) { + assert(currentMaxVertexId < expected_vertex_count); Vertex v = Vertex(getNextVertexId(), type); insert_vertex(v); if (!properties.empty()) { @@ -110,7 +112,7 @@ namespace morphstore{ index_size += sizeof(char)*(type_mapping.second.length()); } - // vertex-properties: + // vertex-properties: index_size += sizeof(std::unordered_map>); for (const auto &property_mapping : vertex_properties) { index_size += sizeof(uint64_t) + sizeof(std::unordered_map); @@ -134,7 +136,6 @@ namespace morphstore{ VertexWithProperties v = get_vertex(id); std::cout << "Vertex-ID: \t" << v.getID() << std::endl; std::cout << "Type: \t" << get_vertex_type(v.getType()) << std::endl; - std::cout << "\n"; std::cout << "Properties: "; for (const auto entry : v.getProperties()) { auto value = entry.second; diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index 736e69b6..0a824497 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -45,9 +45,9 @@ namespace morphstore{ return "unordered_map"; } - void allocate(const uint64_t numberVertices) override { - VerticesContainer::allocate(numberVertices); - this->vertices.reserve(numberVertices); + void allocate(const uint64_t expected_vertices) override { + VerticesContainer::allocate(expected_vertices); + this->vertices.reserve(expected_vertices); } void insert_vertex(const Vertex v) override { diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp new file mode 100644 index 00000000..382d41cc --- /dev/null +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -0,0 +1,128 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file edge_storage_benchmark.cpp + * @brief A little mirco benchmark of the edge storage. + * @todo Fix edge id generation for benchmark to work + */ + +#include +#include +#include +#include +#include + + +typedef std::chrono::high_resolution_clock highResClock; +using namespace morphstore; + +int64_t get_duration(std::chrono::time_point start) { + auto stop = highResClock::now(); + return std::chrono::duration_cast(stop - start).count(); +} + +int64_t get_median(std::vector values) { + assert(values.size() > 0); + std::nth_element(values.begin(), values.begin() + values.size()/2, values.end()); + return values[values.size()/2]; +} + +int main(void) { + // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function + + int number_of_executions = 5; + + std::cout << "Test edge storage structure (median of 5 for full_iterate and random access)" << std::endl; + std::cout << "Container type | vertex_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; + + std::vector storage_types = { + EdgesContainerType::HashMapContainer, + EdgesContainerType::VectorArrayContainer + }; + + std::vector edge_counts = {10000, 100000, 1000000, 2000000, 5000000, 10000000, 15000000}; + + for (int edge_count: edge_counts) { + std::random_device rd; + std::uniform_int_distribution dist(0, edge_count - 1); + std::vector random_accesses; + for (int i = 0; i < edge_count; i++) { + random_accesses.push_back(dist(rd)); + } + + for (auto storage_type : storage_types) { + std::unique_ptr graph = std::make_unique(storage_type); + graph->allocate_graph_structure(1, edge_count); + + std::string measurement_entry = + graph->edges_container_description() + " | "; + measurement_entry += std::to_string(edge_count) + " | "; + + auto vertex_id = graph->add_vertex(0); + std::vector edges; + + for (int i = 0; i < edge_count; i++) { + edges.push_back(Edge(vertex_id, vertex_id, 0)); + } + + auto start = highResClock::now(); + graph->add_edges(vertex_id, edges); + // loading time + measurement_entry += std::to_string(get_duration(start)) + " | "; + + // size + auto [index_size, data_size] = graph->get_size_of_graph(); + measurement_entry += std::to_string(index_size + data_size) + " | "; + + + std::vector durations; + + // full iterate + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + // iterate + for (int i = 0; i < edge_count; i++) { + graph->get_edge(i); + } + durations.push_back(get_duration(start)); + } + + measurement_entry += std::to_string(get_median(durations)) + " | "; + + // random access + + durations.clear(); + + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + + for (int random_pos : random_accesses) { + graph->get_edge(random_pos); + } + + durations.push_back(get_duration(start)); + } + + measurement_entry += std::to_string(get_median(durations)); + + std::cout << measurement_entry << std::endl; + } + } + + return 0; +} From fc069660f1e6ca5f59a656755321a5ded18f4909 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 17:31:02 +0200 Subject: [PATCH 138/216] Align vertices_container naming --- include/core/storage/graph/graph.h | 2 +- .../storage/graph/vertex/vertices_container.h | 9 ++--- .../graph/vertex/vertices_hashmap_container.h | 8 ++-- .../vertex/vertices_vectorarray_container.h | 38 +++++++++++-------- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index d58a2bfc..f08ab243 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -145,7 +145,7 @@ namespace morphstore{ // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id){ - return vertices->get_vertex(id); + return vertices->get_vertex_with_properties(id); } // function which returns a pointer to edge by id diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 91481c56..d07d8362 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -43,8 +43,6 @@ namespace morphstore{ // TODO: try other property storage formats than per node .. (triple-store or per property) std::unordered_map> vertex_properties; - virtual Vertex get_vertex_without_properties(uint64_t id) = 0; - std::string get_vertex_type(unsigned short int type) const { if (vertex_type_dictionary.find(type) != vertex_type_dictionary.end()) { return vertex_type_dictionary.at(type); @@ -62,6 +60,7 @@ namespace morphstore{ virtual std::string container_description() const = 0; virtual void insert_vertex(Vertex v) = 0; virtual bool exists_vertex(const uint64_t id) const = 0; + virtual Vertex get_vertex(uint64_t id) = 0; virtual uint64_t vertex_count() const = 0; @@ -92,9 +91,9 @@ namespace morphstore{ } - const VertexWithProperties get_vertex(uint64_t id) { + const VertexWithProperties get_vertex_with_properties(uint64_t id) { assert(exists_vertex(id)); - return VertexWithProperties(get_vertex_without_properties(id), vertex_properties[id]); + return VertexWithProperties(get_vertex(id), vertex_properties[id]); } uint64_t vertices_with_properties_count() { @@ -133,7 +132,7 @@ namespace morphstore{ void print_vertex_by_id(const uint64_t id) { std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; - VertexWithProperties v = get_vertex(id); + VertexWithProperties v = get_vertex_with_properties(id); std::cout << "Vertex-ID: \t" << v.getID() << std::endl; std::cout << "Type: \t" << get_vertex_type(v.getType()) << std::endl; std::cout << "Properties: "; diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index 0a824497..aaea6787 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -36,10 +36,6 @@ namespace morphstore{ protected: std::unordered_map vertices; - Vertex get_vertex_without_properties(uint64_t id) override{ - return vertices[id]; - } - public: std::string container_description() const override { return "unordered_map"; @@ -54,6 +50,10 @@ namespace morphstore{ vertices[v.getID()] = v; } + Vertex get_vertex(uint64_t id) override { + return vertices[id]; + } + bool exists_vertex(const uint64_t id) const override { if(vertices.find(id) == vertices.end()){ return false; diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 1f19e777..b3c575dc 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -29,6 +29,7 @@ #include #include +#include namespace morphstore{ @@ -52,27 +53,18 @@ namespace morphstore{ vertices.push_back(array_pointer); //std::cout << " Added a page" << std::endl; //std::cout.flush(); + return array_pointer; } inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { - return vertex_id / vertex_array_size; + return vertex_id / vertices_per_array; } inline uint64_t get_pos_in_array(uint64_t vertex_id) const { return vertex_id % vertices_per_array; } - Vertex get_vertex_without_properties(uint64_t id) override { - uint64_t array_number = get_vertex_vector_number(id); - uint64_t pos_in_array = get_pos_in_array(id); - - //assert (pos_in_array < vertices_per_array); - //assert (array_number < vertices.size()); - - return vertices.at(array_number)[pos_in_array]; - } - public: // TODO: make array_size based on constructor //VerticesVectorArrayContainer(array_size) @@ -89,10 +81,12 @@ namespace morphstore{ return "vector"; } - void allocate(const uint64_t numberVertices) override { - VerticesContainer::allocate(numberVertices); - this->vertices.reserve(number_of_vertices / vertices_per_array); - current_array = allocate_vertex_array(); + void allocate(const uint64_t expected_vertices) override { + VerticesContainer::allocate(expected_vertices); + this->vertices.reserve(std::ceil(expected_vertices / (double) vertices_per_array)); + + if (current_array == nullptr) + current_array = allocate_vertex_array(); } void insert_vertex(Vertex v) { @@ -101,12 +95,23 @@ namespace morphstore{ current_array = allocate_vertex_array(); current_array_offset = 0; } - + // TODO: add check that there is no valid vertex stored there + // need to solve problem that aligned_alloc randomaly inits Vertices (ignores default values) current_array[current_array_offset] = v; current_array_offset++; number_of_vertices++; } + Vertex get_vertex(uint64_t id) override { + uint64_t array_number = get_vertex_vector_number(id); + uint64_t pos_in_array = get_pos_in_array(id); + + //assert (pos_in_array < vertices_per_array); + assert(array_number < vertices.size()); + + return vertices.at(array_number)[pos_in_array]; + } + bool exists_vertex(const uint64_t id) const override { // assumes no deletion! else retrieve vertrex at position and check isValid() return number_of_vertices > id; @@ -123,6 +128,7 @@ namespace morphstore{ index_size += 2 * sizeof(uint64_t); // current_array index_size += sizeof(Vertex*); + index_size += sizeof(std::vector); index_size += vertices.size() * sizeof(Vertex*); // allocated memory for vertices From 47fb825215be1c117d7fb44c2ad159435042e672 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 20:20:02 +0200 Subject: [PATCH 139/216] Fix edge ids by allowing explicit id only dirty fix .. edge id should be given by the graph --- include/core/storage/graph/edge/edge.h | 7 +++++-- src/microbenchmarks/graph/edge_storage_benchmark.cpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index df49062e..76d46f9f 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -56,11 +56,14 @@ namespace morphstore{ // default constr. needed for EdgeWithProperties constructor Edge(){} - Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type){ + Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) + : Edge(getNextEdgeId(), sourceId, targetId, type) {} + + Edge(uint64_t id, uint64_t sourceId, uint64_t targetId, unsigned short int type){ this->sourceID = sourceId; this->targetID = targetId; this->type = type; - this->id = getNextEdgeId(); + this->id = id; this->valid = true; } diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp index 382d41cc..f9b58dd0 100644 --- a/src/microbenchmarks/graph/edge_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -77,7 +77,7 @@ int main(void) { std::vector edges; for (int i = 0; i < edge_count; i++) { - edges.push_back(Edge(vertex_id, vertex_id, 0)); + edges.push_back(Edge(i, vertex_id, vertex_id, 0)); } auto start = highResClock::now(); From e90bad1279c758feca153cff464989ab8adae5cc Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 27 Apr 2020 20:21:00 +0200 Subject: [PATCH 140/216] Add minor improvements --- include/core/storage/graph/edge/edge.h | 1 + .../core/storage/graph/edge/edges_vectorarray_container.h | 6 +----- include/core/storage/graph/vertex/vertex.h | 2 +- src/microbenchmarks/graph/edge_storage_benchmark.cpp | 2 +- src/microbenchmarks/graph/vertex_storage_benchmark.cpp | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 76d46f9f..fab6c9f9 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -43,6 +43,7 @@ namespace morphstore{ unsigned short int type; // delete flag + // TODO put as a std::bitset in vectorarray_container bool valid = false; uint64_t getNextEdgeId() const { diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index d08e2375..0b30ed3d 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -85,7 +85,7 @@ namespace morphstore{ if (array_number >= edges.size()) { throw std::runtime_error("Exceeded edge id limit: Edge id " + std::to_string(e.getId()) + " > " + - std::to_string(edges_per_array * edges.size())); + std::to_string(edges_per_array * edges.size() - 1)); } /* if (edges.at(array_number)[array_pos].isValid()) { @@ -103,10 +103,6 @@ namespace morphstore{ if (array_number >= edges.size()) return false; - std::cout << "edge_count " << edge_count() - << " id " << id - << " edge :" << edges.at(array_number).at(pos_in_array).to_string() << std::endl; - return edges.at(array_number)[pos_in_array].isValid(); } diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index cb702592..e08c2f31 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -49,7 +49,7 @@ namespace morphstore{ // otherwise compiler won't accept Vertex() {}; - Vertex(uint64_t id, unsigned short int type){ + Vertex(uint64_t id, unsigned short int type = 0){ this->id = id; this->type = type; this->valid = true; diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp index f9b58dd0..ff9444f1 100644 --- a/src/microbenchmarks/graph/edge_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -48,7 +48,7 @@ int main(void) { int number_of_executions = 5; std::cout << "Test edge storage structure (median of 5 for full_iterate and random access)" << std::endl; - std::cout << "Container type | vertex_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; + std::cout << "Container type | edge_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the edge count in μs" << std::endl; std::vector storage_types = { EdgesContainerType::HashMapContainer, diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 0879f850..c0ae75d8 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -74,7 +74,7 @@ int main(void) { auto start = highResClock::now(); for (int i = 0; i < vertex_count; i++) { - graph->add_vertex(i); + graph->add_vertex(); } // loading time measurement_entry += std::to_string(get_duration(start)) + " | "; From 6325c1cec9ad52d308a59aa8f69f6ce870f81816 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 28 Apr 2020 22:50:13 +0200 Subject: [PATCH 141/216] Try out DELTA and FOR compression for CSR format ! not yet saving the compressed columns --- include/core/storage/graph/formats/csr.h | 42 ++++++++++--------- .../core/storage/graph/ldbc/ldbc_graph_test.h | 2 + 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 01869980..ca563e0e 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include @@ -170,40 +170,44 @@ namespace morphstore{ // example layout: dynamic_vbp_f<512, 32, 8> using ve = vectorlib::scalar>; - column* inCol = offset_column.get(); switch (target_format) { -/* case GraphCompressionFormat::DELTA: - auto column = morph>, uncompr_f>(inCol); - std::cout << " values: " << column->get_count_values() - << " size in bytes: " << column->get_size_used_byte() - << " ?compressed bytes : " << column->get_size_compr_byte() << std::endl; + case GraphCompressionFormat::DELTA: { + auto compressed_offset_col = morph(offset_column.get()); + auto compressed_edge_col = morph(edgeId_column.get()); + std::cout << " offset col compression ratio: " + << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl + << " edgeId col compression ratio: " + << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; break; - case GraphCompressionFormat::FOR: - auto column = morph>, uncompr_f>(inCol); - std::cout << " values: " << column->get_count_values() - << " size in bytes: " << column->get_size_used_byte() - << " ?compressed bytes : " << column->get_size_compr_byte() << std::endl; - break; */ + } + case GraphCompressionFormat::FOR: { + auto compressed_offset_col = morph(offset_column.get()); + auto compressed_edge_col = morph(edgeId_column.get()); + std::cout << " offset col compression ratio: " + << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl + << " edgeId col compression ratio: " + << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; + break; + } + // RLE never really finished .. do not use for now case GraphCompressionFormat::RLE: { - auto column = morph(inCol); + throw std::runtime_error("`Never really completed RLE implementation`"); + auto column = morph(offset_column.get()); std::cout << " values: " << column->get_count_values() << " size in bytes: " << column->get_size_used_byte() - << " compression ratio: " << inCol->get_size_used_byte() / (double) column->get_size_used_byte() << std::endl; + << " compression ratio: " << offset_column->get_size_used_byte() / (double) column->get_size_used_byte() << std::endl; break; } default: throw std::runtime_error("Could not compress yet"); break; } - - //auto column = safe_morph(inCol); // TODO: save them .. and correctly operate on the compressed column - // TODO: use normal morph (as vector extension is ignored) - this->current_compression = target_format; + //this->current_compression = target_format; } // get size of storage format: diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index e4e6edf8..015e3456 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -61,6 +61,8 @@ void ldbcGraphFormatTest (void) { // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); + graph->compress(morphstore::GraphCompressionFormat::FOR); + // measure degree distribution and write to file (file path as parameter): graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); From 8fbeb1c81365e8fd91adf8ead4ae3ffd083eb860 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 4 May 2020 11:17:52 +0200 Subject: [PATCH 142/216] WIP Introduce a template free column_base class --- include/core/storage/column.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/include/core/storage/column.h b/include/core/storage/column.h index e590595f..2ec17f0c 100644 --- a/include/core/storage/column.h +++ b/include/core/storage/column.h @@ -42,8 +42,31 @@ enum class storage_persistence_type { queryScope }; +// template-free base class +// use-case: graph formats can change their column format at run-time via `compress(Format f)` +// TODO: currently not useable as template argument deduction/substitution fails when f.i. morphing on column_base +class column_base { + public: + // todo: find a way to specify `inline` + virtual voidptr_t get_data( void ) const = 0; + virtual size_t get_count_values( void ) const = 0; + virtual void set_count_values( size_t p_CountValues ) = 0; + virtual size_t get_size_used_byte( void ) const = 0; + virtual void set_size_used_byte( size_t p_SizeUsedByte ) = 0; + virtual size_t get_size_compr_byte( void ) const = 0; + virtual void set_size_compr_byte( size_t p_SizeComprByte ) = 0; + virtual void set_meta_data( size_t p_CountValues, size_t p_SizeUsedByte, size_t p_SizeComprByte ) = 0; + virtual void set_meta_data( size_t p_CountValues, size_t p_SizeUsedByte) = 0; + + virtual const voidptr_t get_data_uncompr_start() const = 0; + virtual size_t get_count_values_uncompr() const = 0; + virtual size_t get_count_values_compr() const = 0; + // this is a template-method and cannot be defined here? + //virtual bool prepare_for_random_access() const = 0; +}; + template< class F > -class column { +class column : public column_base { static_assert( std::is_base_of< format, F >::value, "column: template parameter F must be a subclass of format" From 623af9546635858b1457cc2a14bb3944d3381e5e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 4 May 2020 14:45:48 +0200 Subject: [PATCH 143/216] Use column_base in CSR * solved compiler error: "template argument deduction/substitution failed" via own morph_graph_col * removed RLE graph compression --- include/core/storage/column.h | 1 - include/core/storage/graph/formats/csr.h | 83 ++++-------- include/core/storage/graph/graph.h | 22 +--- .../core/storage/graph/graph_compr_format.h | 118 ++++++++++++++++++ .../storage/graph/simple/simple_graph_test.h | 2 +- 5 files changed, 146 insertions(+), 80 deletions(-) create mode 100644 include/core/storage/graph/graph_compr_format.h diff --git a/include/core/storage/column.h b/include/core/storage/column.h index 2ec17f0c..57449b87 100644 --- a/include/core/storage/column.h +++ b/include/core/storage/column.h @@ -44,7 +44,6 @@ enum class storage_persistence_type { // template-free base class // use-case: graph formats can change their column format at run-time via `compress(Format f)` -// TODO: currently not useable as template argument deduction/substitution fails when f.i. morphing on column_base class column_base { public: // todo: find a way to specify `inline` diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index ca563e0e..5ce5765e 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -24,15 +24,7 @@ #ifndef MORPHSTORE_CSR_H #define MORPHSTORE_CSR_H -#include "../graph.h" -#include "../vertex/vertex.h" -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -46,8 +38,8 @@ namespace morphstore{ * offset column: index is vertex-id; column entry contains offset in edgeId array * edgeId column: contains edge id */ - std::unique_ptr> offset_column; - std::unique_ptr> edgeId_column; + std::unique_ptr offset_column; + std::unique_ptr edgeId_column; public: CSR(EdgesContainerType edges_container_type) @@ -61,6 +53,7 @@ namespace morphstore{ } // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays + // TODO: test that no data exists before (as this will get overwritten) void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); @@ -86,6 +79,9 @@ namespace morphstore{ // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); + // currently only read-only after compression (TODO allow writes on compressed data) + assert(current_compression == GraphCompressionFormat::UNCOMPRESSED); + uint64_t* offset_data = offset_column->get_data(); uint64_t offset = offset_data[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); @@ -114,7 +110,10 @@ namespace morphstore{ // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) override { - uint64_t* offset_data = offset_column->get_data(); + // decompressing offset_column in order to read correct offset + // TODO: only decompress part as only offset_column[id] and offset_column[id+1] will be read + uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); + uint64_t offset = offset_data[id]; // special case: last vertex id has no next offset uint64_t nextOffset; @@ -134,14 +133,14 @@ namespace morphstore{ // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { std::vector neighbourEdgeIds; - uint64_t* offset_data = offset_column->get_data(); + uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); // avoiding out of bounds ... // TODO: use assert here, as this is only out of bounds if the offset if( offset < getExpectedEdgeCount()){ - uint64_t* edgeId_data = edgeId_column->get_data(); + uint64_t* edgeId_data = decompress_graph_col(edgeId_column.get(), current_compression)->get_data(); neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_data+offset, edgeId_data+offset+numberEdges); } @@ -158,53 +157,21 @@ namespace morphstore{ } void compress(GraphCompressionFormat target_format) override { - std::cout << "Compressing graph format specific data structures via " << to_string(target_format) << std::endl; + std::cout << "Morphing graph format specific data structures from " << to_string(current_compression) << " to " << to_string(target_format) << std::endl; if (current_compression == target_format) { std::cout << "Already in " << to_string(target_format); return; } - - // TODO: allow also other vector extensions (switch from safe_morph to morph) - // example layout: dynamic_vbp_f<512, 32, 8> - - using ve = vectorlib::scalar>; - - switch (target_format) - { - case GraphCompressionFormat::DELTA: { - auto compressed_offset_col = morph(offset_column.get()); - auto compressed_edge_col = morph(edgeId_column.get()); - std::cout << " offset col compression ratio: " - << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl - << " edgeId col compression ratio: " - << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; - break; - } - case GraphCompressionFormat::FOR: { - auto compressed_offset_col = morph(offset_column.get()); - auto compressed_edge_col = morph(edgeId_column.get()); - std::cout << " offset col compression ratio: " - << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl - << " edgeId col compression ratio: " - << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; - break; - } - // RLE never really finished .. do not use for now - case GraphCompressionFormat::RLE: { - throw std::runtime_error("`Never really completed RLE implementation`"); - auto column = morph(offset_column.get()); - std::cout << " values: " << column->get_count_values() - << " size in bytes: " << column->get_size_used_byte() - << " compression ratio: " << offset_column->get_size_used_byte() / (double) column->get_size_used_byte() << std::endl; - break; - } - default: - throw std::runtime_error("Could not compress yet"); - break; - } + const column_base* compressed_offset_col = morph_graph_col(offset_column.get(), current_compression, target_format); + const column_base* compressed_edge_col = morph_graph_col(edgeId_column.get(), current_compression, target_format); + + std::cout << " offset col compression ratio: " + << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl + << " edgeId col compression ratio: " + << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; // TODO: save them .. and correctly operate on the compressed column //this->current_compression = target_format; @@ -222,20 +189,22 @@ namespace morphstore{ } // for debugging: + // TODO: simply by using a get_outgoing_edges(id) method void print_neighbors_of_vertex(uint64_t id) override{ std::cout << "Neighbours for Vertex with id " << id << std::endl; - uint64_t* offset_data = offset_column->get_data(); + + uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); - uint64_t* edgeId_data = edgeId_column->get_data(); + uint64_t* edgeId_data = decompress_graph_col(edgeId_column.get(), current_compression)->get_data(); for(uint64_t i = offset; i < offset+numberEdges; ++i){ uint64_t edgeId = edgeId_data[i]; print_edge_by_id(edgeId); } } - std::string get_column_info(const column *column) { + std::string get_column_info(const column_base *column) { return " values: " + std::to_string(column->get_count_values()) + " size in bytes: " + std::to_string(column->get_size_used_byte()); } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index f08ab243..8957e88a 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -31,6 +31,7 @@ #include "edge/edges_hashmap_container.h" #include "edge/edges_vectorarray_container.h" #include "property_type.h" +#include #include #include @@ -45,28 +46,7 @@ namespace morphstore{ - enum class GraphCompressionFormat {DELTA, RLE, FOR, UNCOMPRESSED}; - - std::string to_string(GraphCompressionFormat format) { - std::string desc; - - switch (format) { - case GraphCompressionFormat::DELTA: - desc = "Delta"; - break; - case GraphCompressionFormat::UNCOMPRESSED: - desc = "Uncompressed"; - break; - case GraphCompressionFormat::RLE: - desc = "Runtime length"; - break; - case GraphCompressionFormat::FOR: - desc = "Frame of Reference"; - break; - } - return desc; - } class Graph{ protected: diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h new file mode 100644 index 00000000..be2217af --- /dev/null +++ b/include/core/storage/graph/graph_compr_format.h @@ -0,0 +1,118 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph_compr_format.h + * @brief helper for specifying compression of graph format specific columns + * @todo +*/ + +#ifndef MORPHSTORE_GRAPH_COMPR_FORMAT_H +#define MORPHSTORE_GRAPH_COMPR_FORMAT_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace morphstore{ + // TODO: allow also other vector extensions (switch from safe_morph to morph) + // example layout: dynamic_vbp_f<512, 32, 8> + using ve = vectorlib::scalar>; + + // TODO use column_base (currently not working as template argument deduction/substitution fails) + using column_uncompr = column; + using column_delta = column; + using column_for = column; + + enum class GraphCompressionFormat {DELTA, FOR, UNCOMPRESSED}; + + std::string to_string(GraphCompressionFormat format) { + std::string desc; + + switch (format) { + case GraphCompressionFormat::DELTA: + desc = "Delta"; + break; + case GraphCompressionFormat::UNCOMPRESSED: + desc = "Uncompressed"; + break; + case GraphCompressionFormat::FOR: + desc = "Frame of Reference"; + break; + } + + return desc; + } + + // casting the column to the actual column type before morphing (as compiler could not derive it) + const column_base* morph_graph_col(const column_base* column, const GraphCompressionFormat src_f, const GraphCompressionFormat trg_f) { + if (src_f == trg_f) { + return column; + } + + switch (src_f) { + case GraphCompressionFormat::UNCOMPRESSED: { + const column_uncompr *old_col = dynamic_cast(column); + switch (trg_f) { + case GraphCompressionFormat::DELTA: + return morph(old_col); + break; + case GraphCompressionFormat::FOR: + return morph(old_col); + break; + case GraphCompressionFormat::UNCOMPRESSED: + return old_col; + break; + } + break; + } + + // as direct morphing is not yet supported .. go via decompressing first + case GraphCompressionFormat::DELTA: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_delta *old_col = dynamic_cast(column); + return morph(old_col); + } + return morph_graph_col(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED), GraphCompressionFormat::UNCOMPRESSED, trg_f); + break; + } + case GraphCompressionFormat::FOR: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_for *old_col = dynamic_cast(column); + return morph(old_col); + } + return morph_graph_col(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED), GraphCompressionFormat::UNCOMPRESSED, trg_f); + break; + } + } + + throw std::runtime_error("Did not handle src: " + to_string(src_f) + " trg: " + to_string(trg_f)); + } + + const column_uncompr* decompress_graph_col(const column_base* column, const GraphCompressionFormat src_f) { + return static_cast(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED)); + } +} + +#endif //MORPHSTORE_GRAPH_COMPR_FORMAT_H \ No newline at end of file diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 217e40c2..161e2f05 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -58,7 +58,7 @@ void simpleGraphFormatTest (void) { // (DEBUG) graph->statistics(); graph->print_edge_by_id(0); - graph->compress(morphstore::GraphCompressionFormat::RLE); + graph->compress(morphstore::GraphCompressionFormat::DELTA); graph->print_neighbors_of_vertex(v2); graph->statistics(); From 6419e0211093f2289c1648061083ecab40f4a78d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 4 May 2020 14:46:00 +0200 Subject: [PATCH 144/216] Fix simple graph test --- test/core/storage/graph/simple/simple_graph_test.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 161e2f05..76cc17f9 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -52,7 +52,7 @@ void simpleGraphFormatTest (void) { auto e1 = morphstore::Edge(v1, v2, 1); graph->add_edges(v1, {e1}); - graph->add_properties_to_edge(e1.getId(), {{"rating", 42}, {"description", "has the answer to everything"}}); + graph->set_edge_properties(e1.getId(), {{"rating", 42}, {"description", "has the answer to everything"}}); graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); // (DEBUG) From cf621a2b04dd1e77efbb8b55ae107fcca6fadb18 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 4 May 2020 16:34:26 +0200 Subject: [PATCH 145/216] Enable CSR to work on compressed columns --- include/core/storage/graph/formats/csr.h | 44 +++++++++++-------- .../core/storage/graph/graph_compr_format.h | 4 ++ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 5ce5765e..e395cae2 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -38,10 +38,15 @@ namespace morphstore{ * offset column: index is vertex-id; column entry contains offset in edgeId array * edgeId column: contains edge id */ - std::unique_ptr offset_column; - std::unique_ptr edgeId_column; + column_base *offset_column; + column_base *edgeId_column; public: + ~CSR() { + free(offset_column); + free(edgeId_column); + } + CSR(EdgesContainerType edges_container_type) : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} @@ -58,11 +63,11 @@ namespace morphstore{ Graph::allocate_graph_structure(numberVertices, numberEdges); const size_t offset_size = numberVertices * sizeof(uint64_t); - offset_column = std::make_unique>(offset_size); + offset_column = new column(offset_size); offset_column->set_meta_data(numberVertices, offset_size); const size_t edge_ids_size = numberEdges * sizeof(uint64_t); - edgeId_column = std::make_unique>(edge_ids_size); + edgeId_column = new column(edge_ids_size); edgeId_column->set_meta_data(numberEdges, edge_ids_size); // init node array: @@ -79,8 +84,10 @@ namespace morphstore{ // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); - // currently only read-only after compression (TODO allow writes on compressed data) - assert(current_compression == GraphCompressionFormat::UNCOMPRESSED); + // currently only read-only if compressed + if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { + throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + to_string(current_compression)); + } uint64_t* offset_data = offset_column->get_data(); uint64_t offset = offset_data[sourceID]; @@ -112,7 +119,7 @@ namespace morphstore{ uint64_t get_out_degree(uint64_t id) override { // decompressing offset_column in order to read correct offset // TODO: only decompress part as only offset_column[id] and offset_column[id+1] will be read - uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); + uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); uint64_t offset = offset_data[id]; // special case: last vertex id has no next offset @@ -133,14 +140,14 @@ namespace morphstore{ // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { std::vector neighbourEdgeIds; - uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); + uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); // avoiding out of bounds ... // TODO: use assert here, as this is only out of bounds if the offset if( offset < getExpectedEdgeCount()){ - uint64_t* edgeId_data = decompress_graph_col(edgeId_column.get(), current_compression)->get_data(); + uint64_t* edgeId_data = decompress_graph_col(edgeId_column, current_compression)->get_data(); neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_data+offset, edgeId_data+offset+numberEdges); } @@ -165,16 +172,15 @@ namespace morphstore{ return; } - const column_base* compressed_offset_col = morph_graph_col(offset_column.get(), current_compression, target_format); - const column_base* compressed_edge_col = morph_graph_col(edgeId_column.get(), current_compression, target_format); + offset_column = const_cast(morph_graph_col(offset_column, current_compression, target_format)); + edgeId_column = const_cast(morph_graph_col(edgeId_column, current_compression, target_format)); std::cout << " offset col compression ratio: " - << offset_column->get_size_used_byte() / (double)compressed_offset_col->get_size_used_byte() << std::endl + << compression_ratio(offset_column, target_format) << std::endl << " edgeId col compression ratio: " - << edgeId_column->get_size_used_byte() / (double)compressed_edge_col->get_size_used_byte() << std::endl; + << compression_ratio(edgeId_column, target_format) << std::endl; - // TODO: save them .. and correctly operate on the compressed column - //this->current_compression = target_format; + this->current_compression = target_format; } // get size of storage format: @@ -193,11 +199,11 @@ namespace morphstore{ void print_neighbors_of_vertex(uint64_t id) override{ std::cout << "Neighbours for Vertex with id " << id << std::endl; - uint64_t* offset_data = decompress_graph_col(offset_column.get(), current_compression)->get_data(); + uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); uint64_t offset = offset_data[id]; uint64_t numberEdges = get_out_degree(id); - uint64_t* edgeId_data = decompress_graph_col(edgeId_column.get(), current_compression)->get_data(); + uint64_t* edgeId_data = decompress_graph_col(edgeId_column, current_compression)->get_data(); for(uint64_t i = offset; i < offset+numberEdges; ++i){ uint64_t edgeId = edgeId_data[i]; print_edge_by_id(edgeId); @@ -210,8 +216,8 @@ namespace morphstore{ void statistics() override { Graph::statistics(); - std::cout << "offset column: " << get_column_info(offset_column.get()) << std::endl; - std::cout << "edgeId column: " << get_column_info(edgeId_column.get()) << std::endl; + std::cout << "offset column: " << get_column_info(offset_column) << std::endl; + std::cout << "edgeId column: " << get_column_info(edgeId_column) << std::endl; } }; } diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index be2217af..d5f9ca0a 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -113,6 +113,10 @@ namespace morphstore{ const column_uncompr* decompress_graph_col(const column_base* column, const GraphCompressionFormat src_f) { return static_cast(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED)); } + + double compression_ratio(const column_base* column, GraphCompressionFormat col_format) { + return decompress_graph_col(column, col_format)->get_size_used_byte() / (double) column->get_size_used_byte(); + } } #endif //MORPHSTORE_GRAPH_COMPR_FORMAT_H \ No newline at end of file From 113c1d6f3961fe2a053efbb9b40d1170bf8eb788 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 4 May 2020 21:37:39 +0200 Subject: [PATCH 146/216] Prepare adjacency list format for compression introducing variant --- .../storage/graph/formats/adjacencylist.h | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 540d0816..cfbf7033 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -24,20 +24,56 @@ #ifndef MORPHSTORE_ADJACENCYLIST_H #define MORPHSTORE_ADJACENCYLIST_H -#include "../graph.h" -#include "../vertex/vertex.h" +#include #include #include +#include +#include namespace morphstore{ class AdjacencyList: public Graph { private: - std::unordered_map>> adjacencylistPerVertex; + using adjacency_column = column_base*; + using adjacency_vector = std::vector*; + using adjacency_list_variant = std::variant; + struct Adjacency_List_Size_Visitor { + size_t operator()(const adjacency_column c) const { + return c->get_size_used_byte(); + } + size_t operator()(const adjacency_vector v) const { + return v->size(); + } + }; + + // maps the outgoing edges (ids) per vertex + std::unordered_map adjacencylistPerVertex; + + // indicating whether we have columns or vectors (columns after first compress() call) + // TODO: is this replace-able by just checking the type of the first element in the map? (via holds_alternative) + bool finalized = false; + + // convert every adjVector to a adjColumn + void finalize() { + if (!finalized) { + // use std::transform + } + } public: + ~AdjacencyList() { + for(auto entry: this->adjacencylistPerVertex) { + if (finalized) { + free(std::get(entry.second)); + } + else { + free(std::get(entry.second)); + } + } + } + AdjacencyList(EdgesContainerType edges_container_type) : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} From 35e81d4dbe2488c6df46e8ec6aa38e5191903bd7 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 12:39:23 +0200 Subject: [PATCH 147/216] Convert adjacency vectors into colums aka finalize getting read-only as inserts would be to expensive on fixed sized columns --- include/core/storage/column_gen.h | 9 +- .../storage/graph/formats/adjacencylist.h | 115 ++++++++++++++---- 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/include/core/storage/column_gen.h b/include/core/storage/column_gen.h index f7e51bec..5dd2c154 100644 --- a/include/core/storage/column_gen.h +++ b/include/core/storage/column_gen.h @@ -47,12 +47,13 @@ namespace morphstore { * elements. * * @param vec The vector to initialize the column with. + * @param sudo Overrule limit of 20 * @return An uncompressed column containing a copy of the data in the given * vector. */ -const column * make_column(const std::vector & vec) { +const column * make_column(const std::vector & vec, bool sudo = false) { const size_t count = vec.size(); - if(count > 20) + if(count > 20 && !sudo) throw std::runtime_error( "make_column() is an inefficient convenience function and " "should only be used for very small columns" @@ -64,8 +65,8 @@ const column * make_column(const std::vector & vec) { return resCol; } -const column * make_column(uint64_t const * const vec, size_t count) { - if(count > 400) +const column * make_column(uint64_t const * const vec, size_t count, bool sudo = false) { + if(count > 400 && !sudo) throw std::runtime_error( "make_column() is an inefficient convenience function and " "should only be used for very small columns" diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index cfbf7033..9d7885f4 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -25,6 +25,8 @@ #define MORPHSTORE_ADJACENCYLIST_H #include +#include +#include #include #include @@ -49,6 +51,16 @@ namespace morphstore{ } }; + struct Adjacency_List_Finalizer { + adjacency_column operator()(const adjacency_column c) const { + return c; + } + adjacency_column operator()(const adjacency_vector v) const { + // const_cast to be able to assign colum to entry value + return const_cast(make_column(v->data(), v->size(), true)); + } + }; + // maps the outgoing edges (ids) per vertex std::unordered_map adjacencylistPerVertex; @@ -58,8 +70,16 @@ namespace morphstore{ // convert every adjVector to a adjColumn void finalize() { - if (!finalized) { - // use std::transform + if (!finalized) { + std::unordered_map adjacencyColumnPerVertex; + adjacencyColumnPerVertex.reserve(adjacencylistPerVertex.size()); + + for(auto entry: adjacencylistPerVertex) { + adjacencyColumnPerVertex.insert({entry.first, std::visit(Adjacency_List_Finalizer{}, entry.second)}); + } + + this->adjacencylistPerVertex = adjacencyColumnPerVertex; + this->finalized = true; } } public: @@ -97,18 +117,29 @@ namespace morphstore{ // function that adds multiple edges (list of neighbors) at once to vertex void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { + if (finalized) { + throw std::runtime_error("Cannot add edges, if adj. lists are compressed"); + } + if (!vertices->exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } - // TODO: remove shared pointer? - std::shared_ptr> adjacencyList; + + // avoid inserting an empty adjacencyList (waste of memory) + if (edgesToAdd.size() == 0) { + return ; + } + + std::vector *adjacencyList; if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { - adjacencyList = adjacencylistPerVertex[sourceId]; + adjacencyList = std::get(adjacencylistPerVertex[sourceId]); } else { - adjacencyList = std::make_shared>(); - adjacencylistPerVertex[sourceId] = adjacencyList; + adjacencyList = new std::vector(); + adjacencylistPerVertex.insert({sourceId, adjacencyList}); } + adjacencyList->reserve(edgesToAdd.size()); + for (const auto edge : edgesToAdd) { if (!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found :" + edge.to_string()); @@ -126,28 +157,57 @@ namespace morphstore{ return 0; } else { - return entry->second->size(); + uint64_t out_degree; + if (finalized) { + out_degree = std::get(entry->second)->get_count_values(); + } + else { + out_degree = std::get(entry->second)->size(); + } + return out_degree; } } - // get the neighbors-ids into vector for BFS alg. - std::vector get_neighbors_ids(uint64_t id) override { - std::vector targetVertexIds = std::vector(); - + std::vector get_outgoing_edge_ids(uint64_t id) { + std::vector edge_ids; auto entry = adjacencylistPerVertex.find(id); - if (entry != adjacencylistPerVertex.end()) { - for(uint64_t const edgeId: *(entry->second)) { - assert(edges->exists_edge(edgeId)); - targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); + if (this->finalized) { + adjacency_column col = std::get(entry->second); + const size_t column_size = col->get_count_values(); + // TODO: init vector via range-constructor / mem-cpy + //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; + const uint64_t * start_addr = col->get_data(); + edge_ids.insert(edge_ids.end(), start_addr, start_addr+column_size); + } else { + edge_ids = *std::get(entry->second); } } - + return edge_ids; + } + + // get the neighbors-ids into vector for BFS alg. + // todo: this is actually format generic and can be pulled to graph.h + std::vector get_neighbors_ids(uint64_t id) override { + std::vector targetVertexIds; + + for (uint64_t const edgeId : get_outgoing_edge_ids(id)) { + assert(edges->exists_edge(edgeId)); + targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); + } + return targetVertexIds; } void compress(GraphCompressionFormat target_format) override { std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; + + if (!finalized) { + std::cout << "Transforming vectors into columns" << std::endl; + this->finalize(); + } + + // TODO: convert column to target_format //this->current_compression = target_format; } @@ -157,12 +217,11 @@ namespace morphstore{ auto [index_size, data_size] = Graph::get_size_of_graph(); // adjacencyListPerVertex - index_size += sizeof(std::unordered_map>>); - index_size += adjacencylistPerVertex.size() * (sizeof(uint64_t) + sizeof(std::shared_ptr>)); + index_size += sizeof(std::unordered_map); + index_size += adjacencylistPerVertex.size() * (sizeof(uint64_t) + sizeof(adjacency_list_variant)); - for(const auto& iterator : adjacencylistPerVertex){ - // might be wrong in case of compression - data_size += sizeof(uint64_t) * iterator.second->size(); + for(const auto iterator : adjacencylistPerVertex){ + data_size += std::visit(Adjacency_List_Size_Visitor{}, iterator.second); } return {index_size, data_size}; @@ -170,13 +229,15 @@ namespace morphstore{ // for debugging: print neighbors a vertex void print_neighbors_of_vertex(uint64_t id) override{ - std::cout << "Neighbours for Vertex with id " << id << std::endl; - if(adjacencylistPerVertex.find(id) == adjacencylistPerVertex.end()) { + std::cout << std::endl << "Neighbours for Vertex with id " << id << std::endl; + auto edge_ids = get_outgoing_edge_ids(id); + + if(edge_ids.size() == 0) { std::cout << " No outgoing edges for vertex with id: " << id << std::endl; } else { - for (const auto edgeId : *adjacencylistPerVertex[id]) { - print_edge_by_id(edgeId); + for (const auto edge_id : edge_ids) { + print_edge_by_id(edge_id); } } } @@ -184,6 +245,8 @@ namespace morphstore{ void statistics() override { Graph::statistics(); std::cout << "Number of adjacency lists:" << adjacencylistPerVertex.size() << std::endl; + std::string isFinal = (finalized) ? "true" : "false"; + std::cout << "AdjacencyLists finalized:" << isFinal << std::endl; std::cout << std::endl << std::endl; } From c2dc471626db1763050952854735db1d1d33ab44 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 12:39:53 +0200 Subject: [PATCH 148/216] Assert correct out_degrees after compression --- .../storage/graph/simple/simple_graph_test.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 76cc17f9..7cdd4278 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -57,10 +57,6 @@ void simpleGraphFormatTest (void) { // (DEBUG) graph->statistics(); - graph->print_edge_by_id(0); - graph->compress(morphstore::GraphCompressionFormat::DELTA); - graph->print_neighbors_of_vertex(v2); - graph->statistics(); assert(graph->getVertexCount() == 3); assert(graph->getEdgeCount() == 3); @@ -68,5 +64,19 @@ void simpleGraphFormatTest (void) { assert(graph->get_out_degree(v3) == 0); assert(graph->get_out_degree(v1) == 1); assert(graph->get_out_degree(v2) == 2); + + graph->compress(morphstore::GraphCompressionFormat::DELTA); + + graph->statistics(); + +/* graph->print_neighbors_of_vertex(v1); + graph->print_neighbors_of_vertex(v2); + graph->print_neighbors_of_vertex(v3); */ + + assert(graph->get_out_degree(v3) == 0); + assert(graph->get_out_degree(v1) == 1); + assert(graph->get_out_degree(v2) == 2); + + //assert(false); } From 0d9ab239f4c62cdbf42749c95c9eafe9a708495f Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 16:13:44 +0200 Subject: [PATCH 149/216] Add virtual deconstructor to `column_base` --- include/core/storage/column.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/core/storage/column.h b/include/core/storage/column.h index 57449b87..4f2884e7 100644 --- a/include/core/storage/column.h +++ b/include/core/storage/column.h @@ -46,6 +46,7 @@ enum class storage_persistence_type { // use-case: graph formats can change their column format at run-time via `compress(Format f)` class column_base { public: + virtual ~column_base() {} // todo: find a way to specify `inline` virtual voidptr_t get_data( void ) const = 0; virtual size_t get_count_values( void ) const = 0; From aa0d7f7432aee8c016b31a0e5b0dc694b03de68f Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 16:14:36 +0200 Subject: [PATCH 150/216] Implement `compress` for AdjacencyList format --- .../storage/graph/formats/adjacencylist.h | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 9d7885f4..e77324be 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -38,12 +38,13 @@ namespace morphstore{ class AdjacencyList: public Graph { private: - using adjacency_column = column_base*; + // const column as after finalized only read_only + using adjacency_column = const column_base*; using adjacency_vector = std::vector*; using adjacency_list_variant = std::variant; struct Adjacency_List_Size_Visitor { - size_t operator()(const adjacency_column c) const { + size_t operator()(adjacency_column c) const { return c->get_size_used_byte(); } size_t operator()(const adjacency_vector v) const { @@ -56,8 +57,7 @@ namespace morphstore{ return c; } adjacency_column operator()(const adjacency_vector v) const { - // const_cast to be able to assign colum to entry value - return const_cast(make_column(v->data(), v->size(), true)); + return make_column(v->data(), v->size(), true); } }; @@ -86,7 +86,7 @@ namespace morphstore{ ~AdjacencyList() { for(auto entry: this->adjacencylistPerVertex) { if (finalized) { - free(std::get(entry.second)); + delete std::get(entry.second); } else { free(std::get(entry.second)); @@ -159,6 +159,7 @@ namespace morphstore{ else { uint64_t out_degree; if (finalized) { + // todo: verify that column can stay decompressod for retrieving count_values out_degree = std::get(entry->second)->get_count_values(); } else { @@ -173,7 +174,7 @@ namespace morphstore{ auto entry = adjacencylistPerVertex.find(id); if (entry != adjacencylistPerVertex.end()) { if (this->finalized) { - adjacency_column col = std::get(entry->second); + adjacency_column col = decompress_graph_col(std::get(entry->second), current_compression); const size_t column_size = col->get_count_values(); // TODO: init vector via range-constructor / mem-cpy //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; @@ -199,17 +200,40 @@ namespace morphstore{ return targetVertexIds; } + // compresses the adj-lists to the given target_format + // !!! first time overhead: as convert each vector to a column (finalizing) !!! void compress(GraphCompressionFormat target_format) override { - std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; - if (!finalized) { std::cout << "Transforming vectors into columns" << std::endl; this->finalize(); } + + std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; + + std::unordered_map morphedAdjColumns; + morphedAdjColumns.reserve(adjacencylistPerVertex.size()); + + for (auto const& [id, adjList] : adjacencylistPerVertex) { + auto old_adj_col = std::get(adjList); + adjacency_column morphed_adj_col = morph_graph_col(old_adj_col, current_compression, target_format); + delete old_adj_col; + morphedAdjColumns.insert({id, morphed_adj_col}); + } - // TODO: convert column to target_format + this->adjacencylistPerVertex = morphedAdjColumns; + this->current_compression = target_format; + + // TODO: move into seperate function (maybe returning map) + std::vector compr_ratios; + for (auto const& [id, adjList] : adjacencylistPerVertex) { + std::cout << "compression_ratio of adjlist of vertex " << id << std::endl; + compr_ratios.push_back(compression_ratio(std::get(adjList), current_compression)); + } + + double avg_compr_ratio = std::accumulate(compr_ratios.begin(), compr_ratios.end(), 0.0) / compr_ratios.size(); + std::cout << "avg compression " << avg_compr_ratio << std::endl; + - //this->current_compression = target_format; } // for measuring the size in bytes: @@ -231,7 +255,7 @@ namespace morphstore{ void print_neighbors_of_vertex(uint64_t id) override{ std::cout << std::endl << "Neighbours for Vertex with id " << id << std::endl; auto edge_ids = get_outgoing_edge_ids(id); - + if(edge_ids.size() == 0) { std::cout << " No outgoing edges for vertex with id: " << id << std::endl; } From db4fb055ff5d1a371aca829359dc1c9da4c66dde Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 17:47:50 +0200 Subject: [PATCH 151/216] Use a pointer to map for the adj-lists better manual `delete` .. and some renamings --- .../storage/graph/formats/adjacencylist.h | 81 +++++++++++-------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index e77324be..04e00c39 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -62,8 +62,9 @@ namespace morphstore{ }; // maps the outgoing edges (ids) per vertex - std::unordered_map adjacencylistPerVertex; - + std::unordered_map *adjacencylistPerVertex = + new std::unordered_map(); + // indicating whether we have columns or vectors (columns after first compress() call) // TODO: is this replace-able by just checking the type of the first element in the map? (via holds_alternative) bool finalized = false; @@ -71,26 +72,32 @@ namespace morphstore{ // convert every adjVector to a adjColumn void finalize() { if (!finalized) { - std::unordered_map adjacencyColumnPerVertex; - adjacencyColumnPerVertex.reserve(adjacencylistPerVertex.size()); + std::unordered_map *adjacency_column_per_vertex = + new std::unordered_map(); + + adjacency_column_per_vertex->reserve(adjacencylistPerVertex->size()); - for(auto entry: adjacencylistPerVertex) { - adjacencyColumnPerVertex.insert({entry.first, std::visit(Adjacency_List_Finalizer{}, entry.second)}); + for(auto [id, adj_list]: *adjacencylistPerVertex) { + adjacency_column_per_vertex->insert({id, std::visit(Adjacency_List_Finalizer{}, adj_list)}); } - this->adjacencylistPerVertex = adjacencyColumnPerVertex; + delete adjacencylistPerVertex; + + this->adjacencylistPerVertex = adjacency_column_per_vertex; this->finalized = true; } } public: ~AdjacencyList() { - for(auto entry: this->adjacencylistPerVertex) { + for(auto [id, adj_list]: *this->adjacencylistPerVertex) { if (finalized) { - delete std::get(entry.second); + delete std::get(adj_list); } else { - free(std::get(entry.second)); + free(std::get(adj_list)); } + + delete adjacencylistPerVertex; } } @@ -106,7 +113,7 @@ namespace morphstore{ // function: to set graph allocations void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); - adjacencylistPerVertex.reserve(numberVertices); + adjacencylistPerVertex->reserve(numberVertices); } // adding a single edge to vertex: @@ -131,11 +138,11 @@ namespace morphstore{ } std::vector *adjacencyList; - if (adjacencylistPerVertex.find(sourceId) != adjacencylistPerVertex.end()) { - adjacencyList = std::get(adjacencylistPerVertex[sourceId]); + if (adjacencylistPerVertex->find(sourceId) != adjacencylistPerVertex->end()) { + adjacencyList = std::get(adjacencylistPerVertex->at(sourceId)); } else { adjacencyList = new std::vector(); - adjacencylistPerVertex.insert({sourceId, adjacencyList}); + adjacencylistPerVertex->insert({sourceId, adjacencyList}); } adjacencyList->reserve(edgesToAdd.size()); @@ -152,14 +159,14 @@ namespace morphstore{ // get number of neighbors of vertex with id uint64_t get_out_degree(uint64_t id) override { - auto entry = adjacencylistPerVertex.find(id); - if (entry == adjacencylistPerVertex.end()) { + auto entry = adjacencylistPerVertex->find(id); + if (entry == adjacencylistPerVertex->end()) { return 0; } else { uint64_t out_degree; if (finalized) { - // todo: verify that column can stay decompressod for retrieving count_values + // todo: verify that column can stay compressod for retrieving count_values out_degree = std::get(entry->second)->get_count_values(); } else { @@ -171,8 +178,8 @@ namespace morphstore{ std::vector get_outgoing_edge_ids(uint64_t id) { std::vector edge_ids; - auto entry = adjacencylistPerVertex.find(id); - if (entry != adjacencylistPerVertex.end()) { + auto entry = adjacencylistPerVertex->find(id); + if (entry != adjacencylistPerVertex->end()) { if (this->finalized) { adjacency_column col = decompress_graph_col(std::get(entry->second), current_compression); const size_t column_size = col->get_count_values(); @@ -180,6 +187,8 @@ namespace morphstore{ //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; const uint64_t * start_addr = col->get_data(); edge_ids.insert(edge_ids.end(), start_addr, start_addr+column_size); + + delete col; } else { edge_ids = *std::get(entry->second); } @@ -210,24 +219,26 @@ namespace morphstore{ std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; - std::unordered_map morphedAdjColumns; - morphedAdjColumns.reserve(adjacencylistPerVertex.size()); + std::unordered_map *morphed_adj_columns = + new std::unordered_map(); + morphed_adj_columns->reserve(adjacencylistPerVertex->size()); + + for (auto const [id, adj_list] : *adjacencylistPerVertex) { + auto old_adj_col = std::get(adj_list); + adjacency_column morphed_adj_col = morph_graph_col(old_adj_col, current_compression, target_format, true); - for (auto const& [id, adjList] : adjacencylistPerVertex) { - auto old_adj_col = std::get(adjList); - adjacency_column morphed_adj_col = morph_graph_col(old_adj_col, current_compression, target_format); - delete old_adj_col; - morphedAdjColumns.insert({id, morphed_adj_col}); + morphed_adj_columns->insert({id, morphed_adj_col}); } - this->adjacencylistPerVertex = morphedAdjColumns; + delete adjacencylistPerVertex; + this->adjacencylistPerVertex = morphed_adj_columns; this->current_compression = target_format; - // TODO: move into seperate function (maybe returning map) + // TODO: move into seperate function std::vector compr_ratios; - for (auto const& [id, adjList] : adjacencylistPerVertex) { - std::cout << "compression_ratio of adjlist of vertex " << id << std::endl; - compr_ratios.push_back(compression_ratio(std::get(adjList), current_compression)); + for (auto const [id, adj_list] : *adjacencylistPerVertex) { + std::cout << "compression_ratio of adj_list of vertex " << id << std::endl; + compr_ratios.push_back(compression_ratio(std::get(adj_list), current_compression)); } double avg_compr_ratio = std::accumulate(compr_ratios.begin(), compr_ratios.end(), 0.0) / compr_ratios.size(); @@ -242,10 +253,10 @@ namespace morphstore{ // adjacencyListPerVertex index_size += sizeof(std::unordered_map); - index_size += adjacencylistPerVertex.size() * (sizeof(uint64_t) + sizeof(adjacency_list_variant)); + index_size += adjacencylistPerVertex->size() * (sizeof(uint64_t) + sizeof(adjacency_list_variant)); - for(const auto iterator : adjacencylistPerVertex){ - data_size += std::visit(Adjacency_List_Size_Visitor{}, iterator.second); + for(const auto [id, adj_list] : *adjacencylistPerVertex){ + data_size += std::visit(Adjacency_List_Size_Visitor{}, adj_list); } return {index_size, data_size}; @@ -268,7 +279,7 @@ namespace morphstore{ void statistics() override { Graph::statistics(); - std::cout << "Number of adjacency lists:" << adjacencylistPerVertex.size() << std::endl; + std::cout << "Number of adjacency lists:" << adjacencylistPerVertex->size() << std::endl; std::string isFinal = (finalized) ? "true" : "false"; std::cout << "AdjacencyLists finalized:" << isFinal << std::endl; std::cout << std::endl << std::endl; From 15a6fa474916c0096b912e163b43ebb2d2f46732 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 17:48:35 +0200 Subject: [PATCH 152/216] Delete intermediate columns in morph_graph_col --- .../core/storage/graph/graph_compr_format.h | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index d5f9ca0a..c1cefadf 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -66,25 +66,29 @@ namespace morphstore{ } // casting the column to the actual column type before morphing (as compiler could not derive it) - const column_base* morph_graph_col(const column_base* column, const GraphCompressionFormat src_f, const GraphCompressionFormat trg_f) { + // delete_old_col -> delete input column after morphing (if the result is not the input column) + const column_base* morph_graph_col(const column_base* column, const GraphCompressionFormat src_f, const GraphCompressionFormat trg_f, bool delete_in_col = false) { if (src_f == trg_f) { return column; } + const column_base *result; + switch (src_f) { case GraphCompressionFormat::UNCOMPRESSED: { const column_uncompr *old_col = dynamic_cast(column); switch (trg_f) { case GraphCompressionFormat::DELTA: - return morph(old_col); + result = morph(old_col); break; case GraphCompressionFormat::FOR: - return morph(old_col); + result = morph(old_col); break; case GraphCompressionFormat::UNCOMPRESSED: - return old_col; + result = old_col; break; } + return result; break; } @@ -92,29 +96,52 @@ namespace morphstore{ case GraphCompressionFormat::DELTA: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { const column_delta *old_col = dynamic_cast(column); - return morph(old_col); + result = morph(old_col); + } + else { + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); + result = morph_graph_col( + uncompr_col, + GraphCompressionFormat::UNCOMPRESSED, + trg_f); + delete uncompr_col; } - return morph_graph_col(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED), GraphCompressionFormat::UNCOMPRESSED, trg_f); break; } case GraphCompressionFormat::FOR: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { const column_for *old_col = dynamic_cast(column); - return morph(old_col); + result = morph(old_col); + } + else { + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); + result = morph_graph_col( + uncompr_col, + GraphCompressionFormat::UNCOMPRESSED, + trg_f); + delete uncompr_col; } - return morph_graph_col(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED), GraphCompressionFormat::UNCOMPRESSED, trg_f); break; } } - throw std::runtime_error("Did not handle src: " + to_string(src_f) + " trg: " + to_string(trg_f)); + if (result != column && delete_in_col){ + delete column; + } + + if (result == nullptr) { + throw std::runtime_error("Did not handle src: " + to_string(src_f) + " trg: " + to_string(trg_f)); + } + + return result; } - const column_uncompr* decompress_graph_col(const column_base* column, const GraphCompressionFormat src_f) { - return static_cast(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED)); + const column_uncompr* decompress_graph_col(const column_base* column, const GraphCompressionFormat src_f, bool delete_in_col = false) { + return static_cast(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col)); } double compression_ratio(const column_base* column, GraphCompressionFormat col_format) { + // TODO: need to delete decompressed_col? return decompress_graph_col(column, col_format)->get_size_used_byte() / (double) column->get_size_used_byte(); } } From bcbbf6e99c488a23043b7b1ba6f3d2c695460776 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 18:43:34 +0200 Subject: [PATCH 153/216] Delete temporary colums in CSR and refactor duplicate get_edge_ids code .. fixing a memory leak in at least `get_out_degree()` --- include/core/storage/graph/formats/csr.h | 98 +++++++++++++----------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index e395cae2..d856742e 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -118,62 +118,55 @@ namespace morphstore{ // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) override { // decompressing offset_column in order to read correct offset - // TODO: only decompress part as only offset_column[id] and offset_column[id+1] will be read - uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); + // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read + auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); + uint64_t* offset_data = uncompr_offset_col->get_data(); uint64_t offset = offset_data[id]; - // special case: last vertex id has no next offset uint64_t nextOffset; - // todo: `getExpectedVertexCount()` could be replaced by `offset_column->get_count_values()` - if(id == getExpectedVertexCount() -1){ - nextOffset = getExpectedEdgeCount(); + // special case: last vertex id has no next offset + if(id == getVertexCount() -1){ + nextOffset = getEdgeCount(); }else{ nextOffset = offset_data[id+1]; } - if(offset == nextOffset) return 0; - uint64_t degree = nextOffset - offset; - return degree; + // deleting temporary column + if (uncompr_offset_col != offset_column) { + delete uncompr_offset_col; + } + + // compute out_degree + if (offset == nextOffset) + return 0; + else { + return nextOffset - offset; + } } // function to return a vector of ids of neighbors for BFS alg. std::vector get_neighbors_ids(uint64_t id) override { - std::vector neighbourEdgeIds; - uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); - uint64_t offset = offset_data[id]; - uint64_t numberEdges = get_out_degree(id); - - // avoiding out of bounds ... - // TODO: use assert here, as this is only out of bounds if the offset - if( offset < getExpectedEdgeCount()){ - uint64_t* edgeId_data = decompress_graph_col(edgeId_column, current_compression)->get_data(); - neighbourEdgeIds.insert(neighbourEdgeIds.end(), edgeId_data+offset, edgeId_data+offset+numberEdges); - } - - std::vector targetVertexIds; - - // resolving each edgeId - for (auto edgeId: neighbourEdgeIds) - { - assert(edges->exists_edge(edgeId)); - targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); - } - - return targetVertexIds; + std::vector targetVertexIds; + for (auto edge_id : get_outgoing_edge_ids(id)) { + assert(edges->exists_edge(edge_id)); + targetVertexIds.push_back(edges->get_edge(edge_id).getTargetId()); + } + + return targetVertexIds; } void compress(GraphCompressionFormat target_format) override { - std::cout << "Morphing graph format specific data structures from " << to_string(current_compression) << " to " << to_string(target_format) << std::endl; - + std::cout << "Morphing graph format specific data structures from " + << to_string(current_compression) << " to " << to_string(target_format) << std::endl; if (current_compression == target_format) { std::cout << "Already in " << to_string(target_format); return; } - offset_column = const_cast(morph_graph_col(offset_column, current_compression, target_format)); - edgeId_column = const_cast(morph_graph_col(edgeId_column, current_compression, target_format)); + offset_column = const_cast(morph_graph_col(offset_column, current_compression, target_format, true)); + edgeId_column = const_cast(morph_graph_col(edgeId_column, current_compression, target_format, true)); std::cout << " offset col compression ratio: " << compression_ratio(offset_column, target_format) << std::endl @@ -194,19 +187,38 @@ namespace morphstore{ return {index_size, data_size}; } + std::vector get_outgoing_edge_ids(uint64_t id) { + assert(vertices->exists_vertex(id)); + + std::vector out_edge_ids; + auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); + uint64_t offset = ((uint64_t *)uncompr_offset_col->get_data())[id]; + + if (uncompr_offset_col != offset_column) { + delete uncompr_offset_col; + } + + // TODO: decompressing offset_column twice this way + uint64_t numberEdges = get_out_degree(id); + + auto uncompr_edgeId_col = decompress_graph_col(edgeId_column, current_compression); + uint64_t *edgeId_data = uncompr_edgeId_col->get_data(); + out_edge_ids.insert(out_edge_ids.end(), edgeId_data + offset, edgeId_data + offset + numberEdges); + + if (uncompr_edgeId_col != edgeId_column) { + delete uncompr_edgeId_col; + } + + return out_edge_ids; + } + // for debugging: // TODO: simply by using a get_outgoing_edges(id) method void print_neighbors_of_vertex(uint64_t id) override{ std::cout << "Neighbours for Vertex with id " << id << std::endl; - uint64_t* offset_data = decompress_graph_col(offset_column, current_compression)->get_data(); - uint64_t offset = offset_data[id]; - uint64_t numberEdges = get_out_degree(id); - - uint64_t* edgeId_data = decompress_graph_col(edgeId_column, current_compression)->get_data(); - for(uint64_t i = offset; i < offset+numberEdges; ++i){ - uint64_t edgeId = edgeId_data[i]; - print_edge_by_id(edgeId); + for(auto const edge_id: get_outgoing_edge_ids(id)){ + print_edge_by_id(edge_id); } } From e8fd68a869fdb60deadb0818e1401925638ce00d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 20:54:29 +0200 Subject: [PATCH 154/216] Comment and rename minor things .. best commit title --- include/core/storage/graph/formats/csr.h | 4 +++- include/core/storage/graph/graph.h | 9 +++++++-- include/core/storage/graph/graph_compr_format.h | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index d856742e..75de7faf 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -86,7 +86,9 @@ namespace morphstore{ assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); // currently only read-only if compressed if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { - throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + to_string(current_compression)); + throw std::runtime_error( + "Edge insertion only allowed in uncompressed format. Current format: " + + to_string(current_compression)); } uint64_t* offset_data = offset_column->get_data(); diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 8957e88a..bc671c5e 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -134,11 +134,15 @@ namespace morphstore{ } // function to return a list of pair < vertex id, degree > DESC: + // TODO: move into seperate header and use graph as input parameter std::vector> get_list_of_degree_DESC(){ std::vector> vertexDegreeList; - vertexDegreeList.reserve(expectedVertexCount); + vertexDegreeList.reserve(getVertexCount()); // fill the vector with every vertex key and his degree - for(uint64_t i = 0; i < expectedVertexCount; ++i){ + for(uint64_t i = 0; i < getVertexCount(); ++i){ + if (i % 1000 == 0) { + std::cout << "Degree-List - Current Progress" << i << "/" << getVertexCount() << std::endl; + } vertexDegreeList.push_back({i, this->get_out_degree(i)}); } // sort the vector on degree DESC @@ -150,6 +154,7 @@ namespace morphstore{ } // function to measure graph characteristics (degree and count): + // TODO: move into seperate header and use graph as input parameter void measure_degree_count(std::string filePath){ std::vector> verticesDegree = get_list_of_degree_DESC(); // unordered map for mapping degree to count: diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index c1cefadf..b541286d 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -91,14 +91,13 @@ namespace morphstore{ return result; break; } - - // as direct morphing is not yet supported .. go via decompressing first case GraphCompressionFormat::DELTA: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { const column_delta *old_col = dynamic_cast(column); result = morph(old_col); } else { + // as direct morphing is not yet supported .. go via decompressing first auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); result = morph_graph_col( uncompr_col, @@ -114,6 +113,7 @@ namespace morphstore{ result = morph(old_col); } else { + // as direct morphing is not yet supported .. go via decompressing first auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); result = morph_graph_col( uncompr_col, From 3f42887b35d8067db73af8ad26878d1682c4590e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 20:55:43 +0200 Subject: [PATCH 155/216] Remove relative imports --- include/core/storage/graph/edge/edge.h | 2 +- include/core/storage/graph/edge/edges_container.h | 4 ++-- include/core/storage/graph/graph.h | 4 ++-- include/core/storage/graph/vertex/vertex.h | 2 +- .../core/storage/graph/vertex/vertices_container.h | 4 ++-- test/core/storage/graph/ldbc/ldbc_graph_test.h | 12 ++++++++---- 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index fab6c9f9..85e58053 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_EDGE_H #define MORPHSTORE_EDGE_H -#include "../property_type.h" +#include #include #include diff --git a/include/core/storage/graph/edge/edges_container.h b/include/core/storage/graph/edge/edges_container.h index 5133c2b5..80bed61c 100644 --- a/include/core/storage/graph/edge/edges_container.h +++ b/include/core/storage/graph/edge/edges_container.h @@ -24,8 +24,8 @@ #ifndef MORPHSTORE_EDGES_CONTAINER_H #define MORPHSTORE_EDGES_CONTAINER_H -#include "edge.h" -#include "../property_type.h" +#include +#include #include #include diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index bc671c5e..7ab6aff8 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -140,9 +140,9 @@ namespace morphstore{ vertexDegreeList.reserve(getVertexCount()); // fill the vector with every vertex key and his degree for(uint64_t i = 0; i < getVertexCount(); ++i){ - if (i % 1000 == 0) { +/* if (i % 1000 == 0) { std::cout << "Degree-List - Current Progress" << i << "/" << getVertexCount() << std::endl; - } + } */ vertexDegreeList.push_back({i, this->get_out_degree(i)}); } // sort the vector on degree DESC diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index e08c2f31..2805ca69 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_VERTEX_H #define MORPHSTORE_VERTEX_H -#include "../property_type.h" +#include #include #include diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index d07d8362..0b2a8c50 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -24,8 +24,8 @@ #ifndef MORPHSTORE_VERTICES_CONTAINER_H #define MORPHSTORE_VERTICES_CONTAINER_H -#include "vertex.h" -#include "../property_type.h" +#include +#include #include #include diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index 015e3456..7e8ee309 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -61,10 +61,7 @@ void ldbcGraphFormatTest (void) { // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); - graph->compress(morphstore::GraphCompressionFormat::FOR); - - // measure degree distribution and write to file (file path as parameter): - graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); + graph->compress(morphstore::GraphCompressionFormat::DELTA); // some statistics (DEBUG) std::cout << "Some statistics" << std::endl; @@ -74,4 +71,11 @@ void ldbcGraphFormatTest (void) { graph->print_vertex_by_id(1035174); graph->print_edge_by_id(10); graph->print_neighbors_of_vertex(1035174); + + // measure degree distribution and write to file (file path as parameter): + // TODO: but this into benchmark or so .. not actual test + //std::cout << "Measure degree count" << std::endl; + //graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); + + } \ No newline at end of file From 3e8174ea341e8e8f591e089d49d744f07e0e7c5e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 21:18:40 +0200 Subject: [PATCH 156/216] Morph adj-list map now in-place and add logging --- .../storage/graph/formats/adjacencylist.h | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 04e00c39..3814b9b8 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -39,12 +39,12 @@ namespace morphstore{ private: // const column as after finalized only read_only - using adjacency_column = const column_base*; + using adjacency_column = column_base*; using adjacency_vector = std::vector*; using adjacency_list_variant = std::variant; struct Adjacency_List_Size_Visitor { - size_t operator()(adjacency_column c) const { + size_t operator()(const adjacency_column c) const { return c->get_size_used_byte(); } size_t operator()(const adjacency_vector v) const { @@ -53,15 +53,16 @@ namespace morphstore{ }; struct Adjacency_List_Finalizer { - adjacency_column operator()(const adjacency_column c) const { + adjacency_list_variant operator()(const adjacency_column c) const { return c; } - adjacency_column operator()(const adjacency_vector v) const { - return make_column(v->data(), v->size(), true); + adjacency_list_variant operator()(const adjacency_vector v) const { + // const_cast as return type is not constant + return const_cast(make_column(v->data(), v->size(), true)); } }; - // maps the outgoing edges (ids) per vertex + // maps the a list of outgoing edges (ids) to a vertex-id std::unordered_map *adjacencylistPerVertex = new std::unordered_map(); @@ -69,24 +70,16 @@ namespace morphstore{ // TODO: is this replace-able by just checking the type of the first element in the map? (via holds_alternative) bool finalized = false; - // convert every adjVector to a adjColumn + // convert every adj-vector to a adj-column void finalize() { if (!finalized) { - std::unordered_map *adjacency_column_per_vertex = - new std::unordered_map(); - - adjacency_column_per_vertex->reserve(adjacencylistPerVertex->size()); - for(auto [id, adj_list]: *adjacencylistPerVertex) { - adjacency_column_per_vertex->insert({id, std::visit(Adjacency_List_Finalizer{}, adj_list)}); + (*adjacencylistPerVertex)[id] = std::visit(Adjacency_List_Finalizer{}, adj_list); } - - delete adjacencylistPerVertex; - - this->adjacencylistPerVertex = adjacency_column_per_vertex; this->finalized = true; } } + public: ~AdjacencyList() { for(auto [id, adj_list]: *this->adjacencylistPerVertex) { @@ -181,7 +174,7 @@ namespace morphstore{ auto entry = adjacencylistPerVertex->find(id); if (entry != adjacencylistPerVertex->end()) { if (this->finalized) { - adjacency_column col = decompress_graph_col(std::get(entry->second), current_compression); + auto col = decompress_graph_col(std::get(entry->second), current_compression); const size_t column_size = col->get_count_values(); // TODO: init vector via range-constructor / mem-cpy //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; @@ -218,20 +211,19 @@ namespace morphstore{ } std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; - - std::unordered_map *morphed_adj_columns = - new std::unordered_map(); - morphed_adj_columns->reserve(adjacencylistPerVertex->size()); - + + auto entry_count = adjacencylistPerVertex->size(); + int progress = 0; for (auto const [id, adj_list] : *adjacencylistPerVertex) { auto old_adj_col = std::get(adj_list); - adjacency_column morphed_adj_col = morph_graph_col(old_adj_col, current_compression, target_format, true); - - morphed_adj_columns->insert({id, morphed_adj_col}); + if (progress % 10000 == 0) { + std::cout << "Compression Progress: " << progress << "/" << entry_count << std::endl; + } + // const_cast needed as map-value is not constant + (*adjacencylistPerVertex)[id] = const_cast(morph_graph_col(old_adj_col, current_compression, target_format, true)); + progress++; } - delete adjacencylistPerVertex; - this->adjacencylistPerVertex = morphed_adj_columns; this->current_compression = target_format; // TODO: move into seperate function From 8ad4318880ba5e1dd1274bbe7ad318cb0974b02c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 5 May 2020 21:25:14 +0200 Subject: [PATCH 157/216] Use "morph" instead of "compress" it could be also decompression --- include/core/storage/graph/formats/adjacencylist.h | 4 ++-- include/core/storage/graph/formats/csr.h | 2 +- include/core/storage/graph/graph.h | 2 +- test/core/storage/graph/ldbc/ldbc_graph_test.h | 2 +- test/core/storage/graph/simple/simple_graph_test.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 3814b9b8..eba0f8f5 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -202,9 +202,9 @@ namespace morphstore{ return targetVertexIds; } - // compresses the adj-lists to the given target_format + // morphes the adj-lists to the given target_format // !!! first time overhead: as convert each vector to a column (finalizing) !!! - void compress(GraphCompressionFormat target_format) override { + void morph(GraphCompressionFormat target_format) override { if (!finalized) { std::cout << "Transforming vectors into columns" << std::endl; this->finalize(); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 75de7faf..c0fcae92 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -158,7 +158,7 @@ namespace morphstore{ return targetVertexIds; } - void compress(GraphCompressionFormat target_format) override { + void morph(GraphCompressionFormat target_format) override { std::cout << "Morphing graph format specific data structures from " << to_string(current_compression) << " to " << to_string(target_format) << std::endl; diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 7ab6aff8..47177822 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -191,7 +191,7 @@ namespace morphstore{ virtual std::string get_storage_format() const = 0; virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; - virtual void compress(GraphCompressionFormat target_format) = 0; + virtual void morph(GraphCompressionFormat target_format) = 0; virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index 7e8ee309..c94d95e8 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -61,7 +61,7 @@ void ldbcGraphFormatTest (void) { // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); - graph->compress(morphstore::GraphCompressionFormat::DELTA); + graph->morph(morphstore::GraphCompressionFormat::DELTA); // some statistics (DEBUG) std::cout << "Some statistics" << std::endl; diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 7cdd4278..518b6998 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -65,7 +65,7 @@ void simpleGraphFormatTest (void) { assert(graph->get_out_degree(v1) == 1); assert(graph->get_out_degree(v2) == 2); - graph->compress(morphstore::GraphCompressionFormat::DELTA); + graph->morph(morphstore::GraphCompressionFormat::DELTA); graph->statistics(); From 7e50da87ccdad4d1959823a2caf979ab07efea41 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 12:29:32 +0200 Subject: [PATCH 158/216] Add edges container benchmark --- src/microbenchmarks/graph/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index 8324ae0d..e70d3e0b 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -1,5 +1,7 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/vertex_storage_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/edge_storage_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) + add_executable( edge_storage_benchmark_app edge_storage_benchmark.cpp) endif() \ No newline at end of file From 9054cc41df630d594ce47bc4ae117828d261aa4d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 16:15:19 +0200 Subject: [PATCH 159/216] Add debug flag --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3864fd73..32e18393 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ set( LOG_FILE "recentMorphStoreProjectConf.log" ) IF(CMAKE_BUILD_TYPE MATCHES Debug) morph_flag(-g) + morph_flag(-DNDEBUG) message(STATUS "MorphStore is configured in DEBUG mode.") ELSEIF(CMAKE_BUILD_TYPE MATCHES Release) morph_flag(-O2) From 5369f552a1b909fa3fa923524d0b7a558da075e2 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 16:16:35 +0200 Subject: [PATCH 160/216] Add compression ratios to CSR statistics --- include/core/storage/graph/formats/csr.h | 15 +++++++++------ include/core/storage/graph/graph.h | 1 - 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index c0fcae92..924bb62f 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -83,7 +83,11 @@ namespace morphstore{ // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { + // TODO: throw error if not in order of vertex-ids ASC inserted (currently will only produce rubbish data) + // TODO: handle if sourceIDs are skipped + // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and insert offsets for skipped vertices assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); + // currently only read-only if compressed if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { throw std::runtime_error( @@ -170,11 +174,6 @@ namespace morphstore{ offset_column = const_cast(morph_graph_col(offset_column, current_compression, target_format, true)); edgeId_column = const_cast(morph_graph_col(edgeId_column, current_compression, target_format, true)); - std::cout << " offset col compression ratio: " - << compression_ratio(offset_column, target_format) << std::endl - << " edgeId col compression ratio: " - << compression_ratio(edgeId_column, target_format) << std::endl; - this->current_compression = target_format; } @@ -225,13 +224,17 @@ namespace morphstore{ } std::string get_column_info(const column_base *column) { - return " values: " + std::to_string(column->get_count_values()) + " size in bytes: " + std::to_string(column->get_size_used_byte()); + return " values: " + std::to_string(column->get_count_values()) + + " size in bytes: " + std::to_string(column->get_size_used_byte()) + + " compression ratio: " + std::to_string(compression_ratio(column, current_compression)); } void statistics() override { Graph::statistics(); std::cout << "offset column: " << get_column_info(offset_column) << std::endl; std::cout << "edgeId column: " << get_column_info(edgeId_column) << std::endl; + std::cout << "--------------------------------------------" << std::endl; + std::cout << std::endl << std::endl; } }; } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 47177822..68f49441 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -227,7 +227,6 @@ namespace morphstore{ std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edges->edges_with_properties_count() << std::endl; std::cout << "Compression Format:" << to_string(current_compression) << std::endl; - std::cout << "--------------------------------------------" << std::endl; } void print_vertex_by_id(uint64_t id) { From 5256433c9048a9fa6da9b860ad6a492db84494c6 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 16:20:16 +0200 Subject: [PATCH 161/216] Restrict compressed adj_list by minimum size * previously as only default formats are used -> quickly OOM as most have less than 1024 edges * small adj_list are still vectors * columns are read-only currently * Introduce ratio methods --- .../storage/graph/formats/adjacencylist.h | 190 ++++++++++++------ 1 file changed, 123 insertions(+), 67 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index eba0f8f5..4e4ac705 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -48,17 +48,17 @@ namespace morphstore{ return c->get_size_used_byte(); } size_t operator()(const adjacency_vector v) const { - return v->size(); + return v->size() * sizeof(uint64_t); } }; - struct Adjacency_List_Finalizer { - adjacency_list_variant operator()(const adjacency_column c) const { - return c; + struct Adjacency_List_OutDegree_Visitor { + uint64_t operator()(const adjacency_column c) const { + // assuming compressed col has the same value count (would not work for RLE) + return c->get_count_values(); } - adjacency_list_variant operator()(const adjacency_vector v) const { - // const_cast as return type is not constant - return const_cast(make_column(v->data(), v->size(), true)); + uint64_t operator()(const adjacency_vector v) const { + return v->size(); } }; @@ -66,24 +66,53 @@ namespace morphstore{ std::unordered_map *adjacencylistPerVertex = new std::unordered_map(); - // indicating whether we have columns or vectors (columns after first compress() call) - // TODO: is this replace-able by just checking the type of the first element in the map? (via holds_alternative) - bool finalized = false; + // as default formats allocate to much memory for small columns + // TODO: allow as parameter in compr + // TODO: as parameter this could provide issues when transforming from on format to another + // handle edge-case by finalizing also checking and potentially converting to (old) current_format + // other edge_case: might need to decompress some columns if min_compr_degree got larger + uint64_t min_compr_degree = 100; - // convert every adj-vector to a adj-column + // convert big-enough adj-vector to a (read-only) adj-column void finalize() { - if (!finalized) { - for(auto [id, adj_list]: *adjacencylistPerVertex) { - (*adjacencylistPerVertex)[id] = std::visit(Adjacency_List_Finalizer{}, adj_list); + int vectors_transformed = 0; + for (auto [id, adj_list] : *adjacencylistPerVertex) { + if (std::holds_alternative(adj_list)) { + auto adj_vector = std::get(adj_list); + // this allows adding new edges to smaller adj_lists (even after morphing) + if (adj_vector->size() >= min_compr_degree) { + auto adj_col = const_cast(make_column( + adj_vector->data(), + adj_vector->size(), + true)); + + (*adjacencylistPerVertex)[id] = adj_col; + + // as v is not needed anymore and allocated using new + delete adj_vector; + vectors_transformed++; + } } - this->finalized = true; + } +#if DEBUG + std::cout << "Transformed " << vectors_transformed << " vectors into columns" << std::endl; +#endif + } + + const column_uncompr *decompress_adjacency_column(const adjacency_column col) const { + // assuming compressed col has the same value count (would not work for RLE) + if (min_compr_degree < col->get_count_values()) { + // decompress_graph_col just checks the format of the column here + return decompress_graph_col(col, GraphCompressionFormat::UNCOMPRESSED); + } else { + return decompress_graph_col(col, current_compression); } } public: ~AdjacencyList() { for(auto [id, adj_list]: *this->adjacencylistPerVertex) { - if (finalized) { + if (std::holds_alternative(adj_list)) { delete std::get(adj_list); } else { @@ -97,8 +126,9 @@ namespace morphstore{ AdjacencyList(EdgesContainerType edges_container_type) : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} - AdjacencyList(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} - + AdjacencyList(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) + : Graph(vertices_container_type) {} + std::string get_storage_format() const override { return "Adjacency_List"; } @@ -117,39 +147,38 @@ namespace morphstore{ // function that adds multiple edges (list of neighbors) at once to vertex void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { - if (finalized) { - throw std::runtime_error("Cannot add edges, if adj. lists are compressed"); - } - if (!vertices->exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } - // avoid inserting an empty adjacencyList (waste of memory) + // avoid inserting an empty adjacencyVector (waste of memory) if (edgesToAdd.size() == 0) { return ; } - std::vector *adjacencyList; - if (adjacencylistPerVertex->find(sourceId) != adjacencylistPerVertex->end()) { - adjacencyList = std::get(adjacencylistPerVertex->at(sourceId)); + std::vector *adjacencyVector; + if (auto entry = adjacencylistPerVertex->find(sourceId); entry != adjacencylistPerVertex->end()) { + if (std::holds_alternative(entry->second)) { + throw std::runtime_error("Not implemented to add edges, if adj. list is a (compressed) column"); + } + + adjacencyVector = std::get(entry->second); } else { - adjacencyList = new std::vector(); - adjacencylistPerVertex->insert({sourceId, adjacencyList}); + adjacencyVector = new std::vector(); + adjacencylistPerVertex->insert({sourceId, adjacencyVector}); } - adjacencyList->reserve(edgesToAdd.size()); + adjacencyVector->reserve(edgesToAdd.size()); for (const auto edge : edgesToAdd) { if (!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found :" + edge.to_string()); } edges->add_edge(edge); - adjacencyList->push_back(edge.getId()); + adjacencyVector->push_back(edge.getId()); } } - // get number of neighbors of vertex with id uint64_t get_out_degree(uint64_t id) override { auto entry = adjacencylistPerVertex->find(id); @@ -157,35 +186,31 @@ namespace morphstore{ return 0; } else { - uint64_t out_degree; - if (finalized) { - // todo: verify that column can stay compressod for retrieving count_values - out_degree = std::get(entry->second)->get_count_values(); - } - else { - out_degree = std::get(entry->second)->size(); - } - return out_degree; + return std::visit(Adjacency_List_OutDegree_Visitor{}, entry->second); } } std::vector get_outgoing_edge_ids(uint64_t id) { + // basically column -> vector (as convinient to use in other methods) + // maybe better idea would be to return a uint64_t* instead (together with a size value) std::vector edge_ids; - auto entry = adjacencylistPerVertex->find(id); - if (entry != adjacencylistPerVertex->end()) { - if (this->finalized) { - auto col = decompress_graph_col(std::get(entry->second), current_compression); - const size_t column_size = col->get_count_values(); + if (auto entry = adjacencylistPerVertex->find(id); entry != adjacencylistPerVertex->end()) { + auto adj_list = entry->second; + if (std::holds_alternative(adj_list)) { + auto uncompr_col = decompress_adjacency_column(std::get(adj_list)); + const size_t column_size = uncompr_col->get_count_values(); // TODO: init vector via range-constructor / mem-cpy //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; - const uint64_t * start_addr = col->get_data(); - edge_ids.insert(edge_ids.end(), start_addr, start_addr+column_size); - - delete col; + const uint64_t *start_addr = uncompr_col->get_data(); + + edge_ids.insert(edge_ids.end(), start_addr, start_addr + column_size); + delete uncompr_col; + } else { - edge_ids = *std::get(entry->second); + edge_ids = *std::get(adj_list); } } + return edge_ids; } @@ -205,38 +230,67 @@ namespace morphstore{ // morphes the adj-lists to the given target_format // !!! first time overhead: as convert each vector to a column (finalizing) !!! void morph(GraphCompressionFormat target_format) override { - if (!finalized) { - std::cout << "Transforming vectors into columns" << std::endl; - this->finalize(); - } + // transform big enough vectors into columns + this->finalize(); +#if DEBUG std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; - auto entry_count = adjacencylistPerVertex->size(); int progress = 0; +# endif for (auto const [id, adj_list] : *adjacencylistPerVertex) { - auto old_adj_col = std::get(adj_list); +#if DEBUG if (progress % 10000 == 0) { std::cout << "Compression Progress: " << progress << "/" << entry_count << std::endl; } - // const_cast needed as map-value is not constant - (*adjacencylistPerVertex)[id] = const_cast(morph_graph_col(old_adj_col, current_compression, target_format, true)); progress++; +# endif + + // const_cast needed as map-value is not constant + if (std::visit(Adjacency_List_OutDegree_Visitor{}, adj_list) >= min_compr_degree) { + auto old_adj_col = std::get(adj_list); + (*adjacencylistPerVertex)[id] = const_cast(morph_graph_col( + old_adj_col, + current_compression, + target_format, + true)); + } } - + this->current_compression = target_format; + } - // TODO: move into seperate function - std::vector compr_ratios; + double compr_ratio() const { + double total_compr_ratio = 0; for (auto const [id, adj_list] : *adjacencylistPerVertex) { - std::cout << "compression_ratio of adj_list of vertex " << id << std::endl; - compr_ratios.push_back(compression_ratio(std::get(adj_list), current_compression)); + auto out_degree = std::visit(Adjacency_List_OutDegree_Visitor{}, adj_list); + double compr_ratio; + if (std::holds_alternative(adj_list)) { + auto adj_col = std::get(adj_list); + compr_ratio = compression_ratio(adj_col, current_compression); + } else { + compr_ratio = 1; + } + auto weighted_ratio = compr_ratio * ((double)out_degree / getEdgeCount()); + total_compr_ratio += weighted_ratio; } - double avg_compr_ratio = std::accumulate(compr_ratios.begin(), compr_ratios.end(), 0.0) / compr_ratios.size(); - std::cout << "avg compression " << avg_compr_ratio << std::endl; + return total_compr_ratio; + } + + double column_ratio() const { + if (getEdgeCount() == 0) { + return 1; + } + uint64_t column_count = 0; + for (auto const [id, adj_list] : *adjacencylistPerVertex) { + if (std::holds_alternative(adj_list)) { + column_count++; + } + } + return (double) column_count / getEdgeCount(); } // for measuring the size in bytes: @@ -272,9 +326,11 @@ namespace morphstore{ void statistics() override { Graph::statistics(); std::cout << "Number of adjacency lists:" << adjacencylistPerVertex->size() << std::endl; - std::string isFinal = (finalized) ? "true" : "false"; - std::cout << "AdjacencyLists finalized:" << isFinal << std::endl; + std::cout << "Colum ratio:" << column_ratio() << std::endl; + std::cout << "Compression ratio:" << compr_ratio() << std::endl; + std::cout << "--------------------------------------------" << std::endl; std::cout << std::endl << std::endl; + } }; From 24fe011561ed030d73a9ba4200250374350a102d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 16:21:10 +0200 Subject: [PATCH 162/216] Solve issue for building in release mode and add 2nd statistics call to ldbc_graph_Test --- include/core/storage/graph/graph_compr_format.h | 5 ++--- test/core/storage/graph/ldbc/ldbc_graph_test.h | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index b541286d..8cbb1b72 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -72,7 +72,7 @@ namespace morphstore{ return column; } - const column_base *result; + const column_base *result = column; switch (src_f) { case GraphCompressionFormat::UNCOMPRESSED: { @@ -85,10 +85,9 @@ namespace morphstore{ result = morph(old_col); break; case GraphCompressionFormat::UNCOMPRESSED: - result = old_col; + // handled by src_f == trg_f break; } - return result; break; } case GraphCompressionFormat::DELTA: { diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index c94d95e8..ce5b33d6 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -60,11 +60,10 @@ void ldbcGraphFormatTest (void) { // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); + graph->statistics(); graph->morph(morphstore::GraphCompressionFormat::DELTA); - // some statistics (DEBUG) - std::cout << "Some statistics" << std::endl; graph->statistics(); // (DEBUG) Test Vertex, which contains edges with properties (SERVER): From 251eca2a3d63304ca8181cbd53222e40da56fa48 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 12 May 2020 22:16:40 +0200 Subject: [PATCH 163/216] Move ldbc importer to dedicated sub-folder --- include/core/storage/graph/{ => importer}/ldbc_import.h | 0 include/core/storage/graph/{ => importer}/ldbc_schema.h | 2 +- include/core/storage/graph/property_type.h | 2 +- test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h | 2 +- test/core/storage/graph/ldbc/ldbc_graph_test.h | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename include/core/storage/graph/{ => importer}/ldbc_import.h (100%) rename include/core/storage/graph/{ => importer}/ldbc_schema.h (99%) diff --git a/include/core/storage/graph/ldbc_import.h b/include/core/storage/graph/importer/ldbc_import.h similarity index 100% rename from include/core/storage/graph/ldbc_import.h rename to include/core/storage/graph/importer/ldbc_import.h diff --git a/include/core/storage/graph/ldbc_schema.h b/include/core/storage/graph/importer/ldbc_schema.h similarity index 99% rename from include/core/storage/graph/ldbc_schema.h rename to include/core/storage/graph/importer/ldbc_schema.h index b6b79544..1bbcdeff 100644 --- a/include/core/storage/graph/ldbc_schema.h +++ b/include/core/storage/graph/importer/ldbc_schema.h @@ -24,7 +24,7 @@ #ifndef MORPHSTORE_LDBC_SCHEMA_H #define MORPHSTORE_LDBC_SCHEMA_H -#include "property_type.h" +#include #include #include diff --git a/include/core/storage/graph/property_type.h b/include/core/storage/graph/property_type.h index 7f133cd9..b7ff6aee 100644 --- a/include/core/storage/graph/property_type.h +++ b/include/core/storage/graph/property_type.h @@ -18,7 +18,7 @@ /** * @file property_type.h * @brief variant of supported data types as a property - * @todo + * @todo Move into dedicated sub-folder (when different property mappings exists) */ #ifndef MORPHSTORE_PROPERTY_TYPE_H diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index 0f5ff7eb..5aca9335 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -21,7 +21,7 @@ * @todo */ -#include +#include #include #include diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index ce5b33d6..e00c5f24 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -21,7 +21,7 @@ * @todo */ -#include +#include #include void print_header(std::string storageFormat) { From f94092db817f84c170a364b4328b346e148befa1 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Wed, 13 May 2020 00:13:29 +0200 Subject: [PATCH 164/216] Rename graph_compr_format to str function to avoid to_string name collision --- include/core/storage/graph/formats/csr.h | 11 +++++++---- include/core/storage/graph/graph.h | 2 +- include/core/storage/graph/graph_compr_format.h | 9 +++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 924bb62f..7f381b18 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -92,7 +92,7 @@ namespace morphstore{ if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { throw std::runtime_error( "Edge insertion only allowed in uncompressed format. Current format: " + - to_string(current_compression)); + graph_compr_f_to_string(current_compression)); } uint64_t* offset_data = offset_column->get_data(); @@ -163,11 +163,14 @@ namespace morphstore{ } void morph(GraphCompressionFormat target_format) override { +#if DEBUG std::cout << "Morphing graph format specific data structures from " - << to_string(current_compression) << " to " << to_string(target_format) << std::endl; - + << graph_compr_f_to_string(current_compression) << " to " << graph_compr_f_to_string(target_format) << std::endl; +#endif if (current_compression == target_format) { - std::cout << "Already in " << to_string(target_format); +#if DEBUG + std::cout << "Already in " << graph_compr_f_to_string(target_format); +#endif return; } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 68f49441..b27578ac 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -226,7 +226,7 @@ namespace morphstore{ std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edges->edges_with_properties_count() << std::endl; - std::cout << "Compression Format:" << to_string(current_compression) << std::endl; + std::cout << "Compression Format:" << graph_compr_f_to_string(current_compression) << std::endl; } void print_vertex_by_id(uint64_t id) { diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index 8cbb1b72..ab5065a1 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -47,18 +47,18 @@ namespace morphstore{ enum class GraphCompressionFormat {DELTA, FOR, UNCOMPRESSED}; - std::string to_string(GraphCompressionFormat format) { + std::string graph_compr_f_to_string(GraphCompressionFormat format) { std::string desc; switch (format) { case GraphCompressionFormat::DELTA: - desc = "Delta"; + desc = "Delta (Default)"; break; case GraphCompressionFormat::UNCOMPRESSED: desc = "Uncompressed"; break; case GraphCompressionFormat::FOR: - desc = "Frame of Reference"; + desc = "Frame of Reference (Default)"; break; } @@ -102,6 +102,7 @@ namespace morphstore{ uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f); + delete uncompr_col; } break; @@ -129,7 +130,7 @@ namespace morphstore{ } if (result == nullptr) { - throw std::runtime_error("Did not handle src: " + to_string(src_f) + " trg: " + to_string(trg_f)); + throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + " trg: " + graph_compr_f_to_string(trg_f)); } return result; From aec3792295200a11b1cafc9e7cd05b245f7c354a Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Wed, 13 May 2020 00:13:53 +0200 Subject: [PATCH 165/216] Add functions returning column compr. ratios --- include/core/storage/graph/formats/csr.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 7f381b18..bfa2cebb 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -216,6 +216,14 @@ namespace morphstore{ return out_edge_ids; } + double offset_column_compr_ratio() { + return compression_ratio(offset_column, current_compression); + } + + double edgeId_column_compr_ratio() { + return compression_ratio(edgeId_column, current_compression); + } + // for debugging: // TODO: simply by using a get_outgoing_edges(id) method void print_neighbors_of_vertex(uint64_t id) override{ From 8ba31042f96397b328e44b22a908fcbdbabc9f7c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Wed, 13 May 2020 00:14:24 +0200 Subject: [PATCH 166/216] Extract common functions into a graph benchmark helper --- src/microbenchmarks/graph/benchmark_helper.h | 46 +++++++++++++++++++ .../graph/edge_storage_benchmark.cpp | 14 +----- .../graph/vertex_storage_benchmark.cpp | 15 +----- 3 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 src/microbenchmarks/graph/benchmark_helper.h diff --git a/src/microbenchmarks/graph/benchmark_helper.h b/src/microbenchmarks/graph/benchmark_helper.h new file mode 100644 index 00000000..8deb036e --- /dev/null +++ b/src/microbenchmarks/graph/benchmark_helper.h @@ -0,0 +1,46 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file benchmark_helper.h + * @brief Helper functions for graph benchmarks + * @todo +*/ + +#ifndef BENCHMARK_HELPER +#define BENCHMARK_HELPER + +#include +#include +#include + +namespace morphstore { + using highResClock = std::chrono::high_resolution_clock; + + int64_t get_duration(std::chrono::time_point start) { + auto stop = highResClock::now(); + return std::chrono::duration_cast(stop - start).count(); + } + + int64_t get_median(std::vector values) { + assert(values.size() > 0); + std::nth_element(values.begin(), values.begin() + values.size() / 2, values.end()); + return values[values.size() / 2]; + } +} // namespace morphstore + +#endif //BENCHMARK_HELPER diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp index ff9444f1..4c6ad7ef 100644 --- a/src/microbenchmarks/graph/edge_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -23,24 +23,12 @@ #include #include -#include #include -#include +#include "benchmark_helper.h" -typedef std::chrono::high_resolution_clock highResClock; using namespace morphstore; -int64_t get_duration(std::chrono::time_point start) { - auto stop = highResClock::now(); - return std::chrono::duration_cast(stop - start).count(); -} - -int64_t get_median(std::vector values) { - assert(values.size() > 0); - std::nth_element(values.begin(), values.begin() + values.size()/2, values.end()); - return values[values.size()/2]; -} int main(void) { // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index c0ae75d8..11e4c7d2 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -22,24 +22,11 @@ #include #include -#include #include -#include +#include "benchmark_helper.h" - -typedef std::chrono::high_resolution_clock highResClock; using namespace morphstore; -int64_t get_duration(std::chrono::time_point start) { - auto stop = highResClock::now(); - return std::chrono::duration_cast(stop - start).count(); -} - -int64_t get_median(std::vector values) { - assert(values.size() > 0); - std::nth_element(values.begin(), values.begin() + values.size()/2, values.end()); - return values[values.size()/2]; -} int main(void) { // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function From 8459afafea6a077165bd7e6c869acb1743806c53 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Wed, 13 May 2020 00:15:13 +0200 Subject: [PATCH 167/216] Add graph compression benchmarks --- src/microbenchmarks/graph/CMakeLists.txt | 6 + .../adjList_graph_compression_benchmark.cpp | 112 +++++++++++++++++ .../graph/csr_graph_compression_benchmark.cpp | 115 ++++++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp create mode 100644 src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index e70d3e0b..8867f945 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -1,7 +1,13 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/vertex_storage_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/edge_storage_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_csr_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_adjList_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) add_executable( edge_storage_benchmark_app edge_storage_benchmark.cpp) + add_executable( compress_csr_benchmark_app csr_graph_compression_benchmark.cpp) + add_executable( compress_adjList_benchmark_app adjList_graph_compression_benchmark.cpp) + target_link_libraries(compress_csr_benchmark_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(compress_adjList_benchmark_app PRIVATE "-ldl" stdc++fs) endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp new file mode 100644 index 00000000..37e41349 --- /dev/null +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -0,0 +1,112 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file graph_compression_benchmark.cpp + * @brief A benchmark of the csr-graph compression (using the ldbc graph) + * @todo allow different compression formats for the two csr columns; add full_iterate + */ + +#include +#include +#include +#include "benchmark_helper.h" + +using namespace morphstore; + + +struct CompressionBenchmarkEntry { + GraphCompressionFormat compr_format; + int64_t compression_time; + double compression_ratio; + double column_ratio; + int64_t random_access_time; + + std::string to_string() { + return "|" + graph_compr_f_to_string(compr_format) + + "|" + std::to_string(compression_time) + + "|" + std::to_string(compression_ratio) + + "|" + std::to_string(column_ratio) + + "|" + std::to_string(random_access_time); + } +}; + +int main(void) { + // could be also build parameters? + const int number_of_executions = 5; + const int number_of_random_access = 1000; + std::string sourceDir = ""; + + if (sourceDir.empty()) { + throw std::invalid_argument("Where are the ldbc files??"); + } + + + std::vector compr_formats = { + GraphCompressionFormat::DELTA, + GraphCompressionFormat::FOR, + GraphCompressionFormat::UNCOMPRESSED + }; + + // Load ldbc graph + std::unique_ptr graph = std::make_unique(); + std::unique_ptr ldbcImport = std::make_unique(sourceDir); + ldbcImport->import(*graph); + + // prepare random-access + std::random_device rd; + std::uniform_int_distribution dist(0, graph->getVertexCount() - 1); + std::vector random_accesses; + for (int i = 0; i < number_of_random_access; i++) { + random_accesses.push_back(dist(rd)); + } + + + std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; + std::cout << "Compression-Format | compression-time | " + << "compr. ratio | column ratio | access of edges of 5000 random vertices" << std::endl; + + for (auto current_f : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.compr_format = current_f; + // restore start state + graph->morph(GraphCompressionFormat::UNCOMPRESSED); + + auto start = highResClock::now(); + graph->morph(current_f); + // compression time + current_try.compression_time = get_duration(start); + + current_try.compression_ratio = graph->compr_ratio(); + // currently based on fixed min_compr_degree + current_try.column_ratio = graph->column_ratio(); + + + // random access + start = highResClock::now(); + for (int random_pos : random_accesses) { + graph->get_outgoing_edge_ids(random_pos); + } + current_try.random_access_time = get_duration(start); + + std::cout << current_try.to_string() << std::endl; + } + } + + return 0; +} diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp new file mode 100644 index 00000000..af34bbfc --- /dev/null +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -0,0 +1,115 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file csr_graph_compression_benchmark.cpp + * @brief A benchmark of the csr-graph compression (using the ldbc graph) + * @todo allow different compression formats for the two csr columns; add full_iterate + */ + +#include +#include +#include +#include "benchmark_helper.h" + +using namespace morphstore; + + +struct CompressionBenchmarkEntry { + GraphCompressionFormat compr_format; + int64_t compression_time; + double offset_col_compression_ratio; + double edgeId_col_compression_ratio; + int64_t random_access_time; + int64_t full_iterate; + + std::string to_string() { + return "|" + graph_compr_f_to_string(compr_format) + + "|" + std::to_string(compression_time) + + "|" + std::to_string(offset_col_compression_ratio) + + "|" + std::to_string(edgeId_col_compression_ratio) + + "|" + std::to_string(random_access_time); + } +}; + +int main(void) { + // could be also build parameters? + const int number_of_executions = 5; + const int number_of_random_access = 1000; + std::string sourceDir = ""; + + if (sourceDir.empty()) { + throw std::invalid_argument("Where are the ldbc files??"); + } + + + std::vector compr_formats = { + GraphCompressionFormat::UNCOMPRESSED, + GraphCompressionFormat::DELTA, + GraphCompressionFormat::FOR + }; + + // Load ldbc graph + std::unique_ptr graph = std::make_unique(); + std::unique_ptr ldbcImport = std::make_unique(sourceDir); + ldbcImport->import(*graph); + + // prepare random-access + std::random_device rd; + std::uniform_int_distribution dist(0, graph->getVertexCount() - 1); + std::vector random_accesses; + for (int i = 0; i < number_of_random_access; i++) { + random_accesses.push_back(dist(rd)); + } + + + std::cout << "Test compression of ldbc-graph in CSR format (times in micro-seconds)" << std::endl; + std::cout << "Compression-Format | compression-time | offset-column compr. ratio" << + " | edgeId-column compr. ratio | access of edges of " << + std::to_string(number_of_random_access) + " random vertices | full edge-list iterate" + << std::endl; + + for (auto current_f : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.compr_format = current_f; + // restore start state + graph->morph(GraphCompressionFormat::UNCOMPRESSED); + + auto start = highResClock::now(); + graph->morph(current_f); + // compression time + current_try.compression_time = get_duration(start); + + // compression-ratios + current_try.offset_col_compression_ratio = graph->offset_column_compr_ratio(); + current_try.edgeId_col_compression_ratio = graph->edgeId_column_compr_ratio(); + + + // random access + start = highResClock::now(); + for (int random_pos : random_accesses) { + graph->get_outgoing_edge_ids(random_pos); + } + current_try.random_access_time = get_duration(start); + + std::cout << current_try.to_string() << std::endl; + } + } + + return 0; +} From 005d904b1f449e3ab5022ea4dd64aa92c3592343 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Thu, 14 May 2020 09:31:56 +0200 Subject: [PATCH 168/216] Format graph files finally found a clang format style to rule them all .. "{ BasedOnStyle: LLVM, UseTab: Never, IndentWidth: 4, TabWidth: 4, BreakBeforeBraces: Attach, AllowShortIfStatementsOnASingleLine: false, IndentCaseLabels: false, ColumnLimit: 0, AccessModifierOffset: -4, ColumnLimit: 119, NamespaceIndentation: All}" --- include/core/storage/graph/edge/edge.h | 89 +++--- .../core/storage/graph/edge/edges_container.h | 180 ++++++----- .../graph/edge/edges_hashmap_container.h | 72 ++--- .../graph/edge/edges_vectorarray_container.h | 154 +++++----- .../storage/graph/formats/adjacencylist.h | 91 +++--- include/core/storage/graph/formats/csr.h | 78 +++-- include/core/storage/graph/graph.h | 125 ++++---- .../core/storage/graph/graph_compr_format.h | 62 ++-- .../core/storage/graph/importer/ldbc_import.h | 279 +++++++++--------- .../core/storage/graph/importer/ldbc_schema.h | 165 +++++------ include/core/storage/graph/property_type.h | 22 +- include/core/storage/graph/vertex/vertex.h | 79 +++-- .../storage/graph/vertex/vertices_container.h | 192 ++++++------ .../graph/vertex/vertices_hashmap_container.h | 71 ++--- .../vertex/vertices_vectorarray_container.h | 161 +++++----- src/microbenchmarks/graph/benchmark_helper.h | 8 +- .../graph/csr_graph_compression_benchmark.cpp | 94 +++--- .../graph/edge_storage_benchmark.cpp | 118 ++++---- .../graph/vertex_storage_benchmark.cpp | 105 ++++--- .../graph/ldbc/bfs_ldbc_graph_test.h | 23 +- .../core/storage/graph/ldbc/ldbc_graph_test.h | 22 +- .../graph/simple/simple_adj_graph_test.cpp | 2 +- .../graph/simple/simple_csr_graph_test.cpp | 2 +- .../storage/graph/simple/simple_graph_test.h | 13 +- 24 files changed, 1029 insertions(+), 1178 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 85e58053..9484bd4a 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -19,23 +19,23 @@ * @file edge.h * @brief Edge class which represents an edge object between two vertices * @todo -*/ + */ #ifndef MORPHSTORE_EDGE_H #define MORPHSTORE_EDGE_H #include +#include +#include #include -#include #include -#include #include -#include +#include -namespace morphstore{ +namespace morphstore { - class Edge{ + class Edge { protected: // Edge characteristics @@ -47,20 +47,21 @@ namespace morphstore{ bool valid = false; uint64_t getNextEdgeId() const { - // Todo: enable resetting maxEdgeId - // Ideal would be to pull id gen to graph.h but this requires rewriting Ldbc importer to use (edge property setting depends on it) + // Todo: enable resetting maxEdgeId + // Ideal would be to pull id gen to graph.h but this requires rewriting Ldbc importer to use (edge property + // setting depends on it) static uint64_t currentMaxEdgeId = 0; return currentMaxEdgeId++; } public: // default constr. needed for EdgeWithProperties constructor - Edge(){} + Edge() {} Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) : Edge(getNextEdgeId(), sourceId, targetId, type) {} - Edge(uint64_t id, uint64_t sourceId, uint64_t targetId, unsigned short int type){ + Edge(uint64_t id, uint64_t sourceId, uint64_t targetId, unsigned short int type) { this->sourceID = sourceId; this->targetID = targetId; this->type = type; @@ -69,7 +70,7 @@ namespace morphstore{ } // this is needed for csr when doing edge_array[offset] = edge... - Edge& operator= (const Edge &edge){ + Edge &operator=(const Edge &edge) { // self-assignment guard if (this == &edge) return *this; @@ -87,66 +88,50 @@ namespace morphstore{ // --------------- Getter and Setter --------------- - uint64_t getId() const { - return id; - } + uint64_t getId() const { return id; } - uint64_t getSourceId() const { - return sourceID; - } + uint64_t getSourceId() const { return sourceID; } - uint64_t getTargetId() const { - return targetID; - } + uint64_t getTargetId() const { return targetID; } - unsigned short getType() const { - return type; - } + unsigned short getType() const { return type; } - bool isValid() const { - return valid; - } + bool isValid() const { return valid; } // function for sorting algorithms in the ldbc-importer: // compare target-ids and return if it's "lower" (we need the sorting for the CSR) - bool operator<(const Edge& e) const{ - return getTargetId() < e.getTargetId(); - } + bool operator<(const Edge &e) const { return getTargetId() < e.getTargetId(); } // get size of edge object in bytes: static size_t size_in_bytes() { size_t size = 0; - size += sizeof(uint64_t) * 3; // id, source- and target-id + size += sizeof(uint64_t) * 3; // id, source- and target-id size += sizeof(unsigned short int); // type - size += sizeof(bool); // valid flag + size += sizeof(bool); // valid flag return size; } std::string to_string() const { - return "(id:" + std::to_string(this->id) + " ," - + std::to_string(this->sourceID) + "->" + std::to_string(this->targetID) + " ," - + "valid: " + std::to_string(this->valid) + ")"; + return "(id:" + std::to_string(this->id) + " ," + std::to_string(this->sourceID) + "->" + + std::to_string(this->targetID) + " ," + "valid: " + std::to_string(this->valid) + ")"; } }; class EdgeWithProperties { - private: - Edge edge; - std::unordered_map properties; - public: - EdgeWithProperties(Edge edge, const std::unordered_map properties) { - this->edge = edge; - this->properties = properties; - } - - Edge getEdge() { - return edge; - } - - std::unordered_map getProperties() { - return properties; - } + private: + Edge edge; + std::unordered_map properties; + + public: + EdgeWithProperties(Edge edge, const std::unordered_map properties) { + this->edge = edge; + this->properties = properties; + } + + Edge getEdge() { return edge; } + + std::unordered_map getProperties() { return properties; } }; -} +} // namespace morphstore -#endif //MORPHSTORE_EDGE_H +#endif // MORPHSTORE_EDGE_H diff --git a/include/core/storage/graph/edge/edges_container.h b/include/core/storage/graph/edge/edges_container.h index 80bed61c..6797496f 100644 --- a/include/core/storage/graph/edge/edges_container.h +++ b/include/core/storage/graph/edge/edges_container.h @@ -19,7 +19,7 @@ * @file edges_container.h * @brief abstract class for storing edges * @todo an EntityContainer abstraction (reduce duplicated code) -*/ + */ #ifndef MORPHSTORE_EDGES_CONTAINER_H #define MORPHSTORE_EDGES_CONTAINER_H @@ -27,128 +27,120 @@ #include #include +#include #include #include -#include #include -namespace morphstore{ - enum class EdgesContainerType {HashMapContainer, VectorArrayContainer}; +namespace morphstore { + enum class EdgesContainerType { HashMapContainer, VectorArrayContainer }; class EdgesContainer { - protected: - uint64_t expected_edge_count = 0; + protected: + uint64_t expected_edge_count = 0; - std::map edge_type_dictionary; + std::map edge_type_dictionary; - // TODO: try other property storage formats than per vertex .. (triple-store or per property) - std::unordered_map> edge_properties; + // TODO: try other property storage formats than per vertex .. (triple-store or per property) + std::unordered_map> edge_properties; - std::string get_edge_type(unsigned short int type) const { - if (edge_type_dictionary.find(type) != edge_type_dictionary.end()) { - return edge_type_dictionary.at(type); - } - else { - return "No Matching of type-number in the database! For type " + std::to_string(type); - } + std::string get_edge_type(unsigned short int type) const { + if (edge_type_dictionary.find(type) != edge_type_dictionary.end()) { + return edge_type_dictionary.at(type); + } else { + return "No Matching of type-number in the database! For type " + std::to_string(type); } + } - public: - virtual std::string container_description() const = 0; - virtual void insert_edge(Edge e) = 0; - virtual Edge get_edge(uint64_t id) = 0; - virtual bool exists_edge(const uint64_t id) const = 0; - virtual uint64_t edge_count() const = 0; - - - virtual void allocate(uint64_t expected_edges) { - edge_properties.reserve(expected_edges); - expected_edge_count += expected_edges; - } - - void add_edge(Edge edge) { - insert_edge(edge); - } + public: + virtual std::string container_description() const = 0; + virtual void insert_edge(Edge e) = 0; + virtual Edge get_edge(uint64_t id) = 0; + virtual bool exists_edge(const uint64_t id) const = 0; + virtual uint64_t edge_count() const = 0; - bool has_properties(uint64_t id){ - return edge_properties.find(id) != edge_properties.end(); - } + virtual void allocate(uint64_t expected_edges) { + edge_properties.reserve(expected_edges); + expected_edge_count += expected_edges; + } - void add_property_to_edge(uint64_t id, const std::pair property) { - assert(exists_edge(id)); - edge_properties[id].insert(property); - }; + void add_edge(Edge edge) { insert_edge(edge); } - void set_edge_properties(uint64_t id, const std::unordered_map properties) { - assert(exists_edge(id)); + bool has_properties(uint64_t id) { return edge_properties.find(id) != edge_properties.end(); } - if (has_properties(id)) { - std::cout << "Overwritting existing properties for :"; - print_edge_by_id(id); - std::cout << std::endl; - } + void add_property_to_edge(uint64_t id, const std::pair property) { + assert(exists_edge(id)); + edge_properties[id].insert(property); + }; - edge_properties[id] = properties; - }; + void set_edge_properties(uint64_t id, const std::unordered_map properties) { + assert(exists_edge(id)); - void set_edge_type_dictionary(const std::map& types) { - assert(types.size() != 0); - this->edge_type_dictionary = types; + if (has_properties(id)) { + std::cout << "Overwritting existing properties for :"; + print_edge_by_id(id); + std::cout << std::endl; } - const EdgeWithProperties get_edge_with_properties(uint64_t id) { - assert(exists_edge(id)); - return EdgeWithProperties(get_edge(id), edge_properties[id]); - } + edge_properties[id] = properties; + }; - uint64_t edges_with_properties_count() { - return edge_properties.size(); - } + void set_edge_type_dictionary(const std::map &types) { + assert(types.size() != 0); + this->edge_type_dictionary = types; + } - virtual std::pair get_size() const { - size_t data_size = 0; - size_t index_size = 0; + const EdgeWithProperties get_edge_with_properties(uint64_t id) { + assert(exists_edge(id)); + return EdgeWithProperties(get_edge(id), edge_properties[id]); + } - // lookup type dicts - index_size += 2 * sizeof(std::map); - for(auto& type_mapping : edge_type_dictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(type_mapping.second.length()); - } + uint64_t edges_with_properties_count() { return edge_properties.size(); } - // edge-properties: - index_size += sizeof(std::unordered_map>); - for (const auto &property_mapping : edge_properties) { - index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (const auto &property : property_mapping.second) { - data_size += sizeof(char) * property.first.length() + sizeof(property.second); - } - } + virtual std::pair get_size() const { + size_t data_size = 0; + size_t index_size = 0; - return {index_size, data_size}; + // lookup type dicts + index_size += 2 * sizeof(std::map); + for (auto &type_mapping : edge_type_dictionary) { + index_size += sizeof(unsigned short int); + index_size += sizeof(char) * (type_mapping.second.length()); } - void print_type_dict(){ - std::cout << "EdgeType-Dict: " << std::endl; - for (auto const &entry : edge_type_dictionary) { - std::cout << entry.first << " -> " << entry.second << std::endl; + // edge-properties: + index_size += sizeof(std::unordered_map>); + for (const auto &property_mapping : edge_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (const auto &property : property_mapping.second) { + data_size += sizeof(char) * property.first.length() + sizeof(property.second); } } - void print_edge_by_id(const uint64_t id) { - std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; - EdgeWithProperties e = get_edge_with_properties(id); - std::cout << e.getEdge().to_string() << std::endl; - std::cout << "Type: " << this->get_edge_type(e.getEdge().getType()) << std::endl; - std::cout << "Properties: "; - for (const auto entry : e.getProperties()) { - auto value = entry.second; - std::cout << "{" << entry.first << ": "; - std::visit(PropertyValueVisitor{}, value); - std::cout << "}"; - } + return {index_size, data_size}; + } + + void print_type_dict() { + std::cout << "EdgeType-Dict: " << std::endl; + for (auto const &entry : edge_type_dictionary) { + std::cout << entry.first << " -> " << entry.second << std::endl; + } + } + + void print_edge_by_id(const uint64_t id) { + std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; + EdgeWithProperties e = get_edge_with_properties(id); + std::cout << e.getEdge().to_string() << std::endl; + std::cout << "Type: " << this->get_edge_type(e.getEdge().getType()) << std::endl; + std::cout << "Properties: "; + for (const auto entry : e.getProperties()) { + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; } + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_EDGES_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_EDGES_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/edge/edges_hashmap_container.h b/include/core/storage/graph/edge/edges_hashmap_container.h index e35f2f52..3b432017 100644 --- a/include/core/storage/graph/edge/edges_hashmap_container.h +++ b/include/core/storage/graph/edge/edges_hashmap_container.h @@ -19,7 +19,7 @@ * @file edges__hashmap_container.h * @brief storing edges using a hashmap * @todo an EntityHashMapContainer abstraction (reduce duplicated code) -*/ + */ #ifndef MORPHSTORE_EDGES_HASHMAP_CONTAINER_H #define MORPHSTORE_EDGES_HASHMAP_CONTAINER_H @@ -30,53 +30,45 @@ #include #include -namespace morphstore{ +namespace morphstore { - class EdgesHashMapContainer : public EdgesContainer{ - protected: - std::unordered_map edges; - public: - std::string container_description() const override { - return "unordered_map"; - } + class EdgesHashMapContainer : public EdgesContainer { + protected: + std::unordered_map edges; - void allocate(const uint64_t expected_edges) override { - EdgesContainer::allocate(expected_edges); - this->edges.reserve(expected_edges); - } - - void insert_edge(const Edge e) override { - edges[e.getId()] = e; - } + public: + std::string container_description() const override { return "unordered_map"; } - bool exists_edge(const uint64_t id) const override { - if(edges.find(id) == edges.end()){ - return false; - } - return true; - } + void allocate(const uint64_t expected_edges) override { + EdgesContainer::allocate(expected_edges); + this->edges.reserve(expected_edges); + } - Edge get_edge(uint64_t id) override { - return edges[id]; - } + void insert_edge(const Edge e) override { edges[e.getId()] = e; } - uint64_t edge_count() const { - return edges.size(); + bool exists_edge(const uint64_t id) const override { + if (edges.find(id) == edges.end()) { + return false; } + return true; + } - std::pair get_size() const override { - auto [index_size, data_size] = EdgesContainer::get_size(); + Edge get_edge(uint64_t id) override { return edges[id]; } - // container for indexes: - index_size += sizeof(std::unordered_map); - // index size of edge: size of id and sizeof pointer - index_size += edges.size() * sizeof(uint64_t); - data_size += edges.size() * Edge::size_in_bytes(); - + uint64_t edge_count() const { return edges.size(); } - return {index_size, data_size}; - } + std::pair get_size() const override { + auto [index_size, data_size] = EdgesContainer::get_size(); + + // container for indexes: + index_size += sizeof(std::unordered_map); + // index size of edge: size of id and sizeof pointer + index_size += edges.size() * sizeof(uint64_t); + data_size += edges.size() * Edge::size_in_bytes(); + + return {index_size, data_size}; + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_EDGES_HASHMAP_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_EDGES_HASHMAP_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index 0b30ed3d..23efeee0 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -18,8 +18,8 @@ /** * @file edges__vectorarray_container.h * @brief storing edges using a vector of arrays - * @todo -*/ + * @todo + */ #ifndef MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H #define MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H @@ -27,111 +27,103 @@ #include "edge.h" #include "edges_container.h" -#include -#include #include +#include +#include -namespace morphstore{ +namespace morphstore { // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! // and using std::array as aligned_alloc did not set invalid flag to false (could be solveable) - class EdgesVectorArrayContainer : public EdgesContainer{ - protected: - static const inline uint64_t edge_array_size = 4096; - static const inline uint64_t edges_per_array = edge_array_size / sizeof(Edge); + class EdgesVectorArrayContainer : public EdgesContainer { + protected: + static const inline uint64_t edge_array_size = 4096; + static const inline uint64_t edges_per_array = edge_array_size / sizeof(Edge); - using edge_array = std::array; - std::vector edges; + using edge_array = std::array; + std::vector edges; - uint64_t number_of_edges = 0; + uint64_t number_of_edges = 0; + edge_array allocate_edge_array() { + edge_array array; + edges.push_back(array); + // std::cout << " Added a page" << std::endl; + // std::cout.flush(); - edge_array allocate_edge_array() { - edge_array array; - edges.push_back(array); - //std::cout << " Added a page" << std::endl; - //std::cout.flush(); + return array; + } - return array; - } - - inline uint64_t get_edge_array_number(uint64_t edge_id) const { - return edge_id / edges_per_array; - } + inline uint64_t get_edge_array_number(uint64_t edge_id) const { return edge_id / edges_per_array; } - inline uint64_t get_pos_in_array(uint64_t edge_id) const { - return edge_id % edges_per_array; - } + inline uint64_t get_pos_in_array(uint64_t edge_id) const { return edge_id % edges_per_array; } - public: - std::string container_description() const override { - return "vector>"; - } + public: + std::string container_description() const override { + return "vector>"; + } - void allocate(const uint64_t expected_edges) override { - EdgesContainer::allocate(expected_edges); + void allocate(const uint64_t expected_edges) override { + EdgesContainer::allocate(expected_edges); - auto array_count = std::ceil(expected_edges / (float) edges_per_array); - this->edges.reserve(array_count); + auto array_count = std::ceil(expected_edges / (float)edges_per_array); + this->edges.reserve(array_count); - for(int i = 0; i < array_count; i++) { - allocate_edge_array(); - } + for (int i = 0; i < array_count; i++) { + allocate_edge_array(); } + } - void insert_edge(Edge e) { - auto array_number = get_edge_array_number(e.getId()); - auto array_pos = get_pos_in_array(e.getId()); - - if (array_number >= edges.size()) { - throw std::runtime_error("Exceeded edge id limit: Edge id " + - std::to_string(e.getId()) + " > " + - std::to_string(edges_per_array * edges.size() - 1)); - } - - /* if (edges.at(array_number)[array_pos].isValid()) { - throw std::runtime_error("Delete existing edge before overwriting it: edge-id " + e.to_string()); - } */ - - edges.at(array_number)[array_pos] = e; - number_of_edges++; + void insert_edge(Edge e) { + auto array_number = get_edge_array_number(e.getId()); + auto array_pos = get_pos_in_array(e.getId()); + + if (array_number >= edges.size()) { + throw std::runtime_error("Exceeded edge id limit: Edge id " + std::to_string(e.getId()) + " > " + + std::to_string(edges_per_array * edges.size() - 1)); } - bool exists_edge(const uint64_t id) const override { - uint64_t array_number = get_edge_array_number(id); - uint64_t pos_in_array = get_pos_in_array(id); + /* if (edges.at(array_number)[array_pos].isValid()) { + throw std::runtime_error("Delete existing edge before overwriting it: edge-id " + e.to_string()); + } */ - if (array_number >= edges.size()) - return false; + edges.at(array_number)[array_pos] = e; + number_of_edges++; + } - return edges.at(array_number)[pos_in_array].isValid(); - } + bool exists_edge(const uint64_t id) const override { + uint64_t array_number = get_edge_array_number(id); + uint64_t pos_in_array = get_pos_in_array(id); - Edge get_edge(uint64_t id) override { - uint64_t array_number = get_edge_array_number(id); - uint64_t pos_in_array = get_pos_in_array(id); + if (array_number >= edges.size()) + return false; - assert (array_number < edges.size()); + return edges.at(array_number)[pos_in_array].isValid(); + } - return edges.at(array_number)[pos_in_array]; - } + Edge get_edge(uint64_t id) override { + uint64_t array_number = get_edge_array_number(id); + uint64_t pos_in_array = get_pos_in_array(id); - uint64_t edge_count() const override { - return number_of_edges; - } + assert(array_number < edges.size()); - std::pair get_size() const override { - auto [index_size, data_size] = EdgesContainer::get_size(); + return edges.at(array_number)[pos_in_array]; + } - // vector count, current_array_offset - index_size += 2 * sizeof(uint64_t); + uint64_t edge_count() const override { return number_of_edges; } - index_size += sizeof(std::vector); - // allocated memory for edges - data_size += edges.size() * sizeof(edge_array); - - return {index_size, data_size}; - } + std::pair get_size() const override { + auto [index_size, data_size] = EdgesContainer::get_size(); + + // vector count, current_array_offset + index_size += 2 * sizeof(uint64_t); + + index_size += sizeof(std::vector); + // allocated memory for edges + data_size += edges.size() * sizeof(edge_array); + + return {index_size, data_size}; + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_EDGES_VECTORARRAY_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 4e4ac705..c8654cf1 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -19,37 +19,33 @@ * @file adjacencylist.h * @brief Derived adj. list storage format class. Base: graph.h * @todo Adjust get_size_of_graph(), ?replace unordered_map with a fixed sized array -*/ + */ #ifndef MORPHSTORE_ADJACENCYLIST_H #define MORPHSTORE_ADJACENCYLIST_H -#include #include +#include #include -#include #include -#include +#include #include +#include -namespace morphstore{ +namespace morphstore { - class AdjacencyList: public Graph { + class AdjacencyList : public Graph { private: // const column as after finalized only read_only - using adjacency_column = column_base*; - using adjacency_vector = std::vector*; + using adjacency_column = column_base *; + using adjacency_vector = std::vector *; using adjacency_list_variant = std::variant; struct Adjacency_List_Size_Visitor { - size_t operator()(const adjacency_column c) const { - return c->get_size_used_byte(); - } - size_t operator()(const adjacency_vector v) const { - return v->size() * sizeof(uint64_t); - } + size_t operator()(const adjacency_column c) const { return c->get_size_used_byte(); } + size_t operator()(const adjacency_vector v) const { return v->size() * sizeof(uint64_t); } }; struct Adjacency_List_OutDegree_Visitor { @@ -57,9 +53,7 @@ namespace morphstore{ // assuming compressed col has the same value count (would not work for RLE) return c->get_count_values(); } - uint64_t operator()(const adjacency_vector v) const { - return v->size(); - } + uint64_t operator()(const adjacency_vector v) const { return v->size(); } }; // maps the a list of outgoing edges (ids) to a vertex-id @@ -81,10 +75,8 @@ namespace morphstore{ auto adj_vector = std::get(adj_list); // this allows adding new edges to smaller adj_lists (even after morphing) if (adj_vector->size() >= min_compr_degree) { - auto adj_col = const_cast(make_column( - adj_vector->data(), - adj_vector->size(), - true)); + auto adj_col = + const_cast(make_column(adj_vector->data(), adj_vector->size(), true)); (*adjacencylistPerVertex)[id] = adj_col; @@ -111,14 +103,13 @@ namespace morphstore{ public: ~AdjacencyList() { - for(auto [id, adj_list]: *this->adjacencylistPerVertex) { - if (std::holds_alternative(adj_list)) { - delete std::get(adj_list); - } - else { - free(std::get(adj_list)); - } - + for (auto [id, adj_list] : *this->adjacencylistPerVertex) { + if (std::holds_alternative(adj_list)) { + delete std::get(adj_list); + } else { + free(std::get(adj_list)); + } + delete adjacencylistPerVertex; } } @@ -129,9 +120,7 @@ namespace morphstore{ AdjacencyList(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} - std::string get_storage_format() const override { - return "Adjacency_List"; - } + std::string get_storage_format() const override { return "Adjacency_List"; } // function: to set graph allocations void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { @@ -153,7 +142,7 @@ namespace morphstore{ // avoid inserting an empty adjacencyVector (waste of memory) if (edgesToAdd.size() == 0) { - return ; + return; } std::vector *adjacencyVector; @@ -184,8 +173,7 @@ namespace morphstore{ auto entry = adjacencylistPerVertex->find(id); if (entry == adjacencylistPerVertex->end()) { return 0; - } - else { + } else { return std::visit(Adjacency_List_OutDegree_Visitor{}, entry->second); } } @@ -200,7 +188,7 @@ namespace morphstore{ auto uncompr_col = decompress_adjacency_column(std::get(adj_list)); const size_t column_size = uncompr_col->get_count_values(); // TODO: init vector via range-constructor / mem-cpy - //const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; + // const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; const uint64_t *start_addr = uncompr_col->get_data(); edge_ids.insert(edge_ids.end(), start_addr, start_addr + column_size); @@ -234,26 +222,24 @@ namespace morphstore{ this->finalize(); #if DEBUG - std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) << std::endl; + std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) + << std::endl; auto entry_count = adjacencylistPerVertex->size(); int progress = 0; -# endif +#endif for (auto const [id, adj_list] : *adjacencylistPerVertex) { #if DEBUG if (progress % 10000 == 0) { std::cout << "Compression Progress: " << progress << "/" << entry_count << std::endl; } progress++; -# endif +#endif // const_cast needed as map-value is not constant if (std::visit(Adjacency_List_OutDegree_Visitor{}, adj_list) >= min_compr_degree) { auto old_adj_col = std::get(adj_list); - (*adjacencylistPerVertex)[id] = const_cast(morph_graph_col( - old_adj_col, - current_compression, - target_format, - true)); + (*adjacencylistPerVertex)[id] = const_cast( + morph_graph_col(old_adj_col, current_compression, target_format, true)); } } @@ -290,7 +276,7 @@ namespace morphstore{ } } - return (double) column_count / getEdgeCount(); + return (double)column_count / getEdgeCount(); } // for measuring the size in bytes: @@ -301,7 +287,7 @@ namespace morphstore{ index_size += sizeof(std::unordered_map); index_size += adjacencylistPerVertex->size() * (sizeof(uint64_t) + sizeof(adjacency_list_variant)); - for(const auto [id, adj_list] : *adjacencylistPerVertex){ + for (const auto [id, adj_list] : *adjacencylistPerVertex) { data_size += std::visit(Adjacency_List_Size_Visitor{}, adj_list); } @@ -309,14 +295,13 @@ namespace morphstore{ } // for debugging: print neighbors a vertex - void print_neighbors_of_vertex(uint64_t id) override{ + void print_neighbors_of_vertex(uint64_t id) override { std::cout << std::endl << "Neighbours for Vertex with id " << id << std::endl; auto edge_ids = get_outgoing_edge_ids(id); - if(edge_ids.size() == 0) { + if (edge_ids.size() == 0) { std::cout << " No outgoing edges for vertex with id: " << id << std::endl; - } - else { + } else { for (const auto edge_id : edge_ids) { print_edge_by_id(edge_id); } @@ -330,10 +315,8 @@ namespace morphstore{ std::cout << "Compression ratio:" << compr_ratio() << std::endl; std::cout << "--------------------------------------------" << std::endl; std::cout << std::endl << std::endl; - } - }; -} +} // namespace morphstore -#endif //MORPHSTORE_ADJACENCYLIST_H +#endif // MORPHSTORE_ADJACENCYLIST_H diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index bfa2cebb..a91dec05 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -19,19 +19,19 @@ * @file csr.h * @brief Derived CSR storage format class. Base: graph.h * @todo Edge_value_array should only store edge-ids (not whole objects) -*/ + */ #ifndef MORPHSTORE_CSR_H #define MORPHSTORE_CSR_H #include -#include #include +#include -namespace morphstore{ +namespace morphstore { - class CSR: public Graph{ + class CSR : public Graph { private: /* graph topology: @@ -53,11 +53,10 @@ namespace morphstore{ CSR(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer) : Graph(vertices_container_type) {} - std::string get_storage_format() const override { - return "CSR"; - } + std::string get_storage_format() const override { return "CSR"; } - // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph topology arrays + // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph + // topology arrays // TODO: test that no data exists before (as this will get overwritten) void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); @@ -71,7 +70,7 @@ namespace morphstore{ edgeId_column->set_meta_data(numberEdges, edge_ids_size); // init node array: - uint64_t* offset_data = offset_column->get_data(); + uint64_t *offset_data = offset_column->get_data(); offset_data[0] = 0; } @@ -84,18 +83,18 @@ namespace morphstore{ // every vertex id contains a list of its neighbors void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { // TODO: throw error if not in order of vertex-ids ASC inserted (currently will only produce rubbish data) - // TODO: handle if sourceIDs are skipped - // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and insert offsets for skipped vertices - assert(expectedEdgeCount >= getEdgeCount()+edgesToAdd.size()); + // TODO: handle if sourceIDs are skipped + // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and + // insert offsets for skipped vertices + assert(expectedEdgeCount >= getEdgeCount() + edgesToAdd.size()); // currently only read-only if compressed if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { - throw std::runtime_error( - "Edge insertion only allowed in uncompressed format. Current format: " + - graph_compr_f_to_string(current_compression)); + throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + + graph_compr_f_to_string(current_compression)); } - uint64_t* offset_data = offset_column->get_data(); + uint64_t *offset_data = offset_column->get_data(); uint64_t offset = offset_data[sourceID]; uint64_t nextOffset = offset + edgesToAdd.size(); @@ -105,9 +104,9 @@ namespace morphstore{ // fill the arrays // TODO: fill array using memcpy? (put edgeIds into vector as prepare step) - uint64_t* edgeId_data = edgeId_column->get_data(); - for(const auto& edge : edgesToAdd){ - if(!vertices->exists_vertex(edge.getTargetId())) { + uint64_t *edgeId_data = edgeId_column->get_data(); + for (const auto &edge : edgesToAdd) { + if (!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found " + edge.to_string()); } edgeId_data[offset] = edge.getId(); @@ -116,8 +115,8 @@ namespace morphstore{ } // to avoid buffer overflow: - if(sourceID < getExpectedVertexCount()-1){ - offset_data[sourceID+1] = nextOffset; + if (sourceID < getExpectedVertexCount() - 1) { + offset_data[sourceID + 1] = nextOffset; } } @@ -126,16 +125,16 @@ namespace morphstore{ // decompressing offset_column in order to read correct offset // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); - uint64_t* offset_data = uncompr_offset_col->get_data(); + uint64_t *offset_data = uncompr_offset_col->get_data(); uint64_t offset = offset_data[id]; uint64_t nextOffset; // special case: last vertex id has no next offset - if(id == getVertexCount() -1){ + if (id == getVertexCount() - 1) { nextOffset = getEdgeCount(); - }else{ - nextOffset = offset_data[id+1]; + } else { + nextOffset = offset_data[id + 1]; } // deleting temporary column @@ -165,7 +164,8 @@ namespace morphstore{ void morph(GraphCompressionFormat target_format) override { #if DEBUG std::cout << "Morphing graph format specific data structures from " - << graph_compr_f_to_string(current_compression) << " to " << graph_compr_f_to_string(target_format) << std::endl; + << graph_compr_f_to_string(current_compression) << " to " + << graph_compr_f_to_string(target_format) << std::endl; #endif if (current_compression == target_format) { #if DEBUG @@ -174,15 +174,17 @@ namespace morphstore{ return; } - offset_column = const_cast(morph_graph_col(offset_column, current_compression, target_format, true)); - edgeId_column = const_cast(morph_graph_col(edgeId_column, current_compression, target_format, true)); + offset_column = + const_cast(morph_graph_col(offset_column, current_compression, target_format, true)); + edgeId_column = + const_cast(morph_graph_col(edgeId_column, current_compression, target_format, true)); this->current_compression = target_format; } // get size of storage format: std::pair get_size_of_graph() const override { - + auto [index_size, data_size] = Graph::get_size_of_graph(); index_size += edgeId_column->get_size_used_byte(); @@ -216,20 +218,16 @@ namespace morphstore{ return out_edge_ids; } - double offset_column_compr_ratio() { - return compression_ratio(offset_column, current_compression); - } + double offset_column_compr_ratio() { return compression_ratio(offset_column, current_compression); } - double edgeId_column_compr_ratio() { - return compression_ratio(edgeId_column, current_compression); - } + double edgeId_column_compr_ratio() { return compression_ratio(edgeId_column, current_compression); } // for debugging: // TODO: simply by using a get_outgoing_edges(id) method - void print_neighbors_of_vertex(uint64_t id) override{ + void print_neighbors_of_vertex(uint64_t id) override { std::cout << "Neighbours for Vertex with id " << id << std::endl; - for(auto const edge_id: get_outgoing_edge_ids(id)){ + for (auto const edge_id : get_outgoing_edge_ids(id)) { print_edge_by_id(edge_id); } } @@ -248,5 +246,5 @@ namespace morphstore{ std::cout << std::endl << std::endl; } }; -} -#endif //MORPHSTORE_CSR_H +} // namespace morphstore +#endif // MORPHSTORE_CSR_H diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index b27578ac..663de5d2 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -19,35 +19,34 @@ * @file graph.h * @brief base graph class for any storage format --> CSR,ADJ * @todo -*/ + */ #ifndef MORPHSTORE_GRAPH_H #define MORPHSTORE_GRAPH_H -#include "vertex/vertex.h" -#include "vertex/vertices_hashmap_container.h" -#include "vertex/vertices_vectorarray_container.h" #include "edge/edge.h" #include "edge/edges_hashmap_container.h" #include "edge/edges_vectorarray_container.h" #include "property_type.h" +#include "vertex/vertex.h" +#include "vertex/vertices_hashmap_container.h" +#include "vertex/vertices_vectorarray_container.h" #include -#include -#include -#include -#include #include +#include +#include #include +#include +#include #include -#include #include -#include - +#include +#include -namespace morphstore{ +namespace morphstore { - class Graph{ + class Graph { protected: GraphCompressionFormat current_compression = GraphCompressionFormat::UNCOMPRESSED; @@ -83,98 +82,85 @@ namespace morphstore{ } } - std::string vertices_container_description() { - return vertices->container_description(); - } + std::string vertices_container_description() { return vertices->container_description(); } - std::string edges_container_description() { - return edges->container_description(); - } + std::string edges_container_description() { return edges->container_description(); } // -------------------- Setters & Getters -------------------- - void set_vertex_type_dictionary(const std::map& types) { + void set_vertex_type_dictionary(const std::map &types) { assert(types.size() != 0); this->vertices->set_vertex_type_dictionary(types); } - void setEdgeTypeDictionary(const std::map& types) { + void setEdgeTypeDictionary(const std::map &types) { assert(types.size() != 0); this->edges->set_edge_type_dictionary(types); } - uint64_t getExpectedVertexCount() const { - return expectedVertexCount; - } + uint64_t getExpectedVertexCount() const { return expectedVertexCount; } - uint64_t getVertexCount() const { - return vertices->vertex_count(); - } + uint64_t getVertexCount() const { return vertices->vertex_count(); } - uint64_t getExpectedEdgeCount() const { - return expectedEdgeCount; - } + uint64_t getExpectedEdgeCount() const { return expectedEdgeCount; } - uint64_t getEdgeCount() const { - return edges->edge_count(); - } + uint64_t getEdgeCount() const { return edges->edge_count(); } - uint64_t add_vertex(const unsigned short int type = 0, const std::unordered_map props = {}) { + uint64_t add_vertex(const unsigned short int type = 0, + const std::unordered_map props = {}) { return vertices->add_vertex(type, props); }; // function which returns a pointer to vertex by id - VertexWithProperties get_vertex(uint64_t id){ - return vertices->get_vertex_with_properties(id); - } + VertexWithProperties get_vertex(uint64_t id) { return vertices->get_vertex_with_properties(id); } // function which returns a pointer to edge by id - EdgeWithProperties get_edge(uint64_t id){ - return edges->get_edge_with_properties(id); - } + EdgeWithProperties get_edge(uint64_t id) { return edges->get_edge_with_properties(id); } - // function to return a list of pair < vertex id, degree > DESC: + // function to return a list of pair < vertex id, degree > DESC: // TODO: move into seperate header and use graph as input parameter - std::vector> get_list_of_degree_DESC(){ + std::vector> get_list_of_degree_DESC() { std::vector> vertexDegreeList; vertexDegreeList.reserve(getVertexCount()); // fill the vector with every vertex key and his degree - for(uint64_t i = 0; i < getVertexCount(); ++i){ -/* if (i % 1000 == 0) { - std::cout << "Degree-List - Current Progress" << i << "/" << getVertexCount() << std::endl; - } */ + for (uint64_t i = 0; i < getVertexCount(); ++i) { + /* if (i % 1000 == 0) { + std::cout << "Degree-List - Current Progress" << i << "/" << getVertexCount() << + std::endl; + } */ vertexDegreeList.push_back({i, this->get_out_degree(i)}); } // sort the vector on degree DESC - std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), [](const std::pair &left, const std::pair &right) { - return left.second > right.second; - }); + std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), + [](const std::pair &left, const std::pair &right) { + return left.second > right.second; + }); return vertexDegreeList; } // function to measure graph characteristics (degree and count): // TODO: move into seperate header and use graph as input parameter - void measure_degree_count(std::string filePath){ + void measure_degree_count(std::string filePath) { std::vector> verticesDegree = get_list_of_degree_DESC(); // unordered map for mapping degree to count: - std::unordered_map results; + std::unordered_map results; - for(uint64_t i = 0; i < verticesDegree.size(); ++i){ - // increment count in results for a given degree: - results[verticesDegree[i].second]++; - } + for (uint64_t i = 0; i < verticesDegree.size(); ++i) { + // increment count in results for a given degree: + results[verticesDegree[i].second]++; + } - // write to file: - std::ofstream fs; + // write to file: + std::ofstream fs; std::stringstream ss; // open file for writing and delete existing stuff: fs.open(filePath, std::fstream::out | std::ofstream::trunc); - for(auto const& m : results){ + for (auto const &m : results) { ss << m.first << "," << m.second << "\n"; } - fs << ss.str() ; + fs << ss.str(); fs.close(); } @@ -195,7 +181,7 @@ namespace morphstore{ virtual uint64_t get_out_degree(uint64_t id) = 0; virtual std::vector get_neighbors_ids(uint64_t id) = 0; - virtual std::pair get_size_of_graph() const { + virtual std::pair get_size_of_graph() const { // including vertices + its properties + its type dict auto [index_size, data_size] = vertices->get_size(); @@ -210,7 +196,7 @@ namespace morphstore{ virtual void allocate_graph_structure(uint64_t expected_vertices, uint64_t expected_edges) { this->expectedVertexCount = expected_vertices; this->expectedEdgeCount = expected_edges; - + vertices->allocate(expected_vertices); edges->allocate(expected_edges); }; @@ -220,10 +206,11 @@ namespace morphstore{ // for debugging virtual void print_neighbors_of_vertex(uint64_t id) = 0; - virtual void statistics(){ + virtual void statistics() { std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getVertexCount() << std::endl; - std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() << std::endl; + std::cout << "Number of vertices with properties:" << vertices->vertices_with_properties_count() + << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edges->edges_with_properties_count() << std::endl; std::cout << "Compression Format:" << graph_compr_f_to_string(current_compression) << std::endl; @@ -237,18 +224,14 @@ namespace morphstore{ std::cout << "-----------------------------------------------" << std::endl; } - void print_edge_by_id(uint64_t id) { - edges->print_edge_by_id(id); - } + void print_edge_by_id(uint64_t id) { edges->print_edge_by_id(id); } - void print_type_dicts(){ + void print_type_dicts() { vertices->print_type_dict(); edges->print_type_dict(); } - }; -} - +} // namespace morphstore -#endif //MORPHSTORE_GRAPH_H +#endif // MORPHSTORE_GRAPH_H diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index ab5065a1..2b5a0d38 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -19,23 +19,23 @@ * @file graph_compr_format.h * @brief helper for specifying compression of graph format specific columns * @todo -*/ + */ #ifndef MORPHSTORE_GRAPH_COMPR_FORMAT_H #define MORPHSTORE_GRAPH_COMPR_FORMAT_H -#include -#include -#include -#include -#include #include #include +#include #include +#include +#include +#include +#include #include -namespace morphstore{ +namespace morphstore { // TODO: allow also other vector extensions (switch from safe_morph to morph) // example layout: dynamic_vbp_f<512, 32, 8> using ve = vectorlib::scalar>; @@ -44,8 +44,8 @@ namespace morphstore{ using column_uncompr = column; using column_delta = column; using column_for = column; - - enum class GraphCompressionFormat {DELTA, FOR, UNCOMPRESSED}; + + enum class GraphCompressionFormat { DELTA, FOR, UNCOMPRESSED }; std::string graph_compr_f_to_string(GraphCompressionFormat format) { std::string desc; @@ -67,7 +67,8 @@ namespace morphstore{ // casting the column to the actual column type before morphing (as compiler could not derive it) // delete_old_col -> delete input column after morphing (if the result is not the input column) - const column_base* morph_graph_col(const column_base* column, const GraphCompressionFormat src_f, const GraphCompressionFormat trg_f, bool delete_in_col = false) { + const column_base *morph_graph_col(const column_base *column, const GraphCompressionFormat src_f, + const GraphCompressionFormat trg_f, bool delete_in_col = false) { if (src_f == trg_f) { return column; } @@ -94,15 +95,11 @@ namespace morphstore{ if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { const column_delta *old_col = dynamic_cast(column); result = morph(old_col); - } - else { + } else { // as direct morphing is not yet supported .. go via decompressing first auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); - result = morph_graph_col( - uncompr_col, - GraphCompressionFormat::UNCOMPRESSED, - trg_f); - + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f); + delete uncompr_col; } break; @@ -111,39 +108,38 @@ namespace morphstore{ if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { const column_for *old_col = dynamic_cast(column); result = morph(old_col); - } - else { + } else { // as direct morphing is not yet supported .. go via decompressing first auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); - result = morph_graph_col( - uncompr_col, - GraphCompressionFormat::UNCOMPRESSED, - trg_f); + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f); delete uncompr_col; } break; } } - if (result != column && delete_in_col){ + if (result != column && delete_in_col) { delete column; } if (result == nullptr) { - throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + " trg: " + graph_compr_f_to_string(trg_f)); + throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + + " trg: " + graph_compr_f_to_string(trg_f)); } - return result; + return result; } - const column_uncompr* decompress_graph_col(const column_base* column, const GraphCompressionFormat src_f, bool delete_in_col = false) { - return static_cast(morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col)); + const column_uncompr *decompress_graph_col(const column_base *column, const GraphCompressionFormat src_f, + bool delete_in_col = false) { + return static_cast( + morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col)); } - double compression_ratio(const column_base* column, GraphCompressionFormat col_format) { - // TODO: need to delete decompressed_col? - return decompress_graph_col(column, col_format)->get_size_used_byte() / (double) column->get_size_used_byte(); + double compression_ratio(const column_base *column, GraphCompressionFormat col_format) { + // TODO: need to delete decompressed_col? + return decompress_graph_col(column, col_format)->get_size_used_byte() / (double)column->get_size_used_byte(); } -} +} // namespace morphstore -#endif //MORPHSTORE_GRAPH_COMPR_FORMAT_H \ No newline at end of file +#endif // MORPHSTORE_GRAPH_COMPR_FORMAT_H \ No newline at end of file diff --git a/include/core/storage/graph/importer/ldbc_import.h b/include/core/storage/graph/importer/ldbc_import.h index 8f53de69..7dbcc9a3 100644 --- a/include/core/storage/graph/importer/ldbc_import.h +++ b/include/core/storage/graph/importer/ldbc_import.h @@ -19,42 +19,37 @@ * @file ldbc_import.h * @brief this class reads the ldbc files and generates the graph in CSR or AdjList * @todo support for array properties (for simplicity only last one take currently) -*/ + */ #ifndef MORPHSTORE_LDBC_IMPORT_H #define MORPHSTORE_LDBC_IMPORT_H -#include #include "ldbc_schema.h" +#include +#include +#include #include -#include -#include #include -#include #include -#include +#include #include -#include -#include -#include #include -#include - - +#include +#include +#include +#include // hash function used to hash a pair of any kind using XOR (for verticesMap) struct hash_pair { - template - size_t operator()(const std::pair& p) const - { + template size_t operator()(const std::pair &p) const { auto hash1 = std::hash{}(p.first); auto hash2 = std::hash{}(p.second); return hash1 ^ hash2; } }; -namespace morphstore{ +namespace morphstore { class LDBCImport { @@ -67,9 +62,10 @@ namespace morphstore{ // data structure for lookup local ids with vertexType to global system id: (vertexType, ldbc_id) -> global id std::unordered_map, uint64_t, hash_pair> globalIdLookupMap; - // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array in the right order) + // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array + // in the right order) std::unordered_map> vertexEdgesLookup; - std::unordered_map> edgeProperties; + std::unordered_map> edgeProperties; public: // directory including a static/ and dynamic/ directory like in /ldbc_snb_datagen/social_network/ @@ -78,37 +74,33 @@ namespace morphstore{ insert_file_names(); } - std::string getDirectory() const { - return base_directory; - } + std::string getDirectory() const { return base_directory; } // get the vertex or edge type based on the fileName std::string getEntityType(std::filesystem::path filePath) { - // last [a-zA-Z] to remove ending _ - std::regex typeRegExp("[a-zA-Z_]+[a-zA-Z]"); - std::smatch match; + // last [a-zA-Z] to remove ending _ + std::regex typeRegExp("[a-zA-Z_]+[a-zA-Z]"); + std::smatch match; - std::string fileName = filePath.filename().string(); + std::string fileName = filePath.filename().string(); - if(std::regex_search(fileName, match, typeRegExp)) { - //std::cout << "EntityType: " << match[0] << std::endl; - //std::cout.flush(); - return match[0]; - } - else { - throw std::invalid_argument("No EntityType in: " + fileName); - } + if (std::regex_search(fileName, match, typeRegExp)) { + // std::cout << "EntityType: " << match[0] << std::endl; + // std::cout.flush(); + return match[0]; + } else { + throw std::invalid_argument("No EntityType in: " + fileName); + } } - // function which iterates through the base_directory to receive file names (entire path) void insert_file_names() { - - std::filesystem::path dynamic_data_dir (base_directory / "dynamic"); - std::filesystem::path static_data_dir (base_directory / "static"); - std::vector dirs{dynamic_data_dir, static_data_dir}; - for(const auto dir: dirs) { + std::filesystem::path dynamic_data_dir(base_directory / "dynamic"); + std::filesystem::path static_data_dir(base_directory / "static"); + std::vector dirs{dynamic_data_dir, static_data_dir}; + + for (const auto dir : dirs) { for (const auto &entry : std::filesystem::directory_iterator(dir)) { // ignore files starting with a '.' (+ 1 as '/' is the first character otherwise) if (entry.path().string()[dir.u8string().length() + 1] == '.') { @@ -120,7 +112,7 @@ namespace morphstore{ } } - if(verticesPaths.empty()) { + if (verticesPaths.empty()) { print_file_names(); throw std::invalid_argument("No vertex files found"); } @@ -144,13 +136,12 @@ namespace morphstore{ // this function reads the vertices-files and creates vertices in a graph // + creates the vertexTypeLookup (number to string) for the graph - void generate_vertices(Graph& graph) { + void generate_vertices(Graph &graph) { std::cout << "(1/2) Generating LDBC-Vertices ..."; std::cout.flush(); // iterate through vector of vertex-addresses - for (const auto &file : verticesPaths) - { + for (const auto &file : verticesPaths) { // data structure for attributes of entity, e.g. taglass -> id, name, url std::vector> attributes; @@ -163,8 +154,9 @@ namespace morphstore{ std::string address = file; - std::ifstream vertexFile(address, std::ios::binary | - std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream vertexFile( + address, + std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!vertexFile) { std::cerr << "Error, opening file. "; @@ -173,11 +165,13 @@ namespace morphstore{ // calculate file size if (vertexFile.is_open()) { - // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. - fileSize = static_cast(vertexFile.tellg()); + // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) + // on failure. + fileSize = static_cast(vertexFile.tellg()); vertexFile.clear(); - // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) - vertexFile.seekg(0, std::ios::beg); + // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file + // bit) + vertexFile.seekg(0, std::ios::beg); } // allocate memory @@ -187,16 +181,13 @@ namespace morphstore{ std::string delimiter = "|"; // read buffer and do the magic ... - for (size_t i = 0; i < fileSize; ++i) - { - if (buffer[i] == '\n') - { + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); // remove unnecessary '\n' at the beginning of a string - if (row.find('\n') != std::string::npos) - { + if (row.find('\n') != std::string::npos) { row.erase(0, 1); } @@ -204,17 +195,17 @@ namespace morphstore{ size_t next = 0; // first line of *.csv contains the attributes -> write to attributes vector - if (start == 0) - { + if (start == 0) { std::string property_key; Ldbc_Data_Type data_type; - // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to attributes vector - while ((next = row.find(delimiter, last)) != std::string::npos) - { + // extract attribute from delimiter, e.g. id|name|url to id,name,url and push back to + // attributes vector + while ((next = row.find(delimiter, last)) != std::string::npos) { property_key = row.substr(last, next - last); data_type = get_data_type(vertexType, property_key); if (data_type == Ldbc_Data_Type::ERROR) { - throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + " could not be found in schema"); + throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + + " could not be found in schema"); } attributes.push_back(std::make_pair(property_key, data_type)); last = next + 1; @@ -223,27 +214,27 @@ namespace morphstore{ property_key = row.substr(last); data_type = get_data_type(vertexType, property_key); if (data_type == Ldbc_Data_Type::ERROR) { - throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + " could not be found in schema"); + throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + + " could not be found in schema"); } attributes.push_back(std::make_pair(property_key, data_type)); - } - else - { + } else { // actual data: std::unordered_map properties; size_t attrIndex = 0; std::string ldbcID = row.substr(0, row.find(delimiter)); - while ((next = row.find(delimiter, last)) != std::string::npos) - { + while ((next = row.find(delimiter, last)) != std::string::npos) { auto key_to_datatype = attributes[attrIndex]; - property_type property_value = convert_property_value(row.substr(last, next - last), key_to_datatype.second); + property_type property_value = + convert_property_value(row.substr(last, next - last), key_to_datatype.second); properties.insert(std::make_pair(key_to_datatype.first, property_value)); last = next + 1; ++attrIndex; } // last attribute auto key_to_datatype = attributes[attrIndex]; - property_type propertyValue = convert_property_value(row.substr(last), key_to_datatype.second); + property_type propertyValue = + convert_property_value(row.substr(last), key_to_datatype.second); properties.insert(std::make_pair(key_to_datatype.first, propertyValue)); //----------------------------------------------------- @@ -303,7 +294,6 @@ namespace morphstore{ for (const auto &rel : edgesPaths) { std::cout << "\t" << rel << std::endl; } - } // function which clears all intermediates after import @@ -318,7 +308,8 @@ namespace morphstore{ vertexEdgesLookup.clear(); } - // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because of the vertexType lookup creation) + // function which returns the total number of edges (IMPORTANT: vertex generation has to be done first, because + // of the vertexType lookup creation) uint64_t get_total_number_edges() { uint64_t result = 0; @@ -329,7 +320,6 @@ namespace morphstore{ for (const auto &file : edgesPaths) { std::string edge_type = getEntityType(file); - // TOdo: use regExp ([a-zA-Z]+)_([a-zA-Z]+)_([a-zA-Z]+) std::string sourceVertexType = edge_type.substr(0, edge_type.find('_')); edge_type.erase(0, edge_type.find('_') + 1); @@ -343,8 +333,9 @@ namespace morphstore{ uint64_t fileSize = 0; - std::ifstream edgeFile(file, std::ios::binary | - std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream edgeFile( + file, std::ios::binary | + std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!edgeFile) { std::cerr << "Error, opening file. "; @@ -353,13 +344,16 @@ namespace morphstore{ // calculate file size if (edgeFile.is_open()) { - fileSize = static_cast(edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast( + edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the + // stream on success, pos_type(-1) on failure. edgeFile.clear(); - edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail + // bits first (such as the end-of-file bit) } // allocate memory - buffer = (char *) malloc(fileSize * sizeof(char)); + buffer = (char *)malloc(fileSize * sizeof(char)); edgeFile.read(buffer, fileSize); // read data as one big block bool firstLine = true; @@ -376,12 +370,10 @@ namespace morphstore{ } } } - } free(buffer); // free memory edgeFile.close(); - } } return result; @@ -396,8 +388,9 @@ namespace morphstore{ for (const auto &file : verticesPaths) { char *buffer; uint64_t fileSize = 0; - std::ifstream vertexFile(file, std::ios::binary | - std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream vertexFile( + file, + std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!vertexFile) { std::cerr << "Error, opening file. "; @@ -406,9 +399,12 @@ namespace morphstore{ // calculate file size if (vertexFile.is_open()) { - fileSize = static_cast(vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast( + vertexFile.tellg()); // tellg() returns: The current position of the get pointer in the stream + // on success, pos_type(-1) on failure. vertexFile.clear(); - // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file + // bit) vertexFile.seekg(0, std::ios::beg); } @@ -420,8 +416,7 @@ namespace morphstore{ // read buffer and do the magic ... for (size_t i = 0; i < fileSize; ++i) { - if (buffer[i] == '\n') - { + if (buffer[i] == '\n') { // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); @@ -448,18 +443,19 @@ namespace morphstore{ // this function reads the edge-files and fills the intermediate: vertexEdgeLookup // + creates the edgeLookup (number to string) for the graph - void fill_vertexEdgesLookup(Graph& graph){ + void fill_vertexEdgesLookup(Graph &graph) { - if(!edgesPaths.empty()) { + if (!edgesPaths.empty()) { std::cout << "(2/2) Generating LDBC-Edges ..."; std::cout.flush(); - //this variable is used for the edgeLookup-keys, starting by 0 + // this variable is used for the edgeLookup-keys, starting by 0 unsigned short int edgeTypeNumber = 0; // iterate through vector of vertex-addresses for (const auto &file : edgesPaths) { - // get the edge-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> person_likes_comment + // get the edge-infos from file name: e.g. ([...path...] / [person_likes_comment].csv) --> + // person_likes_comment // TODO: use regExp std::string edge_type = getEntityType(file); std::string sourceVertexType = edge_type.substr(0, edge_type.find('_')); @@ -470,11 +466,13 @@ namespace morphstore{ std::string targetVertexType = edge_type; - char* buffer; + char *buffer; uint64_t fileSize = 0; - std::ifstream edgeFile(file, std::ios::binary | std::ios::ate); // 'ate' means: open and seek to end immediately after opening + std::ifstream edgeFile( + file, std::ios::binary | + std::ios::ate); // 'ate' means: open and seek to end immediately after opening if (!edgeFile) { std::cerr << "Error, opening file. "; @@ -483,47 +481,54 @@ namespace morphstore{ // calculate file size if (edgeFile.is_open()) { - fileSize = static_cast(edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the stream on success, pos_type(-1) on failure. + fileSize = static_cast( + edgeFile.tellg()); // tellg() returns: The current position of the get pointer in the + // stream on success, pos_type(-1) on failure. edgeFile.clear(); - edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail bits first (such as the end-of-file bit) + edgeFile.seekg(0, std::ios::beg); // Seeks to the very beginning of the file, clearing any fail + // bits first (such as the end-of-file bit) } // allocate memory - buffer = (char*) malloc( fileSize * sizeof( char ) ); + buffer = (char *)malloc(fileSize * sizeof(char)); edgeFile.read(buffer, fileSize); // read data as one big block size_t start = 0; std::string delimiter = "|"; // check from file name whether it's an edge file or multi value attribute file - if(!get_vertex_type_number(targetVertexType).has_value()) { + if (!get_vertex_type_number(targetVertexType).has_value()) { // Multi-value-attributes: just take the last recently one std::string propertyKey; Ldbc_Data_Type data_type = Ldbc_Data_Type::STRING; std::unordered_map multiValueAttr; uint64_t systemID; property_type value; - - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ + + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); + if (row.find('\n') != std::string::npos) { + row.erase(0, 1); } - // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> get 'email' - if(start == 0){ + // first line: get the attribute a.k.a key for the property, e.g. Person.id|email -> + // get 'email' + if (start == 0) { propertyKey = row.substr(row.find(delimiter) + 1); - data_type = get_data_type(sourceVertexType ,propertyKey); + data_type = get_data_type(sourceVertexType, propertyKey); if (data_type == Ldbc_Data_Type::ERROR) - throw std::invalid_argument(file.string() + ":" + edgeType + ":" + propertyKey + " could not be found in schema"); - - }else{ - // (1) write data to vector: if key is already present, over write value (simplicity: we take the newest one) - systemID = globalIdLookupMap[{sourceVertexType, row.substr(0, row.find(delimiter))}]; + throw std::invalid_argument(file.string() + ":" + edgeType + ":" + + propertyKey + " could not be found in schema"); + + } else { + // (1) write data to vector: if key is already present, over write value + // (simplicity: we take the newest one) + systemID = + globalIdLookupMap[{sourceVertexType, row.substr(0, row.find(delimiter))}]; value = convert_property_value(row.substr(row.find(delimiter) + 1), data_type); multiValueAttr[systemID] = std::move(value); } @@ -532,33 +537,33 @@ namespace morphstore{ } } // iterate through multiValue map and assign property to vertex - for(const auto &pair : multiValueAttr){ - //const std::pair keyValuePair = {propertyKey, pair.second}; + for (const auto &pair : multiValueAttr) { + // const std::pair keyValuePair = {propertyKey, pair.second}; graph.add_property_to_vertex(pair.first, {propertyKey, pair.second}); } } // handling of edge-files ... - else{ + else { // check if the name already exists if (!exist_edge_type_name(edgeType)) { ++edgeTypeNumber; edgeTypeLookup.insert(std::make_pair(edgeTypeNumber, edgeType)); } - + bool hasProperties = false; std::string propertyKey; uint64_t sourceVertexId, targetVertexId; // read buffer and do the magic ... - for(size_t i = 0; i < fileSize; ++i){ - if(buffer[i] == '\n'){ + for (size_t i = 0; i < fileSize; ++i) { + if (buffer[i] == '\n') { // get a row into string form buffer with start- and end-point std::string row(&buffer[start], &buffer[i]); // remove unnecessary '\n' at the beginning of a string - if(row.find('\n') != std::string::npos){ - row.erase(0,1); + if (row.find('\n') != std::string::npos) { + row.erase(0, 1); } size_t last = 0; @@ -568,32 +573,36 @@ namespace morphstore{ // first line of *.csv: Differentiate whether it's // (1) edge without properties: e.g. Person.id|Person.id -> #delimiter = 1 // (2) edge with properties: e.g. Person.id|Person.id|fromDate -> #delimiter = 2 - if(start == 0){ + if (start == 0) { // if there are 2 delimiter ('|') -> edge file with properties - while ((next = row.find(delimiter, last)) != std::string::npos){ + while ((next = row.find(delimiter, last)) != std::string::npos) { last = next + 1; ++count; } - if(count == 2){ + if (count == 2) { hasProperties = true; propertyKey = row.substr(last); } } else { // lines of data: (from_local-ldbc-id), (to_local-ldbc-id) and property // get the system-(global) id's from local ids - sourceVertexId = globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); + sourceVertexId = + globalIdLookupMap.at({sourceVertexType, row.substr(0, row.find(delimiter))}); // remove from id from string row.erase(0, row.find(delimiter) + delimiter.length()); std::string value; - if(!hasProperties){ + if (!hasProperties) { // WITHOUT properties: just from the first delimiter on targetVertexId = globalIdLookupMap.at({targetVertexType, row}); // insert edge into vertexRealtionsLookup: - vertexEdgesLookup[sourceVertexId].push_back(morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber)); - }else{ - // with properties means: toID is until the next delimiter, and then the value for the property - targetVertexId = globalIdLookupMap.at({targetVertexType, row.substr(0, row.find(delimiter))}); + vertexEdgesLookup[sourceVertexId].push_back( + morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber)); + } else { + // with properties means: toID is until the next delimiter, and then the value + // for the property + targetVertexId = globalIdLookupMap.at( + {targetVertexType, row.substr(0, row.find(delimiter))}); row.erase(0, row.find(delimiter) + delimiter.length()); value = row; @@ -619,26 +628,26 @@ namespace morphstore{ // TODO: is this function really needed? // function for sorting the vertexEdgesLookup ASC (needed in CSR) // sorting for every vertex its vector list with target-ids ASC - void sort_VertexEdgesLookup(){ + void sort_VertexEdgesLookup() { // sorting the first element of the pair (target-id) - for(auto &rel: vertexEdgesLookup){ + for (auto &rel : vertexEdgesLookup) { std::sort(rel.second.begin(), rel.second.end()); } } // this function writes the actual data from the intermediate vertexEdgesLookup into the graph - void generate_edges(Graph& graph){ + void generate_edges(Graph &graph) { std::cout << " Writing edges into graph " << std::endl; // firstly, sorting the intermediates with their target IDs ASC sort_VertexEdgesLookup(); uint64_t graphSize = graph.getVertexCount(); - for(uint64_t vertexID = 0; vertexID < graphSize ; ++vertexID){ + for (uint64_t vertexID = 0; vertexID < graphSize; ++vertexID) { auto edges = vertexEdgesLookup[vertexID]; // add edge data: graph.add_edges(vertexID, edges); - for(auto edge: edges) { + for (auto edge : edges) { auto entry = edgeProperties.find(edge.getId()); if (entry != edgeProperties.end()) { graph.set_edge_properties(entry->first, entry->second); @@ -649,14 +658,14 @@ namespace morphstore{ void generate_vertex_type_lookup() { uint64_t vertex_type_number = 0; - for(std::string vertex_file: verticesPaths) { + for (std::string vertex_file : verticesPaths) { vertexTypeLookup.insert(std::make_pair(vertex_type_number, getEntityType(vertex_file))); vertex_type_number++; } } // MAIN IMPORT FUNCTION: see steps in comments - void import(Graph& graph) { + void import(Graph &graph) { std::cout << "Importing LDBC-files into graph ... "; std::cout.flush(); @@ -674,7 +683,7 @@ namespace morphstore{ // (3) generate vertices generate_vertices(graph); - + // (4) read edges and write to intermediate results fill_vertexEdgesLookup(graph); @@ -687,6 +696,6 @@ namespace morphstore{ std::cout << "--> done" << std::endl; } }; -} +} // namespace morphstore -#endif //MORPHSTORE_LDBC_IMPORT_H +#endif // MORPHSTORE_LDBC_IMPORT_H diff --git a/include/core/storage/graph/importer/ldbc_schema.h b/include/core/storage/graph/importer/ldbc_schema.h index 1bbcdeff..d8c1cac6 100644 --- a/include/core/storage/graph/importer/ldbc_schema.h +++ b/include/core/storage/graph/importer/ldbc_schema.h @@ -17,90 +17,84 @@ /** * @file lbc_schema.h - * @brief Schema of the LDBC graph based on https://raw.githubusercontent.com/ldbc/ldbc_snb_docs/dev/figures/schema-comfortable.png + * @brief Schema of the LDBC graph based on + * https://raw.githubusercontent.com/ldbc/ldbc_snb_docs/dev/figures/schema-comfortable.png * @todo -*/ + */ #ifndef MORPHSTORE_LDBC_SCHEMA_H #define MORPHSTORE_LDBC_SCHEMA_H #include -#include #include +#include #include -namespace morphstore{ - - enum class Ldbc_Data_Type {LONG_STRING, STRING, TEXT, INT_32, ID, DATE_TIME, DATE, ERROR}; +namespace morphstore { + + enum class Ldbc_Data_Type { LONG_STRING, STRING, TEXT, INT_32, ID, DATE_TIME, DATE, ERROR }; // static not included -> f.i. hasTag edge seen as property tag.id - static const std::map> ldbc_schema { - { - // vertices - {"person", { - {"creationDate", Ldbc_Data_Type::DATE_TIME}, - {"firstName", Ldbc_Data_Type::STRING}, - {"lastName", Ldbc_Data_Type::STRING}, - {"gender", Ldbc_Data_Type::STRING}, - {"birthday", Ldbc_Data_Type::DATE}, - // !TODO actually an array of emails - {"email", Ldbc_Data_Type::LONG_STRING}, - // !TODO actually an array of languages - - // (and not currently filled as csv header contains "language") - //{"speaks", Ldbc_Data_Type::STRING}, - // TODO actually values for "speaks" array - {"language", Ldbc_Data_Type::STRING}, - {"browserUsed", Ldbc_Data_Type::STRING}, - {"locationIP", Ldbc_Data_Type::STRING}}}, - {"forum", { - {"creationDate", Ldbc_Data_Type::DATE_TIME}, - {"title", Ldbc_Data_Type::LONG_STRING}}}, - {"post", { - {"creationDate", Ldbc_Data_Type::DATE_TIME}, - {"browserUsed", Ldbc_Data_Type::STRING}, - {"locationIP", Ldbc_Data_Type::STRING}, - {"length", Ldbc_Data_Type::INT_32}, - // TODO: extra nullable type for the following 3: like TEXT? - {"content", Ldbc_Data_Type::TEXT}, - {"language", Ldbc_Data_Type::STRING}, - {"imageFile", Ldbc_Data_Type::STRING}}}, - {"comment", { - {"creationDate", Ldbc_Data_Type::DATE_TIME}, - {"browserUsed", Ldbc_Data_Type::STRING}, - {"locationIP", Ldbc_Data_Type::STRING}, - {"content", Ldbc_Data_Type::TEXT}, - {"length", Ldbc_Data_Type::INT_32}}}, - {"tagclass", { - {"name", Ldbc_Data_Type::LONG_STRING}, - {"url", Ldbc_Data_Type::LONG_STRING}}}, - {"tag", { - {"name", Ldbc_Data_Type::LONG_STRING}, - {"url", Ldbc_Data_Type::LONG_STRING}}}, - {"place", { - {"name", Ldbc_Data_Type::LONG_STRING}, - {"url", Ldbc_Data_Type::LONG_STRING}, - {"type", Ldbc_Data_Type::STRING}}}, - {"organisation", { - {"name", Ldbc_Data_Type::LONG_STRING}, - {"type", Ldbc_Data_Type::STRING}, - {"url", Ldbc_Data_Type::LONG_STRING}}}, - // edges - {"likes", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, - {"hasMember", {{"joinDate", Ldbc_Data_Type::DATE_TIME}}}, - {"hasModerator", {}}, - {"hasCreator", {}}, - {"hasTag", {}}, - {"containerOf", {}}, - {"replyOf", {}}, - {"isSubclassOf", {}}, - {"isPartOf", {}}, - {"isLocatedIn", {}}, - {"studyAt", {{"classYear", Ldbc_Data_Type::INT_32}}}, - {"workAt", {{"workFrom", Ldbc_Data_Type::INT_32}}}, - {"knows", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, - }}; + static const std::map> ldbc_schema{{ + // vertices + {"person", + {{"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"firstName", Ldbc_Data_Type::STRING}, + {"lastName", Ldbc_Data_Type::STRING}, + {"gender", Ldbc_Data_Type::STRING}, + {"birthday", Ldbc_Data_Type::DATE}, + // !TODO actually an array of emails + {"email", Ldbc_Data_Type::LONG_STRING}, + // !TODO actually an array of languages + + // (and not currently filled as csv header contains "language") + //{"speaks", Ldbc_Data_Type::STRING}, + // TODO actually values for "speaks" array + {"language", Ldbc_Data_Type::STRING}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}}}, + {"forum", {{"creationDate", Ldbc_Data_Type::DATE_TIME}, {"title", Ldbc_Data_Type::LONG_STRING}}}, + {"post", + {{"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}, + {"length", Ldbc_Data_Type::INT_32}, + // TODO: extra nullable type for the following 3: like TEXT? + {"content", Ldbc_Data_Type::TEXT}, + {"language", Ldbc_Data_Type::STRING}, + {"imageFile", Ldbc_Data_Type::STRING}}}, + {"comment", + {{"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"browserUsed", Ldbc_Data_Type::STRING}, + {"locationIP", Ldbc_Data_Type::STRING}, + {"content", Ldbc_Data_Type::TEXT}, + {"length", Ldbc_Data_Type::INT_32}}}, + {"tagclass", {{"name", Ldbc_Data_Type::LONG_STRING}, {"url", Ldbc_Data_Type::LONG_STRING}}}, + {"tag", {{"name", Ldbc_Data_Type::LONG_STRING}, {"url", Ldbc_Data_Type::LONG_STRING}}}, + {"place", + {{"name", Ldbc_Data_Type::LONG_STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}, + {"type", Ldbc_Data_Type::STRING}}}, + {"organisation", + {{"name", Ldbc_Data_Type::LONG_STRING}, + {"type", Ldbc_Data_Type::STRING}, + {"url", Ldbc_Data_Type::LONG_STRING}}}, + // edges + {"likes", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, + {"hasMember", {{"joinDate", Ldbc_Data_Type::DATE_TIME}}}, + {"hasModerator", {}}, + {"hasCreator", {}}, + {"hasTag", {}}, + {"containerOf", {}}, + {"replyOf", {}}, + {"isSubclassOf", {}}, + {"isPartOf", {}}, + {"isLocatedIn", {}}, + {"studyAt", {{"classYear", Ldbc_Data_Type::INT_32}}}, + {"workAt", {{"workFrom", Ldbc_Data_Type::INT_32}}}, + {"knows", {{"creationDate", Ldbc_Data_Type::DATE_TIME}}}, + }}; Ldbc_Data_Type get_data_type(std::string entity_type, std::string property_key) { auto perEntity = ldbc_schema.find(entity_type); @@ -114,28 +108,29 @@ namespace morphstore{ // ldbc id is saved as an extra property as morphstore::graph generates new ones // static part of social network not included thus saved as property (!!wrongly!!) - if(property_key == "id") return Ldbc_Data_Type::ID; + if (property_key == "id") + return Ldbc_Data_Type::ID; - //std::cout << "Could not find a data type for " << entity_type << " " << property_key; + // std::cout << "Could not find a data type for " << entity_type << " " << property_key; return Ldbc_Data_Type::ERROR; } property_type convert_property_value(std::string value, Ldbc_Data_Type type) { property_type converted_value; - switch(type) { - case Ldbc_Data_Type::INT_32: - converted_value = std::stoi(value); - break; - case Ldbc_Data_Type::ID: - converted_value = std::stoull(value); - break; - default: - converted_value = value; + switch (type) { + case Ldbc_Data_Type::INT_32: + converted_value = std::stoi(value); + break; + case Ldbc_Data_Type::ID: + converted_value = std::stoull(value); + break; + default: + converted_value = value; }; return converted_value; } -} +} // namespace morphstore -#endif //MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file +#endif // MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file diff --git a/include/core/storage/graph/property_type.h b/include/core/storage/graph/property_type.h index b7ff6aee..63ad922c 100644 --- a/include/core/storage/graph/property_type.h +++ b/include/core/storage/graph/property_type.h @@ -17,32 +17,26 @@ /** * @file property_type.h - * @brief variant of supported data types as a property + * @brief variant of supported data types as a property * @todo Move into dedicated sub-folder (when different property mappings exists) -*/ + */ #ifndef MORPHSTORE_PROPERTY_TYPE_H #define MORPHSTORE_PROPERTY_TYPE_H -#include #include +#include -namespace morphstore{ +namespace morphstore { // only to used if properties are stored per node or triple store // TODO: handle date and datetime properties and maybe text using property_type = std::variant; struct PropertyValueVisitor { - void operator()(const std::string &s) const { - std::cout << "(string) " << s; - } - void operator()(uint64_t i) const - { - std::cout << "(uint_64t) " << i; - } + void operator()(const std::string &s) const { std::cout << "(string) " << s; } + void operator()(uint64_t i) const { std::cout << "(uint_64t) " << i; } }; -} - +} // namespace morphstore -#endif //MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file +#endif // MORPHSTORE_PROPERTY_TYPE_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertex.h b/include/core/storage/graph/vertex/vertex.h index 2805ca69..da4dfe43 100644 --- a/include/core/storage/graph/vertex/vertex.h +++ b/include/core/storage/graph/vertex/vertex.h @@ -19,25 +19,25 @@ * @file vertex.h * @brief abstract vertex class for storage formats * @todo -*/ + */ #ifndef MORPHSTORE_VERTEX_H #define MORPHSTORE_VERTEX_H #include -#include #include -#include #include +#include +#include -namespace morphstore{ +namespace morphstore { - class Vertex{ + class Vertex { protected: // vertex: id, - uint64_t id; + uint64_t id; // optional: type, properties unsigned short int type; @@ -45,30 +45,24 @@ namespace morphstore{ bool valid = false; public: - // default constr. needed for VertexWithProperties(Vertex vertex, const std::unordered_map properties) - // otherwise compiler won't accept - Vertex() {}; + // default constr. needed for VertexWithProperties(Vertex vertex, const std::unordered_map properties) otherwise compiler won't accept + Vertex(){}; - Vertex(uint64_t id, unsigned short int type = 0){ + Vertex(uint64_t id, unsigned short int type = 0) { this->id = id; this->type = type; this->valid = true; } - uint64_t getID() const { - return id; - } + uint64_t getID() const { return id; } - unsigned short getType() const { - return type; - } + unsigned short getType() const { return type; } - bool isValid() const { - return valid; - } + bool isValid() const { return valid; } // this is needed when using VerticesVectorArrayContainer when doing vertex_array[offset] = vertex - Vertex& operator= (const Vertex &vertex){ + Vertex &operator=(const Vertex &vertex) { // self-assignment guard if (this == &vertex) return *this; @@ -82,12 +76,12 @@ namespace morphstore{ return *this; } - // get size of vertex in bytes: + // get size of vertex in bytes: static size_t get_data_size_of_vertex() { size_t size = 0; - size += sizeof(uint64_t); // id + size += sizeof(uint64_t); // id size += sizeof(unsigned short int); // entity - size += sizeof(bool); // valid flag + size += sizeof(bool); // valid flag return size; } @@ -95,28 +89,23 @@ namespace morphstore{ // convinience class for returning whole vertices class VertexWithProperties { - private: - Vertex vertex; - std::unordered_map properties; - public: - VertexWithProperties(Vertex vertex, const std::unordered_map properties) { - this->vertex = vertex; - this->properties = properties; - } - - uint64_t getID() { - return vertex.getID(); - } - - unsigned short getType() const { - return vertex.getType(); - } - - std::unordered_map getProperties() { - return properties; - } + private: + Vertex vertex; + std::unordered_map properties; + + public: + VertexWithProperties(Vertex vertex, const std::unordered_map properties) { + this->vertex = vertex; + this->properties = properties; + } + + uint64_t getID() { return vertex.getID(); } + + unsigned short getType() const { return vertex.getType(); } + + std::unordered_map getProperties() { return properties; } }; -} +} // namespace morphstore -#endif //MORPHSTORE_VERTEX_H +#endif // MORPHSTORE_VERTEX_H diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 0b2a8c50..2f3f62a7 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -19,131 +19,125 @@ * @file vertices_container.h * @brief abstract class for storing vertices * @todo -*/ + */ #ifndef MORPHSTORE_VERTICES_CONTAINER_H #define MORPHSTORE_VERTICES_CONTAINER_H -#include #include +#include +#include #include #include -#include #include -namespace morphstore{ - enum class VerticesContainerType {HashMapContainer, VectorArrayContainer}; +namespace morphstore { + enum class VerticesContainerType { HashMapContainer, VectorArrayContainer }; class VerticesContainer { - protected: - uint64_t currentMaxVertexId = 0; - uint64_t expected_vertex_count = 0; - std::map vertex_type_dictionary; - - // TODO: try other property storage formats than per node .. (triple-store or per property) - std::unordered_map> vertex_properties; - - std::string get_vertex_type(unsigned short int type) const { - if (vertex_type_dictionary.find(type) != vertex_type_dictionary.end()) { - return vertex_type_dictionary.at(type); - } - else { - return "No Matching of type-number in the database! For type " + std::to_string(type); - } + protected: + uint64_t currentMaxVertexId = 0; + uint64_t expected_vertex_count = 0; + std::map vertex_type_dictionary; + + // TODO: try other property storage formats than per node .. (triple-store or per property) + std::unordered_map> vertex_properties; + + std::string get_vertex_type(unsigned short int type) const { + if (vertex_type_dictionary.find(type) != vertex_type_dictionary.end()) { + return vertex_type_dictionary.at(type); + } else { + return "No Matching of type-number in the database! For type " + std::to_string(type); } - - uint64_t getNextVertexId() { - return currentMaxVertexId++; + } + + uint64_t getNextVertexId() { return currentMaxVertexId++; } + + public: + virtual std::string container_description() const = 0; + virtual void insert_vertex(Vertex v) = 0; + virtual bool exists_vertex(const uint64_t id) const = 0; + virtual Vertex get_vertex(uint64_t id) = 0; + virtual uint64_t vertex_count() const = 0; + + virtual void allocate(uint64_t expected_vertices) { + vertex_properties.reserve(expected_vertices); + expected_vertex_count += expected_vertices; + } + + uint64_t add_vertex(const unsigned short int type, + const std::unordered_map properties = {}) { + assert(currentMaxVertexId < expected_vertex_count); + Vertex v = Vertex(getNextVertexId(), type); + insert_vertex(v); + if (!properties.empty()) { + vertex_properties.insert(std::make_pair(v.getID(), properties)); } - public: - virtual std::string container_description() const = 0; - virtual void insert_vertex(Vertex v) = 0; - virtual bool exists_vertex(const uint64_t id) const = 0; - virtual Vertex get_vertex(uint64_t id) = 0; - virtual uint64_t vertex_count() const = 0; - - - virtual void allocate(uint64_t expected_vertices) { - vertex_properties.reserve(expected_vertices); - expected_vertex_count += expected_vertices; - } + return v.getID(); + } - uint64_t add_vertex(const unsigned short int type, const std::unordered_map properties = {}) { - assert(currentMaxVertexId < expected_vertex_count); - Vertex v = Vertex(getNextVertexId(), type); - insert_vertex(v); - if (!properties.empty()) { - vertex_properties.insert(std::make_pair(v.getID(), properties)); - } + void add_property_to_vertex(uint64_t id, const std::pair property) { + assert(exists_vertex(id)); + vertex_properties[id].insert(property); + }; - return v.getID(); - } + void set_vertex_type_dictionary(const std::map &types) { + assert(types.size() != 0); + this->vertex_type_dictionary = types; + } - void add_property_to_vertex(uint64_t id, const std::pair property) { - assert(exists_vertex(id)); - vertex_properties[id].insert(property); - }; + const VertexWithProperties get_vertex_with_properties(uint64_t id) { + assert(exists_vertex(id)); + return VertexWithProperties(get_vertex(id), vertex_properties[id]); + } - void set_vertex_type_dictionary(const std::map& types) { - assert(types.size() != 0); - this->vertex_type_dictionary = types; - } - + uint64_t vertices_with_properties_count() { return vertex_properties.size(); } - const VertexWithProperties get_vertex_with_properties(uint64_t id) { - assert(exists_vertex(id)); - return VertexWithProperties(get_vertex(id), vertex_properties[id]); - } + virtual std::pair get_size() const { + size_t data_size = 0; + size_t index_size = 0; - uint64_t vertices_with_properties_count() { - return vertex_properties.size(); + // lookup type dicts + index_size += 2 * sizeof(std::map); + for (auto &type_mapping : vertex_type_dictionary) { + index_size += sizeof(unsigned short int); + index_size += sizeof(char) * (type_mapping.second.length()); } - virtual std::pair get_size() const { - size_t data_size = 0; - size_t index_size = 0; - - // lookup type dicts - index_size += 2 * sizeof(std::map); - for(auto& type_mapping : vertex_type_dictionary){ - index_size += sizeof(unsigned short int); - index_size += sizeof(char)*(type_mapping.second.length()); - } - - // vertex-properties: - index_size += sizeof(std::unordered_map>); - for (const auto &property_mapping : vertex_properties) { - index_size += sizeof(uint64_t) + sizeof(std::unordered_map); - for (const auto &property : property_mapping.second) { - data_size += sizeof(char) * property.first.length() + sizeof(property.second); - } + // vertex-properties: + index_size += sizeof(std::unordered_map>); + for (const auto &property_mapping : vertex_properties) { + index_size += sizeof(uint64_t) + sizeof(std::unordered_map); + for (const auto &property : property_mapping.second) { + data_size += sizeof(char) * property.first.length() + sizeof(property.second); } - - return {index_size, data_size}; } - void print_type_dict(){ - std::cout << "VertexType-Dict: " << std::endl; - for (auto const &entry : vertex_type_dictionary) { - std::cout << entry.first << " -> " << entry.second << std::endl; - } - } + return {index_size, data_size}; + } - void print_vertex_by_id(const uint64_t id) { - std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; - VertexWithProperties v = get_vertex_with_properties(id); - std::cout << "Vertex-ID: \t" << v.getID() << std::endl; - std::cout << "Type: \t" << get_vertex_type(v.getType()) << std::endl; - std::cout << "Properties: "; - for (const auto entry : v.getProperties()) { - auto value = entry.second; - std::cout << "{" << entry.first << ": "; - std::visit(PropertyValueVisitor{}, value); - std::cout << "}"; - } + void print_type_dict() { + std::cout << "VertexType-Dict: " << std::endl; + for (auto const &entry : vertex_type_dictionary) { + std::cout << entry.first << " -> " << entry.second << std::endl; + } + } + + void print_vertex_by_id(const uint64_t id) { + std::cout << "-------------- Vertex ID: " << id << " --------------" << std::endl; + VertexWithProperties v = get_vertex_with_properties(id); + std::cout << "Vertex-ID: \t" << v.getID() << std::endl; + std::cout << "Type: \t" << get_vertex_type(v.getType()) << std::endl; + std::cout << "Properties: "; + for (const auto entry : v.getProperties()) { + auto value = entry.second; + std::cout << "{" << entry.first << ": "; + std::visit(PropertyValueVisitor{}, value); + std::cout << "}"; } + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_VERTICES_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_VERTICES_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index aaea6787..97c9568d 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -19,7 +19,7 @@ * @file vertices__hashmap_container.h * @brief storing vertices using a hashmap * @todo -*/ + */ #ifndef MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H #define MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H @@ -30,54 +30,45 @@ #include #include -namespace morphstore{ +namespace morphstore { - class VerticesHashMapContainer : public VerticesContainer{ - protected: - std::unordered_map vertices; + class VerticesHashMapContainer : public VerticesContainer { + protected: + std::unordered_map vertices; - public: - std::string container_description() const override { - return "unordered_map"; - } + public: + std::string container_description() const override { return "unordered_map"; } - void allocate(const uint64_t expected_vertices) override { - VerticesContainer::allocate(expected_vertices); - this->vertices.reserve(expected_vertices); - } - - void insert_vertex(const Vertex v) override { - vertices[v.getID()] = v; - } + void allocate(const uint64_t expected_vertices) override { + VerticesContainer::allocate(expected_vertices); + this->vertices.reserve(expected_vertices); + } - Vertex get_vertex(uint64_t id) override { - return vertices[id]; - } + void insert_vertex(const Vertex v) override { vertices[v.getID()] = v; } - bool exists_vertex(const uint64_t id) const override { - if(vertices.find(id) == vertices.end()){ - return false; - } - return true; - } + Vertex get_vertex(uint64_t id) override { return vertices[id]; } - uint64_t vertex_count() const { - return vertices.size(); + bool exists_vertex(const uint64_t id) const override { + if (vertices.find(id) == vertices.end()) { + return false; } + return true; + } - std::pair get_size() const override { - auto [index_size, data_size] = VerticesContainer::get_size(); + uint64_t vertex_count() const { return vertices.size(); } - // container for indexes: - index_size += sizeof(std::unordered_map); - // index size of vertex: size of id and sizeof pointer - index_size += vertices.size() * sizeof(uint64_t); - data_size += vertices.size() * Vertex::get_data_size_of_vertex(); - + std::pair get_size() const override { + auto [index_size, data_size] = VerticesContainer::get_size(); - return {index_size, data_size}; - } + // container for indexes: + index_size += sizeof(std::unordered_map); + // index size of vertex: size of id and sizeof pointer + index_size += vertices.size() * sizeof(uint64_t); + data_size += vertices.size() * Vertex::get_data_size_of_vertex(); + + return {index_size, data_size}; + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_VERTICES_HASHMAP_CONTAINER_H \ No newline at end of file diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index b3c575dc..fece9847 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -19,7 +19,7 @@ * @file vertices__vectorarray_container.h * @brief storing vertices using a vector of arrays * @todo -*/ + */ #ifndef MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H #define MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H @@ -27,116 +27,105 @@ #include "vertex.h" #include "vertices_container.h" -#include -#include #include +#include +#include -namespace morphstore{ - - class VerticesVectorArrayContainer : public VerticesContainer{ - protected: - std::vector vertices; +namespace morphstore { - static const inline uint64_t vertex_array_size = 4096; - static const inline uint64_t vertices_per_array = vertex_array_size / sizeof(Vertex); + class VerticesVectorArrayContainer : public VerticesContainer { + protected: + std::vector vertices; - uint64_t number_of_vertices = 0; - Vertex* current_array; - uint64_t current_array_offset = 0; + static const inline uint64_t vertex_array_size = 4096; + static const inline uint64_t vertices_per_array = vertex_array_size / sizeof(Vertex); + uint64_t number_of_vertices = 0; + Vertex *current_array; + uint64_t current_array_offset = 0; - Vertex* allocate_vertex_array() { - auto array_pointer = (Vertex *) std::aligned_alloc( - sizeof(Vertex), - vertices_per_array * sizeof(Vertex)); + Vertex *allocate_vertex_array() { + auto array_pointer = (Vertex *)std::aligned_alloc(sizeof(Vertex), vertices_per_array * sizeof(Vertex)); - vertices.push_back(array_pointer); - //std::cout << " Added a page" << std::endl; - //std::cout.flush(); + vertices.push_back(array_pointer); + // std::cout << " Added a page" << std::endl; + // std::cout.flush(); - return array_pointer; - } + return array_pointer; + } - inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { - return vertex_id / vertices_per_array; - } + inline uint64_t get_vertex_vector_number(uint64_t vertex_id) const { return vertex_id / vertices_per_array; } - inline uint64_t get_pos_in_array(uint64_t vertex_id) const { - return vertex_id % vertices_per_array; - } + inline uint64_t get_pos_in_array(uint64_t vertex_id) const { return vertex_id % vertices_per_array; } - public: - // TODO: make array_size based on constructor - //VerticesVectorArrayContainer(array_size) + public: + // TODO: make array_size based on constructor + // VerticesVectorArrayContainer(array_size) - ~VerticesVectorArrayContainer() { - // TODO: find memory leak (destructor seems not to be called) - std::cout << "freeing vertex pages"; - for (auto array_pointer : this->vertices) { - free(array_pointer); - } + ~VerticesVectorArrayContainer() { + // TODO: find memory leak (destructor seems not to be called) + std::cout << "freeing vertex pages"; + for (auto array_pointer : this->vertices) { + free(array_pointer); } + } - std::string container_description() const override { - return "vector"; - } + std::string container_description() const override { return "vector"; } - void allocate(const uint64_t expected_vertices) override { - VerticesContainer::allocate(expected_vertices); - this->vertices.reserve(std::ceil(expected_vertices / (double) vertices_per_array)); + void allocate(const uint64_t expected_vertices) override { + VerticesContainer::allocate(expected_vertices); + this->vertices.reserve(std::ceil(expected_vertices / (double)vertices_per_array)); - if (current_array == nullptr) - current_array = allocate_vertex_array(); - } + if (current_array == nullptr) + current_array = allocate_vertex_array(); + } - void insert_vertex(Vertex v) { - // equals current array is full - if (current_array_offset == vertices_per_array) { - current_array = allocate_vertex_array(); - current_array_offset = 0; - } - // TODO: add check that there is no valid vertex stored there - // need to solve problem that aligned_alloc randomaly inits Vertices (ignores default values) - current_array[current_array_offset] = v; - current_array_offset++; - number_of_vertices++; + void insert_vertex(Vertex v) { + // equals current array is full + if (current_array_offset == vertices_per_array) { + current_array = allocate_vertex_array(); + current_array_offset = 0; } + // TODO: add check that there is no valid vertex stored there + // need to solve problem that aligned_alloc randomaly inits Vertices (ignores default values) + current_array[current_array_offset] = v; + current_array_offset++; + number_of_vertices++; + } - Vertex get_vertex(uint64_t id) override { - uint64_t array_number = get_vertex_vector_number(id); - uint64_t pos_in_array = get_pos_in_array(id); + Vertex get_vertex(uint64_t id) override { + uint64_t array_number = get_vertex_vector_number(id); + uint64_t pos_in_array = get_pos_in_array(id); - //assert (pos_in_array < vertices_per_array); - assert(array_number < vertices.size()); + // assert (pos_in_array < vertices_per_array); + assert(array_number < vertices.size()); - return vertices.at(array_number)[pos_in_array]; - } + return vertices.at(array_number)[pos_in_array]; + } - bool exists_vertex(const uint64_t id) const override { - // assumes no deletion! else retrieve vertrex at position and check isValid() - return number_of_vertices > id; - } + bool exists_vertex(const uint64_t id) const override { + // assumes no deletion! else retrieve vertrex at position and check isValid() + return number_of_vertices > id; + } - uint64_t vertex_count() const override { - return number_of_vertices; - } + uint64_t vertex_count() const override { return number_of_vertices; } - std::pair get_size() const override { - auto [index_size, data_size] = VerticesContainer::get_size(); + std::pair get_size() const override { + auto [index_size, data_size] = VerticesContainer::get_size(); - // vector count, current_array_offset - index_size += 2 * sizeof(uint64_t); - // current_array - index_size += sizeof(Vertex*); + // vector count, current_array_offset + index_size += 2 * sizeof(uint64_t); + // current_array + index_size += sizeof(Vertex *); - index_size += sizeof(std::vector); - index_size += vertices.size() * sizeof(Vertex*); - // allocated memory for vertices - data_size += vertices.size() * Vertex::get_data_size_of_vertex() * vertices_per_array; - - return {index_size, data_size}; - } + index_size += sizeof(std::vector); + index_size += vertices.size() * sizeof(Vertex *); + // allocated memory for vertices + data_size += vertices.size() * Vertex::get_data_size_of_vertex() * vertices_per_array; + + return {index_size, data_size}; + } }; -} +} // namespace morphstore -#endif //MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H \ No newline at end of file +#endif // MORPHSTORE_VERTICES_VECTORARRAY_CONTAINER_H \ No newline at end of file diff --git a/src/microbenchmarks/graph/benchmark_helper.h b/src/microbenchmarks/graph/benchmark_helper.h index 8deb036e..5191a194 100644 --- a/src/microbenchmarks/graph/benchmark_helper.h +++ b/src/microbenchmarks/graph/benchmark_helper.h @@ -18,15 +18,15 @@ /** * @file benchmark_helper.h * @brief Helper functions for graph benchmarks - * @todo -*/ + * @todo + */ #ifndef BENCHMARK_HELPER #define BENCHMARK_HELPER -#include #include #include +#include namespace morphstore { using highResClock = std::chrono::high_resolution_clock; @@ -43,4 +43,4 @@ namespace morphstore { } } // namespace morphstore -#endif //BENCHMARK_HELPER +#endif // BENCHMARK_HELPER diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index af34bbfc..5cacc929 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -21,29 +21,26 @@ * @todo allow different compression formats for the two csr columns; add full_iterate */ +#include "benchmark_helper.h" #include #include #include -#include "benchmark_helper.h" using namespace morphstore; - struct CompressionBenchmarkEntry { - GraphCompressionFormat compr_format; - int64_t compression_time; - double offset_col_compression_ratio; - double edgeId_col_compression_ratio; - int64_t random_access_time; - int64_t full_iterate; - - std::string to_string() { - return "|" + graph_compr_f_to_string(compr_format) + - "|" + std::to_string(compression_time) + - "|" + std::to_string(offset_col_compression_ratio) + - "|" + std::to_string(edgeId_col_compression_ratio) + - "|" + std::to_string(random_access_time); - } + GraphCompressionFormat compr_format; + int64_t compression_time; + double offset_col_compression_ratio; + double edgeId_col_compression_ratio; + int64_t random_access_time; + int64_t full_iterate; + + std::string to_string() { + return "|" + graph_compr_f_to_string(compr_format) + "|" + std::to_string(compression_time) + "|" + + std::to_string(offset_col_compression_ratio) + "|" + std::to_string(edgeId_col_compression_ratio) + + "|" + std::to_string(random_access_time); + } }; int main(void) { @@ -56,13 +53,9 @@ int main(void) { throw std::invalid_argument("Where are the ldbc files??"); } + std::vector compr_formats = {GraphCompressionFormat::UNCOMPRESSED, + GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR}; - std::vector compr_formats = { - GraphCompressionFormat::UNCOMPRESSED, - GraphCompressionFormat::DELTA, - GraphCompressionFormat::FOR - }; - // Load ldbc graph std::unique_ptr graph = std::make_unique(); std::unique_ptr ldbcImport = std::make_unique(sourceDir); @@ -76,39 +69,38 @@ int main(void) { random_accesses.push_back(dist(rd)); } - - std::cout << "Test compression of ldbc-graph in CSR format (times in micro-seconds)" << std::endl; - std::cout << "Compression-Format | compression-time | offset-column compr. ratio" << - " | edgeId-column compr. ratio | access of edges of " << - std::to_string(number_of_random_access) + " random vertices | full edge-list iterate" + std::cout << "Test compression of ldbc-graph in CSR format (times in " + "micro-seconds)" << std::endl; + std::cout << "Compression-Format | compression-time | offset-column compr. ratio" + << " | edgeId-column compr. ratio | access of edges of " + << std::to_string(number_of_random_access) + " random vertices | full edge-list iterate" << std::endl; for (auto current_f : compr_formats) { - for (int exec = 0; exec < number_of_executions; exec++) { - CompressionBenchmarkEntry current_try; - current_try.compr_format = current_f; - // restore start state - graph->morph(GraphCompressionFormat::UNCOMPRESSED); - - auto start = highResClock::now(); - graph->morph(current_f); - // compression time - current_try.compression_time = get_duration(start); - - // compression-ratios - current_try.offset_col_compression_ratio = graph->offset_column_compr_ratio(); - current_try.edgeId_col_compression_ratio = graph->edgeId_column_compr_ratio(); - - - // random access - start = highResClock::now(); - for (int random_pos : random_accesses) { - graph->get_outgoing_edge_ids(random_pos); + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.compr_format = current_f; + // restore start state + graph->morph(GraphCompressionFormat::UNCOMPRESSED); + + auto start = highResClock::now(); + graph->morph(current_f); + // compression time + current_try.compression_time = get_duration(start); + + // compression-ratios + current_try.offset_col_compression_ratio = graph->offset_column_compr_ratio(); + current_try.edgeId_col_compression_ratio = graph->edgeId_column_compr_ratio(); + + // random access + start = highResClock::now(); + for (int random_pos : random_accesses) { + graph->get_outgoing_edge_ids(random_pos); + } + current_try.random_access_time = get_duration(start); + + std::cout << current_try.to_string() << std::endl; } - current_try.random_access_time = get_duration(start); - - std::cout << current_try.to_string() << std::endl; - } } return 0; diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp index 4c6ad7ef..6d60bfd2 100644 --- a/src/microbenchmarks/graph/edge_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -21,95 +21,91 @@ * @todo Fix edge id generation for benchmark to work */ -#include +#include "benchmark_helper.h" #include +#include #include -#include "benchmark_helper.h" - using namespace morphstore; - int main(void) { - // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function + // use BenchmarkEntry struct instead of appending to string int number_of_executions = 5; std::cout << "Test edge storage structure (median of 5 for full_iterate and random access)" << std::endl; - std::cout << "Container type | edge_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the edge count in μs" << std::endl; + std::cout << "Container type | edge_count | loading time in μs | memory usage in bytes | full_iterate in μs | " + "random access 1/10 of the edge count in μs" + << std::endl; - std::vector storage_types = { - EdgesContainerType::HashMapContainer, - EdgesContainerType::VectorArrayContainer - }; + std::vector storage_types = {EdgesContainerType::HashMapContainer, + EdgesContainerType::VectorArrayContainer}; std::vector edge_counts = {10000, 100000, 1000000, 2000000, 5000000, 10000000, 15000000}; - for (int edge_count: edge_counts) { - std::random_device rd; - std::uniform_int_distribution dist(0, edge_count - 1); - std::vector random_accesses; - for (int i = 0; i < edge_count; i++) { - random_accesses.push_back(dist(rd)); - } - - for (auto storage_type : storage_types) { - std::unique_ptr graph = std::make_unique(storage_type); - graph->allocate_graph_structure(1, edge_count); - - std::string measurement_entry = - graph->edges_container_description() + " | "; - measurement_entry += std::to_string(edge_count) + " | "; - - auto vertex_id = graph->add_vertex(0); - std::vector edges; - + for (int edge_count : edge_counts) { + std::random_device rd; + std::uniform_int_distribution dist(0, edge_count - 1); + std::vector random_accesses; for (int i = 0; i < edge_count; i++) { - edges.push_back(Edge(i, vertex_id, vertex_id, 0)); + random_accesses.push_back(dist(rd)); } - auto start = highResClock::now(); - graph->add_edges(vertex_id, edges); - // loading time - measurement_entry += std::to_string(get_duration(start)) + " | "; + for (auto storage_type : storage_types) { + std::unique_ptr graph = std::make_unique(storage_type); + graph->allocate_graph_structure(1, edge_count); - // size - auto [index_size, data_size] = graph->get_size_of_graph(); - measurement_entry += std::to_string(index_size + data_size) + " | "; + std::string measurement_entry = graph->edges_container_description() + " | "; + measurement_entry += std::to_string(edge_count) + " | "; + auto vertex_id = graph->add_vertex(0); + std::vector edges; - std::vector durations; + for (int i = 0; i < edge_count; i++) { + edges.push_back(Edge(i, vertex_id, vertex_id, 0)); + } - // full iterate - for (int exec = 0; exec < number_of_executions; exec++) { - auto start = highResClock::now(); - // iterate - for (int i = 0; i < edge_count; i++) { - graph->get_edge(i); - } - durations.push_back(get_duration(start)); - } + auto start = highResClock::now(); + graph->add_edges(vertex_id, edges); + // loading time + measurement_entry += std::to_string(get_duration(start)) + " | "; - measurement_entry += std::to_string(get_median(durations)) + " | "; + // size + auto [index_size, data_size] = graph->get_size_of_graph(); + measurement_entry += std::to_string(index_size + data_size) + " | "; - // random access + std::vector durations; - durations.clear(); + // full iterate + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + // iterate + for (int i = 0; i < edge_count; i++) { + graph->get_edge(i); + } + durations.push_back(get_duration(start)); + } - for (int exec = 0; exec < number_of_executions; exec++) { - auto start = highResClock::now(); + measurement_entry += std::to_string(get_median(durations)) + " | "; - for (int random_pos : random_accesses) { - graph->get_edge(random_pos); - } + // random access - durations.push_back(get_duration(start)); - } + durations.clear(); + + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); - measurement_entry += std::to_string(get_median(durations)); + for (int random_pos : random_accesses) { + graph->get_edge(random_pos); + } - std::cout << measurement_entry << std::endl; - } + durations.push_back(get_duration(start)); + } + + measurement_entry += std::to_string(get_median(durations)); + + std::cout << measurement_entry << std::endl; + } } return 0; diff --git a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp index 11e4c7d2..d925e2d8 100644 --- a/src/microbenchmarks/graph/vertex_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/vertex_storage_benchmark.cpp @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -20,89 +20,86 @@ * @brief A little mirco benchmark of the vertex storage (hashmap vs vector>). */ +#include "benchmark_helper.h" #include #include #include -#include "benchmark_helper.h" using namespace morphstore; - int main(void) { // TODO: use core/utils/monitoring.h ? or a "time_it" function to stop a given function int number_of_executions = 5; std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; - std::cout << "Container type | vertex_count | loading time in μs | memory usage in bytes | full_iterate in μs | random access 1/10 of the vertex count in μs" << std::endl; + std::cout << "Container type | vertex_count | loading time in μs | memory usage in bytes | full_iterate in μs | " + "random access 1/10 of the vertex count in μs" + << std::endl; - std::vector storage_types = { - VerticesContainerType::HashMapContainer, - VerticesContainerType::VectorArrayContainer - }; + std::vector storage_types = {VerticesContainerType::HashMapContainer, + VerticesContainerType::VectorArrayContainer}; std::vector vertex_counts = {10000, 100000, 1000000, 2000000, 5000000, 10000000, 15000000}; - for (int vertex_count: vertex_counts) { - std::random_device rd; - std::uniform_int_distribution dist(0, vertex_count - 1); - std::vector random_accesses; - for (int i = 0; i < vertex_count; i++) { - random_accesses.push_back(dist(rd)); - } - - for (auto storage_type : storage_types) { - std::unique_ptr graph = std::make_unique(storage_type); - graph->allocate_graph_structure(vertex_count, 0); - - std::string measurement_entry = - graph->vertices_container_description() + " | "; - measurement_entry += std::to_string(vertex_count) + " | "; - - auto start = highResClock::now(); + for (int vertex_count : vertex_counts) { + std::random_device rd; + std::uniform_int_distribution dist(0, vertex_count - 1); + std::vector random_accesses; for (int i = 0; i < vertex_count; i++) { - graph->add_vertex(); + random_accesses.push_back(dist(rd)); } - // loading time - measurement_entry += std::to_string(get_duration(start)) + " | "; - // size - auto [index_size, data_size] = graph->get_size_of_graph(); - measurement_entry += std::to_string(index_size + data_size) + " | "; + for (auto storage_type : storage_types) { + std::unique_ptr graph = std::make_unique(storage_type); + graph->allocate_graph_structure(vertex_count, 0); + std::string measurement_entry = graph->vertices_container_description() + " | "; + measurement_entry += std::to_string(vertex_count) + " | "; - std::vector durations; + auto start = highResClock::now(); + for (int i = 0; i < vertex_count; i++) { + graph->add_vertex(); + } + // loading time + measurement_entry += std::to_string(get_duration(start)) + " | "; - // full iterate - for (int exec = 0; exec < number_of_executions; exec++) { - auto start = highResClock::now(); - // iterate - for (int i = 0; i < vertex_count; i++) { - graph->get_vertex(i); - } - durations.push_back(get_duration(start)); - } + // size + auto [index_size, data_size] = graph->get_size_of_graph(); + measurement_entry += std::to_string(index_size + data_size) + " | "; - measurement_entry += std::to_string(get_median(durations)) + " | "; + std::vector durations; - // random access + // full iterate + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + // iterate + for (int i = 0; i < vertex_count; i++) { + graph->get_vertex(i); + } + durations.push_back(get_duration(start)); + } - durations.clear(); + measurement_entry += std::to_string(get_median(durations)) + " | "; - for (int exec = 0; exec < number_of_executions; exec++) { - auto start = highResClock::now(); + // random access - for (int random_pos : random_accesses) { - graph->get_vertex(random_pos); - } + durations.clear(); - durations.push_back(get_duration(start)); - } + for (int exec = 0; exec < number_of_executions; exec++) { + auto start = highResClock::now(); + + for (int random_pos : random_accesses) { + graph->get_vertex(random_pos); + } - measurement_entry += std::to_string(get_median(durations)); + durations.push_back(get_duration(start)); + } - std::cout << measurement_entry << std::endl; - } + measurement_entry += std::to_string(get_median(durations)); + + std::cout << measurement_entry << std::endl; + } } return 0; diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index 5aca9335..6af4c075 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -21,9 +21,9 @@ * @todo */ -#include -#include #include +#include +#include void print_header(std::string storageFormat) { @@ -34,23 +34,17 @@ void print_header(std::string storageFormat) { std::cout << "\n"; } -template -void bfs_ldbc_graph_test (void) { +template void bfs_ldbc_graph_test(void) { - static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - - std::string sourceDir = ""; - std::string targetDir = ""; + static_assert(std::is_base_of::value, + "type parameter of this method must be a graph format"); + std::string sourceDir = ""; if (sourceDir.empty()) { throw std::invalid_argument("Where are the ldbc files??"); } - if (targetDir.empty()) { - throw std::invalid_argument("Degree count has to be saved somewhere"); - } - std::unique_ptr graph = std::make_unique(); std::string storageFormat = graph->get_storage_format(); @@ -59,7 +53,6 @@ void bfs_ldbc_graph_test (void) { // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) std::unique_ptr ldbcImport = std::make_unique(sourceDir); - // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); @@ -70,5 +63,5 @@ void bfs_ldbc_graph_test (void) { auto bfs = std::make_unique(graph); // for scale factor 1 and including static as well as dynamic part of the graph std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; - //bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); + // bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); } \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index e00c5f24..20fe510f 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -21,8 +21,8 @@ * @todo */ -#include #include +#include void print_header(std::string storageFormat) { std::cout << "\n"; @@ -32,22 +32,17 @@ void print_header(std::string storageFormat) { std::cout << "\n"; } -template -void ldbcGraphFormatTest (void) { +template void ldbcGraphFormatTest(void) { - static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + static_assert(std::is_base_of::value, + "type parameter of this method must be a graph format"); std::string sourceDir = ""; - std::string targetDir = ""; if (sourceDir.empty()) { throw std::invalid_argument("Where are the ldbc files??"); } - if (targetDir.empty()) { - throw std::invalid_argument("Degree count has to be saved somewhere"); - } - std::unique_ptr graph = std::make_unique(); std::string storageFormat = graph->get_storage_format(); @@ -57,7 +52,6 @@ void ldbcGraphFormatTest (void) { // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) std::unique_ptr ldbcImport = std::make_unique(sourceDir); - // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); graph->statistics(); @@ -73,8 +67,6 @@ void ldbcGraphFormatTest (void) { // measure degree distribution and write to file (file path as parameter): // TODO: but this into benchmark or so .. not actual test - //std::cout << "Measure degree count" << std::endl; - //graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); - - + // std::cout << "Measure degree count" << std::endl; + // graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); } \ No newline at end of file diff --git a/test/core/storage/graph/simple/simple_adj_graph_test.cpp b/test/core/storage/graph/simple/simple_adj_graph_test.cpp index 461c4223..2b857755 100644 --- a/test/core/storage/graph/simple/simple_adj_graph_test.cpp +++ b/test/core/storage/graph/simple/simple_adj_graph_test.cpp @@ -21,8 +21,8 @@ * @todo */ -#include #include "simple_graph_test.h" +#include int main(void) { simpleGraphFormatTest(); diff --git a/test/core/storage/graph/simple/simple_csr_graph_test.cpp b/test/core/storage/graph/simple/simple_csr_graph_test.cpp index 7f4a5a9d..8231eaba 100644 --- a/test/core/storage/graph/simple/simple_csr_graph_test.cpp +++ b/test/core/storage/graph/simple/simple_csr_graph_test.cpp @@ -21,8 +21,8 @@ * @todo */ -#include #include "simple_graph_test.h" +#include int main(void) { simpleGraphFormatTest(); diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 518b6998..997893ee 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -20,8 +20,8 @@ * @brief Base test for testing graph formats on a very simple graph * @todo */ -#include #include +#include void print_header(std::string storageFormat) { std::cout << "\n"; @@ -31,14 +31,14 @@ void print_header(std::string storageFormat) { std::cout << "\n"; } -template -void simpleGraphFormatTest (void) { - static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); +template void simpleGraphFormatTest(void) { + static_assert(std::is_base_of::value, + "type parameter of this method must be a graph format"); std::unique_ptr graph = std::make_unique(); print_header(graph->get_storage_format()); - graph->allocate_graph_structure(3, 3); + graph->allocate_graph_structure(3, 3); std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; std::map vertexTypeMap = {{0, "Person"}}; @@ -79,4 +79,3 @@ void simpleGraphFormatTest (void) { //assert(false); } - From 6acc75e0064ce4d540b9ee5edc68261d4dd8bce5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 17 May 2020 21:45:11 +0200 Subject: [PATCH 169/216] Fix compile errors * add missing library * apply function renaming --- include/core/storage/graph/edge/edges_vectorarray_container.h | 1 + include/core/storage/graph/formats/adjacencylist.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index 23efeee0..fd832a2b 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace morphstore { // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index c8654cf1..96e62cf4 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -222,7 +222,7 @@ namespace morphstore { this->finalize(); #if DEBUG - std::cout << "Compressing graph format specific data structures using: " << to_string(target_format) + std::cout << "Compressing graph format specific data structures using: " << graph_compr_f_to_string(target_format) << std::endl; auto entry_count = adjacencylistPerVertex->size(); int progress = 0; From ca0e1a3ec267d700b954fc23c6ce12e2eb2dad97 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 17 May 2020 21:47:41 +0200 Subject: [PATCH 170/216] Define ldbc resource dir via cmake as it is used not only in various places in the project (test for core and src & in benchmarks) --- CMakeLists.txt | 3 +++ .../graph/adjList_graph_compression_benchmark.cpp | 12 +++++------- .../graph/csr_graph_compression_benchmark.cpp | 11 +++++------ .../operators/graph/ldbc/bfs_ldbc_graph_test.h | 13 +++++-------- test/core/storage/graph/ldbc/ldbc_graph_test.h | 14 ++++++-------- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32e18393..889229ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,9 @@ morph_flag(-march=native) # remove build type to allow for custom flag handling set(CMAKE_BUILD_TYPE "") +# add resource directory for ldbc graph (something like "${HOME}/ldbc/ldbc_snb_datagen/social_network/") +#morph_flag(-DLDBC_DIR="") + # general compiler settings, meant for all subdirectories and tests morph_flag(-Werror) morph_flag(-pedantic) diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index 37e41349..aa75c5b9 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -46,15 +46,10 @@ struct CompressionBenchmarkEntry { }; int main(void) { +#ifdef LDBC_DIR // could be also build parameters? const int number_of_executions = 5; const int number_of_random_access = 1000; - std::string sourceDir = ""; - - if (sourceDir.empty()) { - throw std::invalid_argument("Where are the ldbc files??"); - } - std::vector compr_formats = { GraphCompressionFormat::DELTA, @@ -64,7 +59,7 @@ int main(void) { // Load ldbc graph std::unique_ptr graph = std::make_unique(); - std::unique_ptr ldbcImport = std::make_unique(sourceDir); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); ldbcImport->import(*graph); // prepare random-access @@ -109,4 +104,7 @@ int main(void) { } return 0; +#else + throw std::invalid_argument("Where are the ldbc files??"); +#endif } diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 5cacc929..62a31b0b 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -44,21 +44,17 @@ struct CompressionBenchmarkEntry { }; int main(void) { +#ifdef LDBC_DIR // could be also build parameters? const int number_of_executions = 5; const int number_of_random_access = 1000; - std::string sourceDir = ""; - - if (sourceDir.empty()) { - throw std::invalid_argument("Where are the ldbc files??"); - } std::vector compr_formats = {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR}; // Load ldbc graph std::unique_ptr graph = std::make_unique(); - std::unique_ptr ldbcImport = std::make_unique(sourceDir); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); ldbcImport->import(*graph); // prepare random-access @@ -104,4 +100,7 @@ int main(void) { } return 0; +#else + throw std::invalid_argument("Where are the ldbc files??"); +#endif } diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index 6af4c075..fcf5f731 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -35,23 +35,17 @@ void print_header(std::string storageFormat) { } template void bfs_ldbc_graph_test(void) { - +#ifdef LDBC_DIR static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - std::string sourceDir = ""; - - if (sourceDir.empty()) { - throw std::invalid_argument("Where are the ldbc files??"); - } - std::unique_ptr graph = std::make_unique(); std::string storageFormat = graph->get_storage_format(); print_header(storageFormat); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) - std::unique_ptr ldbcImport = std::make_unique(sourceDir); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); @@ -64,4 +58,7 @@ template void bfs_ldbc_graph_test(void) { // for scale factor 1 and including static as well as dynamic part of the graph std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; // bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); +#else + throw std::invalid_argument("Where are the ldbc files??"); +#endif } \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index 20fe510f..2811aaa1 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -36,13 +36,8 @@ template void ldbcGraphFormatTest(void) { static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - - std::string sourceDir = ""; - - if (sourceDir.empty()) { - throw std::invalid_argument("Where are the ldbc files??"); - } - + +#ifdef LDBC_DIR std::unique_ptr graph = std::make_unique(); std::string storageFormat = graph->get_storage_format(); @@ -50,7 +45,7 @@ template void ldbcGraphFormatTest(void) { print_header(storageFormat); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) - std::unique_ptr ldbcImport = std::make_unique(sourceDir); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); @@ -69,4 +64,7 @@ template void ldbcGraphFormatTest(void) { // TODO: but this into benchmark or so .. not actual test // std::cout << "Measure degree count" << std::endl; // graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); +#else + throw std::invalid_argument("Where are the ldbc files??"); +#endif } \ No newline at end of file From 2a404fb65bdd3f8dd87cca2522f0473565c7d2fd Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 17 May 2020 21:56:19 +0200 Subject: [PATCH 171/216] Fix debug mode by re-adding the DEBUG flag --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 889229ac..7e21f82e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ set( LOG_FILE "recentMorphStoreProjectConf.log" ) IF(CMAKE_BUILD_TYPE MATCHES Debug) morph_flag(-g) - morph_flag(-DNDEBUG) + morph_flag(-DDEBUG) message(STATUS "MorphStore is configured in DEBUG mode.") ELSEIF(CMAKE_BUILD_TYPE MATCHES Release) morph_flag(-O2) From 39c0e20e1aa6733684657176338ea68bed89bd48 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 18 May 2020 16:14:55 +0200 Subject: [PATCH 172/216] Fix benchmark helper somehow include of vector disappeared --- src/microbenchmarks/graph/benchmark_helper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/microbenchmarks/graph/benchmark_helper.h b/src/microbenchmarks/graph/benchmark_helper.h index 5191a194..01a4fb0c 100644 --- a/src/microbenchmarks/graph/benchmark_helper.h +++ b/src/microbenchmarks/graph/benchmark_helper.h @@ -27,6 +27,7 @@ #include #include #include +#include namespace morphstore { using highResClock = std::chrono::high_resolution_clock; From 1f6bb694d58924b56039afce4eb1d3ac26897aeb Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 18 May 2020 16:56:37 +0200 Subject: [PATCH 173/216] Reduce use of min_compr_degree in adj_list format goal is to have it as a function parameter --- .../storage/graph/formats/adjacencylist.h | 42 ++++++++----------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 96e62cf4..edb078f2 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -45,7 +45,7 @@ namespace morphstore { struct Adjacency_List_Size_Visitor { size_t operator()(const adjacency_column c) const { return c->get_size_used_byte(); } - size_t operator()(const adjacency_vector v) const { return v->size() * sizeof(uint64_t); } + size_t operator()(const adjacency_vector v) const { return sizeof(std::vector) + (v->size() * sizeof(uint64_t)); } }; struct Adjacency_List_OutDegree_Visitor { @@ -53,7 +53,8 @@ namespace morphstore { // assuming compressed col has the same value count (would not work for RLE) return c->get_count_values(); } - uint64_t operator()(const adjacency_vector v) const { return v->size(); } + uint64_t operator()(const adjacency_vector v) const { + return v->size(); } }; // maps the a list of outgoing edges (ids) to a vertex-id @@ -61,11 +62,9 @@ namespace morphstore { new std::unordered_map(); // as default formats allocate to much memory for small columns - // TODO: allow as parameter in compr - // TODO: as parameter this could provide issues when transforming from on format to another - // handle edge-case by finalizing also checking and potentially converting to (old) current_format - // other edge_case: might need to decompress some columns if min_compr_degree got larger - uint64_t min_compr_degree = 100; + // TODO: compress based on blocksize of format (as data smaller than blocksize gets not compressed?!) + // TODO: as function parameter f.i. in change_min_compr_degree -> recall finalize and morph to current_compression + static const uint64_t min_compr_degree = 100; // convert big-enough adj-vector to a (read-only) adj-column void finalize() { @@ -91,23 +90,13 @@ namespace morphstore { #endif } - const column_uncompr *decompress_adjacency_column(const adjacency_column col) const { - // assuming compressed col has the same value count (would not work for RLE) - if (min_compr_degree < col->get_count_values()) { - // decompress_graph_col just checks the format of the column here - return decompress_graph_col(col, GraphCompressionFormat::UNCOMPRESSED); - } else { - return decompress_graph_col(col, current_compression); - } - } - public: ~AdjacencyList() { for (auto [id, adj_list] : *this->adjacencylistPerVertex) { if (std::holds_alternative(adj_list)) { delete std::get(adj_list); } else { - free(std::get(adj_list)); + delete std::get(adj_list); } delete adjacencylistPerVertex; @@ -185,14 +174,17 @@ namespace morphstore { if (auto entry = adjacencylistPerVertex->find(id); entry != adjacencylistPerVertex->end()) { auto adj_list = entry->second; if (std::holds_alternative(adj_list)) { - auto uncompr_col = decompress_adjacency_column(std::get(adj_list)); + auto uncompr_col = decompress_graph_col(std::get(adj_list), current_compression); const size_t column_size = uncompr_col->get_count_values(); // TODO: init vector via range-constructor / mem-cpy // const uint8_t * end_addr = start_addr + sizeof(uint64_t) * out_degree; const uint64_t *start_addr = uncompr_col->get_data(); edge_ids.insert(edge_ids.end(), start_addr, start_addr + column_size); - delete uncompr_col; + + if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { + delete uncompr_col; + } } else { edge_ids = *std::get(adj_list); @@ -234,10 +226,11 @@ namespace morphstore { } progress++; #endif - - // const_cast needed as map-value is not constant - if (std::visit(Adjacency_List_OutDegree_Visitor{}, adj_list) >= min_compr_degree) { + // currently min_compr_degree is final in adj_list and determines which adj-lists are + // are columns (and not a vector) + if (std::holds_alternative(adj_list)) { auto old_adj_col = std::get(adj_list); + // const_cast needed as map-value is not constant (*adjacencylistPerVertex)[id] = const_cast( morph_graph_col(old_adj_col, current_compression, target_format, true)); } @@ -311,7 +304,8 @@ namespace morphstore { void statistics() override { Graph::statistics(); std::cout << "Number of adjacency lists:" << adjacencylistPerVertex->size() << std::endl; - std::cout << "Colum ratio:" << column_ratio() << std::endl; + std::cout << "Min. degree for compression: " << min_compr_degree << std::endl; + std::cout << "Column ratio:" << column_ratio() << std::endl; std::cout << "Compression ratio:" << compr_ratio() << std::endl; std::cout << "--------------------------------------------" << std::endl; std::cout << std::endl << std::endl; From ce8110ebc3d19b6768a1cd137211277747aa7d14 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 18 May 2020 17:47:18 +0200 Subject: [PATCH 174/216] Make min_compression_degree non-static aka adding a setter --- .../storage/graph/formats/adjacencylist.h | 20 +++- .../adjList_graph_compression_benchmark.cpp | 91 ++++++++++--------- 2 files changed, 64 insertions(+), 47 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index edb078f2..52ae2044 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -64,7 +64,7 @@ namespace morphstore { // as default formats allocate to much memory for small columns // TODO: compress based on blocksize of format (as data smaller than blocksize gets not compressed?!) // TODO: as function parameter f.i. in change_min_compr_degree -> recall finalize and morph to current_compression - static const uint64_t min_compr_degree = 100; + uint64_t min_compr_degree = 1024; // convert big-enough adj-vector to a (read-only) adj-column void finalize() { @@ -74,9 +74,14 @@ namespace morphstore { auto adj_vector = std::get(adj_list); // this allows adding new edges to smaller adj_lists (even after morphing) if (adj_vector->size() >= min_compr_degree) { - auto adj_col = + adjacency_column adj_col = const_cast(make_column(adj_vector->data(), adj_vector->size(), true)); + if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { + adj_col = const_cast(morph_graph_col( + adj_col, GraphCompressionFormat::UNCOMPRESSED, current_compression, true)); + } + (*adjacencylistPerVertex)[id] = adj_col; // as v is not needed anymore and allocated using new @@ -117,6 +122,15 @@ namespace morphstore { adjacencylistPerVertex->reserve(numberVertices); } + void set_min_compr_degree(uint64_t new_min_compr_degree) { + if (new_min_compr_degree > min_compr_degree) { + // allowing this would need re-transforming finalized columns to vectors + throw std::runtime_error("Only supporting an decreasing minimum compression degree"); + } + this->min_compr_degree = new_min_compr_degree; + finalize(); + } + // adding a single edge to vertex: void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { Edge e = Edge(sourceId, targetId, type); @@ -157,6 +171,8 @@ namespace morphstore { } } + uint64_t get_min_compr_degree() { return min_compr_degree; } + // get number of neighbors of vertex with id uint64_t get_out_degree(uint64_t id) override { auto entry = adjacencylistPerVertex->find(id); diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index aa75c5b9..aa6557a4 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -21,28 +21,26 @@ * @todo allow different compression formats for the two csr columns; add full_iterate */ +#include "benchmark_helper.h" #include #include #include -#include "benchmark_helper.h" using namespace morphstore; - struct CompressionBenchmarkEntry { - GraphCompressionFormat compr_format; - int64_t compression_time; - double compression_ratio; - double column_ratio; - int64_t random_access_time; - - std::string to_string() { - return "|" + graph_compr_f_to_string(compr_format) + - "|" + std::to_string(compression_time) + - "|" + std::to_string(compression_ratio) + - "|" + std::to_string(column_ratio) + - "|" + std::to_string(random_access_time); - } + GraphCompressionFormat compr_format; + uint64_t min_compr_degree; + int64_t compression_time; + double compression_ratio; + double column_ratio; + int64_t random_access_time; + + std::string to_string() { + return "|" + graph_compr_f_to_string(compr_format) + "|" + std::to_string(min_compr_degree) + "|" + + std::to_string(compression_time) + "|" + std::to_string(compression_ratio) + "|" + + std::to_string(column_ratio) + "|" + std::to_string(random_access_time); + } }; int main(void) { @@ -51,12 +49,11 @@ int main(void) { const int number_of_executions = 5; const int number_of_random_access = 1000; - std::vector compr_formats = { - GraphCompressionFormat::DELTA, - GraphCompressionFormat::FOR, - GraphCompressionFormat::UNCOMPRESSED - }; - + std::vector compr_formats = {GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR, + GraphCompressionFormat::UNCOMPRESSED}; + + std::vector min_compr_degrees = {1024, 500, 100}; + // Load ldbc graph std::unique_ptr graph = std::make_unique(); std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); @@ -70,41 +67,45 @@ int main(void) { random_accesses.push_back(dist(rd)); } - std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; - std::cout << "Compression-Format | compression-time | " + std::cout << "Compression-Format | minimum degree for compression | compression-time | " << "compr. ratio | column ratio | access of edges of 5000 random vertices" << std::endl; - for (auto current_f : compr_formats) { - for (int exec = 0; exec < number_of_executions; exec++) { - CompressionBenchmarkEntry current_try; - current_try.compr_format = current_f; - // restore start state - graph->morph(GraphCompressionFormat::UNCOMPRESSED); + for (auto min_compr_degree : min_compr_degrees) { + for (auto current_f : compr_formats) { + graph->set_min_compr_degree(min_compr_degree); - auto start = highResClock::now(); - graph->morph(current_f); - // compression time - current_try.compression_time = get_duration(start); + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.compr_format = current_f; + current_try.min_compr_degree = graph->get_min_compr_degree(); - current_try.compression_ratio = graph->compr_ratio(); - // currently based on fixed min_compr_degree - current_try.column_ratio = graph->column_ratio(); + // restore start state + graph->morph(GraphCompressionFormat::UNCOMPRESSED); + auto start = highResClock::now(); + graph->morph(current_f); + // compression time + current_try.compression_time = get_duration(start); - // random access - start = highResClock::now(); - for (int random_pos : random_accesses) { - graph->get_outgoing_edge_ids(random_pos); - } - current_try.random_access_time = get_duration(start); + current_try.compression_ratio = graph->compr_ratio(); + // currently based on fixed min_compr_degree + current_try.column_ratio = graph->column_ratio(); - std::cout << current_try.to_string() << std::endl; - } + // random access + start = highResClock::now(); + for (int random_pos : random_accesses) { + graph->get_outgoing_edge_ids(random_pos); + } + current_try.random_access_time = get_duration(start); + + std::cout << current_try.to_string() << std::endl; + } + } } return 0; -#else +#else throw std::invalid_argument("Where are the ldbc files??"); #endif } From 539e41f0afbc4c1e8a76faf886f3475aa9b1b143 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 19 May 2020 19:26:42 +0200 Subject: [PATCH 175/216] Generate edge ids at graph-scope and remove duplicate code * using workaround with composition instead of inheritance * saving EdgeWithId internally but for loading still Edge * only edge-id insertion is graph-format specific * add_edge now returns edge_id --- include/core/storage/graph/edge/edge.h | 141 +++++++++++------- .../core/storage/graph/edge/edges_container.h | 30 +++- .../graph/edge/edges_hashmap_container.h | 12 +- .../graph/edge/edges_vectorarray_container.h | 12 +- .../storage/graph/formats/adjacencylist.h | 107 +++++-------- include/core/storage/graph/formats/csr.h | 103 +++++-------- include/core/storage/graph/graph.h | 78 ++++++++-- .../core/storage/graph/importer/ldbc_import.h | 18 +-- .../graph/csr_graph_compression_benchmark.cpp | 4 +- .../graph/edge_storage_benchmark.cpp | 2 +- .../storage/graph/simple/simple_graph_test.h | 20 +-- 11 files changed, 286 insertions(+), 241 deletions(-) diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 9484bd4a..2657d52e 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -35,100 +35,141 @@ namespace morphstore { + // for loading class Edge { protected: // Edge characteristics - uint64_t sourceID, targetID, id; + uint64_t sourceId, targetId; unsigned short int type; + public: + Edge() {} + + virtual ~Edge() = default; + + Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) { + this->sourceId = sourceId; + this->targetId = targetId; + this->type = type; + } + + // --------------- Getter and Setter --------------- + + uint64_t getSourceId() const { return sourceId; } + + uint64_t getTargetId() const { return targetId; } + + unsigned short getType() const { return type; } + + // function for sorting algorithms in the ldbc-importer: + // compare target-ids and return if it's "lower" (we need the sorting for the CSR) + bool operator<(const Edge &e) const { return getTargetId() < e.getTargetId(); } + + // get size of edge object in bytes: + static size_t size_in_bytes() { + size_t size = 0; + size += sizeof(uint64_t) * 2; // source- and target-id + size += sizeof(unsigned short int); // type + return size; + } + + virtual std::string to_string() const { + return "(" + std::to_string(this->sourceId) + "->" + std::to_string(this->targetId) + ")"; + } + }; + + // for internal usage + class EdgeWithId : public Edge { + private: + uint64_t id; + // delete flag // TODO put as a std::bitset in vectorarray_container bool valid = false; - uint64_t getNextEdgeId() const { - // Todo: enable resetting maxEdgeId - // Ideal would be to pull id gen to graph.h but this requires rewriting Ldbc importer to use (edge property - // setting depends on it) - static uint64_t currentMaxEdgeId = 0; - return currentMaxEdgeId++; - } - public: // default constr. needed for EdgeWithProperties constructor - Edge() {} + EdgeWithId() {} - Edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) - : Edge(getNextEdgeId(), sourceId, targetId, type) {} + EdgeWithId(uint64_t id, uint64_t sourceId, uint64_t targetId, unsigned short int type) + : Edge(sourceId, targetId, type) { + this->id = id; + this->valid = true; + } - Edge(uint64_t id, uint64_t sourceId, uint64_t targetId, unsigned short int type) { - this->sourceID = sourceId; - this->targetID = targetId; - this->type = type; + EdgeWithId(uint64_t id, Edge edge) : Edge(edge.getSourceId(), edge.getTargetId(), edge.getType()) { this->id = id; this->valid = true; } - // this is needed for csr when doing edge_array[offset] = edge... - Edge &operator=(const Edge &edge) { + uint64_t getId() const { return id; } + + bool isValid() const { return valid; } + + // this is needed for edges_container when doing edges[id] = edge + EdgeWithId &operator=(const EdgeWithId &edge) { // self-assignment guard if (this == &edge) return *this; // do the copy - this->sourceID = edge.sourceID; - this->targetID = edge.targetID; - this->type = edge.type; - this->id = edge.id; - this->valid = edge.valid; + this->sourceId = edge.getSourceId(); + this->targetId = edge.getTargetId(); + this->type = edge.getType(); + this->id = edge.getId(); + this->valid = edge.isValid(); // return the existing object so we can chain this operator return *this; } - // --------------- Getter and Setter --------------- - - uint64_t getId() const { return id; } + // edge size + id and valid flag + static size_t size_in_bytes() { return Edge::size_in_bytes() + sizeof(uint64_t) + sizeof(bool); } - uint64_t getSourceId() const { return sourceID; } + std::string to_string() const override { + return "(id:" + std::to_string(this->id) + " ," + "valid: " + std::to_string(this->valid) + + Edge::to_string() + ")"; + } + }; - uint64_t getTargetId() const { return targetID; } + // for loading + class EdgeWithProperties { + private: + std::unordered_map properties; + // not using inheritance as vector elements could not get cast to EdgeWithProperties + Edge edge; - unsigned short getType() const { return type; } + public: + EdgeWithProperties(uint64_t sourceId, uint64_t targetId, unsigned short int type, + const std::unordered_map properties) { + this->edge = Edge(sourceId, targetId, type); + this->properties = properties; + } - bool isValid() const { return valid; } + EdgeWithProperties(uint64_t sourceId, uint64_t targetId, unsigned short int type) { + this->edge = Edge(sourceId, targetId, type); + } - // function for sorting algorithms in the ldbc-importer: - // compare target-ids and return if it's "lower" (we need the sorting for the CSR) - bool operator<(const Edge &e) const { return getTargetId() < e.getTargetId(); } + Edge getEdge() const { return edge; } - // get size of edge object in bytes: - static size_t size_in_bytes() { - size_t size = 0; - size += sizeof(uint64_t) * 3; // id, source- and target-id - size += sizeof(unsigned short int); // type - size += sizeof(bool); // valid flag - return size; - } + std::unordered_map getProperties() { return properties; } - std::string to_string() const { - return "(id:" + std::to_string(this->id) + " ," + std::to_string(this->sourceID) + "->" + - std::to_string(this->targetID) + " ," + "valid: " + std::to_string(this->valid) + ")"; - } + bool operator<(const EdgeWithProperties &e) const { return edge.getTargetId() < e.getEdge().getTargetId(); } }; - class EdgeWithProperties { + // for returning to user + class EdgeWithIdAndProperties { private: - Edge edge; std::unordered_map properties; + EdgeWithId edge; public: - EdgeWithProperties(Edge edge, const std::unordered_map properties) { + EdgeWithIdAndProperties(EdgeWithId edge, const std::unordered_map properties) { this->edge = edge; this->properties = properties; } - - Edge getEdge() { return edge; } + EdgeWithId getEdge() { return edge; } std::unordered_map getProperties() { return properties; } }; diff --git a/include/core/storage/graph/edge/edges_container.h b/include/core/storage/graph/edge/edges_container.h index 6797496f..19a43b66 100644 --- a/include/core/storage/graph/edge/edges_container.h +++ b/include/core/storage/graph/edge/edges_container.h @@ -38,6 +38,7 @@ namespace morphstore { class EdgesContainer { protected: uint64_t expected_edge_count = 0; + uint64_t current_max_edge_id = 0; std::map edge_type_dictionary; @@ -52,10 +53,12 @@ namespace morphstore { } } + uint64_t get_next_edge_id() { return current_max_edge_id++; } + public: virtual std::string container_description() const = 0; - virtual void insert_edge(Edge e) = 0; - virtual Edge get_edge(uint64_t id) = 0; + virtual void insert_edge(EdgeWithId e) = 0; + virtual EdgeWithId get_edge(uint64_t id) = 0; virtual bool exists_edge(const uint64_t id) const = 0; virtual uint64_t edge_count() const = 0; @@ -64,7 +67,21 @@ namespace morphstore { expected_edge_count += expected_edges; } - void add_edge(Edge edge) { insert_edge(edge); } + uint64_t add_edge(Edge edge) { + auto id = get_next_edge_id(); + insert_edge(EdgeWithId(id, edge)); + return id; + } + + uint64_t add_edge(EdgeWithProperties edge) { + auto id = add_edge(edge.getEdge()); + + if (auto properties = edge.getProperties(); !properties.empty()) { + edge_properties[id] = properties; + } + + return id; + } bool has_properties(uint64_t id) { return edge_properties.find(id) != edge_properties.end(); } @@ -90,9 +107,9 @@ namespace morphstore { this->edge_type_dictionary = types; } - const EdgeWithProperties get_edge_with_properties(uint64_t id) { + const EdgeWithIdAndProperties get_edge_with_properties(uint64_t id) { assert(exists_edge(id)); - return EdgeWithProperties(get_edge(id), edge_properties[id]); + return EdgeWithIdAndProperties(get_edge(id), edge_properties[id]); } uint64_t edges_with_properties_count() { return edge_properties.size(); } @@ -129,7 +146,7 @@ namespace morphstore { void print_edge_by_id(const uint64_t id) { std::cout << "-------------- Edge ID: " << id << " --------------" << std::endl; - EdgeWithProperties e = get_edge_with_properties(id); + auto e = get_edge_with_properties(id); std::cout << e.getEdge().to_string() << std::endl; std::cout << "Type: " << this->get_edge_type(e.getEdge().getType()) << std::endl; std::cout << "Properties: "; @@ -139,6 +156,7 @@ namespace morphstore { std::visit(PropertyValueVisitor{}, value); std::cout << "}"; } + std::cout << std::endl; } }; } // namespace morphstore diff --git a/include/core/storage/graph/edge/edges_hashmap_container.h b/include/core/storage/graph/edge/edges_hashmap_container.h index 3b432017..60cc43a8 100644 --- a/include/core/storage/graph/edge/edges_hashmap_container.h +++ b/include/core/storage/graph/edge/edges_hashmap_container.h @@ -34,17 +34,17 @@ namespace morphstore { class EdgesHashMapContainer : public EdgesContainer { protected: - std::unordered_map edges; + std::unordered_map edges; public: - std::string container_description() const override { return "unordered_map"; } + std::string container_description() const override { return "unordered_map"; } void allocate(const uint64_t expected_edges) override { EdgesContainer::allocate(expected_edges); this->edges.reserve(expected_edges); } - void insert_edge(const Edge e) override { edges[e.getId()] = e; } + void insert_edge(const EdgeWithId e) override { edges[e.getId()] = e; } bool exists_edge(const uint64_t id) const override { if (edges.find(id) == edges.end()) { @@ -53,7 +53,7 @@ namespace morphstore { return true; } - Edge get_edge(uint64_t id) override { return edges[id]; } + EdgeWithId get_edge(uint64_t id) override { return edges[id]; } uint64_t edge_count() const { return edges.size(); } @@ -61,10 +61,10 @@ namespace morphstore { auto [index_size, data_size] = EdgesContainer::get_size(); // container for indexes: - index_size += sizeof(std::unordered_map); + index_size += sizeof(std::unordered_map); // index size of edge: size of id and sizeof pointer index_size += edges.size() * sizeof(uint64_t); - data_size += edges.size() * Edge::size_in_bytes(); + data_size += edges.size() * EdgeWithId::size_in_bytes(); return {index_size, data_size}; } diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index fd832a2b..04c4165b 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -28,9 +28,9 @@ #include "edges_container.h" #include +#include #include #include -#include namespace morphstore { // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! @@ -38,9 +38,9 @@ namespace morphstore { class EdgesVectorArrayContainer : public EdgesContainer { protected: static const inline uint64_t edge_array_size = 4096; - static const inline uint64_t edges_per_array = edge_array_size / sizeof(Edge); + static const inline uint64_t edges_per_array = edge_array_size / sizeof(EdgeWithId); - using edge_array = std::array; + using edge_array = std::array; std::vector edges; uint64_t number_of_edges = 0; @@ -60,7 +60,7 @@ namespace morphstore { public: std::string container_description() const override { - return "vector>"; + return "vector>"; } void allocate(const uint64_t expected_edges) override { @@ -74,7 +74,7 @@ namespace morphstore { } } - void insert_edge(Edge e) { + void insert_edge(EdgeWithId e) { auto array_number = get_edge_array_number(e.getId()); auto array_pos = get_pos_in_array(e.getId()); @@ -101,7 +101,7 @@ namespace morphstore { return edges.at(array_number)[pos_in_array].isValid(); } - Edge get_edge(uint64_t id) override { + EdgeWithId get_edge(uint64_t id) override { uint64_t array_number = get_edge_array_number(id); uint64_t pos_in_array = get_pos_in_array(id); diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 52ae2044..dc967643 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -45,7 +45,9 @@ namespace morphstore { struct Adjacency_List_Size_Visitor { size_t operator()(const adjacency_column c) const { return c->get_size_used_byte(); } - size_t operator()(const adjacency_vector v) const { return sizeof(std::vector) + (v->size() * sizeof(uint64_t)); } + size_t operator()(const adjacency_vector v) const { + return sizeof(std::vector) + (v->size() * sizeof(uint64_t)); + } }; struct Adjacency_List_OutDegree_Visitor { @@ -53,8 +55,7 @@ namespace morphstore { // assuming compressed col has the same value count (would not work for RLE) return c->get_count_values(); } - uint64_t operator()(const adjacency_vector v) const { - return v->size(); } + uint64_t operator()(const adjacency_vector v) const { return v->size(); } }; // maps the a list of outgoing edges (ids) to a vertex-id @@ -63,7 +64,8 @@ namespace morphstore { // as default formats allocate to much memory for small columns // TODO: compress based on blocksize of format (as data smaller than blocksize gets not compressed?!) - // TODO: as function parameter f.i. in change_min_compr_degree -> recall finalize and morph to current_compression + // TODO: as function parameter f.i. in change_min_compr_degree -> recall finalize and morph to + // current_compression uint64_t min_compr_degree = 1024; // convert big-enough adj-vector to a (read-only) adj-column @@ -95,6 +97,30 @@ namespace morphstore { #endif } + protected: + // function that adds multiple edges (list of neighbors) at once to vertex + void add_to_vertex_edges_mapping(uint64_t sourceId, const std::vector edge_ids) override { + // avoid inserting an empty adjacencyVector (waste of memory) + if (edge_ids.size() == 0) { + return; + } + + std::vector *adjacencyVector; + if (auto entry = adjacencylistPerVertex->find(sourceId); entry != adjacencylistPerVertex->end()) { + if (std::holds_alternative(entry->second)) { + throw std::runtime_error("Not implemented to add edges, if adj. list is a (compressed) column"); + } + + adjacencyVector = std::get(entry->second); + } else { + adjacencyVector = new std::vector(); + adjacencylistPerVertex->insert({sourceId, adjacencyVector}); + } + + adjacencyVector->reserve(edge_ids.size()); + adjacencyVector->insert(adjacencyVector->end(), edge_ids.begin(), edge_ids.end()); + } + public: ~AdjacencyList() { for (auto [id, adj_list] : *this->adjacencylistPerVertex) { @@ -132,49 +158,15 @@ namespace morphstore { } // adding a single edge to vertex: - void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { + uint64_t add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { Edge e = Edge(sourceId, targetId, type); - add_edges(sourceId, {e}); - } - - // function that adds multiple edges (list of neighbors) at once to vertex - void add_edges(uint64_t sourceId, const std::vector edgesToAdd) override { - if (!vertices->exists_vertex(sourceId)) { - throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); - } - - // avoid inserting an empty adjacencyVector (waste of memory) - if (edgesToAdd.size() == 0) { - return; - } - - std::vector *adjacencyVector; - if (auto entry = adjacencylistPerVertex->find(sourceId); entry != adjacencylistPerVertex->end()) { - if (std::holds_alternative(entry->second)) { - throw std::runtime_error("Not implemented to add edges, if adj. list is a (compressed) column"); - } - - adjacencyVector = std::get(entry->second); - } else { - adjacencyVector = new std::vector(); - adjacencylistPerVertex->insert({sourceId, adjacencyVector}); - } - - adjacencyVector->reserve(edgesToAdd.size()); - - for (const auto edge : edgesToAdd) { - if (!vertices->exists_vertex(edge.getTargetId())) { - throw std::runtime_error("Target not found :" + edge.to_string()); - } - edges->add_edge(edge); - adjacencyVector->push_back(edge.getId()); - } + return add_edges(sourceId, {e})[0]; } uint64_t get_min_compr_degree() { return min_compr_degree; } // get number of neighbors of vertex with id - uint64_t get_out_degree(uint64_t id) override { + uint64_t get_out_degree(uint64_t id) const override { auto entry = adjacencylistPerVertex->find(id); if (entry == adjacencylistPerVertex->end()) { return 0; @@ -183,7 +175,7 @@ namespace morphstore { } } - std::vector get_outgoing_edge_ids(uint64_t id) { + std::vector get_outgoing_edge_ids(uint64_t id) const override { // basically column -> vector (as convinient to use in other methods) // maybe better idea would be to return a uint64_t* instead (together with a size value) std::vector edge_ids; @@ -210,19 +202,6 @@ namespace morphstore { return edge_ids; } - // get the neighbors-ids into vector for BFS alg. - // todo: this is actually format generic and can be pulled to graph.h - std::vector get_neighbors_ids(uint64_t id) override { - std::vector targetVertexIds; - - for (uint64_t const edgeId : get_outgoing_edge_ids(id)) { - assert(edges->exists_edge(edgeId)); - targetVertexIds.push_back(edges->get_edge(edgeId).getTargetId()); - } - - return targetVertexIds; - } - // morphes the adj-lists to the given target_format // !!! first time overhead: as convert each vector to a column (finalizing) !!! void morph(GraphCompressionFormat target_format) override { @@ -230,8 +209,8 @@ namespace morphstore { this->finalize(); #if DEBUG - std::cout << "Compressing graph format specific data structures using: " << graph_compr_f_to_string(target_format) - << std::endl; + std::cout << "Compressing graph format specific data structures using: " + << graph_compr_f_to_string(target_format) << std::endl; auto entry_count = adjacencylistPerVertex->size(); int progress = 0; #endif @@ -303,20 +282,6 @@ namespace morphstore { return {index_size, data_size}; } - // for debugging: print neighbors a vertex - void print_neighbors_of_vertex(uint64_t id) override { - std::cout << std::endl << "Neighbours for Vertex with id " << id << std::endl; - auto edge_ids = get_outgoing_edge_ids(id); - - if (edge_ids.size() == 0) { - std::cout << " No outgoing edges for vertex with id: " << id << std::endl; - } else { - for (const auto edge_id : edge_ids) { - print_edge_by_id(edge_id); - } - } - } - void statistics() override { Graph::statistics(); std::cout << "Number of adjacency lists:" << adjacencylistPerVertex->size() << std::endl; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index a91dec05..ba1e9a82 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -41,6 +41,41 @@ namespace morphstore { column_base *offset_column; column_base *edgeId_column; + protected: + // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC + void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) override { + // TODO: throw error if not in order of vertex-ids ASC inserted (currently will only produce rubbish data) + // TODO: handle if sourceIDs are skipped + // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and + // insert offsets for skipped vertices + + // avoid writting more than reserved (as fixed sized columns) + assert(expectedEdgeCount >= getEdgeCount()); + + // currently only read-only if compressed + if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { + throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + + graph_compr_f_to_string(current_compression)); + } + + uint64_t *offset_data = offset_column->get_data(); + uint64_t offset = offset_data[sourceID]; + uint64_t nextOffset = offset + edge_ids.size(); + + uint64_t *edgeId_data = edgeId_column->get_data(); + // TODO: get copy to work (should be faster than loop) + // std::copy(edge_ids.begin(), edge_ids.end(), edgeId_data); + for (auto edge_id : edge_ids) { + edgeId_data[offset] = edge_id; + offset++; + } + + // to avoid buffer overflow: + if (sourceID < getExpectedVertexCount() - 1) { + offset_data[sourceID + 1] = nextOffset; + } + } + public: ~CSR() { free(offset_column); @@ -75,53 +110,12 @@ namespace morphstore { } // TODO: add a single edge in graph arrays -> needs a memory reallocating strategy - void add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { + uint64_t add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { throw std::runtime_error("Singe edge addition not yet implemented for CSR" + sourceId + targetId + type); } - // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC - // every vertex id contains a list of its neighbors - void add_edges(uint64_t sourceID, const std::vector edgesToAdd) override { - // TODO: throw error if not in order of vertex-ids ASC inserted (currently will only produce rubbish data) - // TODO: handle if sourceIDs are skipped - // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and - // insert offsets for skipped vertices - assert(expectedEdgeCount >= getEdgeCount() + edgesToAdd.size()); - - // currently only read-only if compressed - if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { - throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + - graph_compr_f_to_string(current_compression)); - } - - uint64_t *offset_data = offset_column->get_data(); - uint64_t offset = offset_data[sourceID]; - uint64_t nextOffset = offset + edgesToAdd.size(); - - if (!vertices->exists_vertex(sourceID)) { - throw std::runtime_error("Source-id not found " + std::to_string(sourceID)); - } - - // fill the arrays - // TODO: fill array using memcpy? (put edgeIds into vector as prepare step) - uint64_t *edgeId_data = edgeId_column->get_data(); - for (const auto &edge : edgesToAdd) { - if (!vertices->exists_vertex(edge.getTargetId())) { - throw std::runtime_error("Target not found " + edge.to_string()); - } - edgeId_data[offset] = edge.getId(); - edges->add_edge(edge); - ++offset; - } - - // to avoid buffer overflow: - if (sourceID < getExpectedVertexCount() - 1) { - offset_data[sourceID + 1] = nextOffset; - } - } - // get number of edges of vertex with id - uint64_t get_out_degree(uint64_t id) override { + uint64_t get_out_degree(uint64_t id) const override { // decompressing offset_column in order to read correct offset // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); @@ -150,17 +144,6 @@ namespace morphstore { } } - // function to return a vector of ids of neighbors for BFS alg. - std::vector get_neighbors_ids(uint64_t id) override { - std::vector targetVertexIds; - for (auto edge_id : get_outgoing_edge_ids(id)) { - assert(edges->exists_edge(edge_id)); - targetVertexIds.push_back(edges->get_edge(edge_id).getTargetId()); - } - - return targetVertexIds; - } - void morph(GraphCompressionFormat target_format) override { #if DEBUG std::cout << "Morphing graph format specific data structures from " @@ -193,7 +176,7 @@ namespace morphstore { return {index_size, data_size}; } - std::vector get_outgoing_edge_ids(uint64_t id) { + std::vector get_outgoing_edge_ids(uint64_t id) const override { assert(vertices->exists_vertex(id)); std::vector out_edge_ids; @@ -222,16 +205,6 @@ namespace morphstore { double edgeId_column_compr_ratio() { return compression_ratio(edgeId_column, current_compression); } - // for debugging: - // TODO: simply by using a get_outgoing_edges(id) method - void print_neighbors_of_vertex(uint64_t id) override { - std::cout << "Neighbours for Vertex with id " << id << std::endl; - - for (auto const edge_id : get_outgoing_edge_ids(id)) { - print_edge_by_id(edge_id); - } - } - std::string get_column_info(const column_base *column) { return " values: " + std::to_string(column->get_count_values()) + " size in bytes: " + std::to_string(column->get_size_used_byte()) + diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 663de5d2..a0dc535b 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -57,6 +57,8 @@ namespace morphstore { std::unique_ptr vertices; std::unique_ptr edges; + virtual void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) = 0; + public: Graph(EdgesContainerType edges_container_type) : Graph(VerticesContainerType::VectorArrayContainer, edges_container_type) {} @@ -111,11 +113,9 @@ namespace morphstore { return vertices->add_vertex(type, props); }; - // function which returns a pointer to vertex by id VertexWithProperties get_vertex(uint64_t id) { return vertices->get_vertex_with_properties(id); } - // function which returns a pointer to edge by id - EdgeWithProperties get_edge(uint64_t id) { return edges->get_edge_with_properties(id); } + EdgeWithIdAndProperties get_edge(uint64_t id) { return edges->get_edge_with_properties(id); } // function to return a list of pair < vertex id, degree > DESC: // TODO: move into seperate header and use graph as input parameter @@ -172,14 +172,60 @@ namespace morphstore { edges->set_edge_properties(id, properties); }; - // -------------------- pure virtual functions -------------------- - virtual std::string get_storage_format() const = 0; - virtual void add_edge(uint64_t from, uint64_t to, unsigned short int rel) = 0; - virtual void add_edges(uint64_t sourceID, const std::vector relations) = 0; + virtual uint64_t add_edge(uint64_t from, uint64_t to, unsigned short int type) = 0; virtual void morph(GraphCompressionFormat target_format) = 0; - virtual uint64_t get_out_degree(uint64_t id) = 0; - virtual std::vector get_neighbors_ids(uint64_t id) = 0; + virtual std::vector get_outgoing_edge_ids(uint64_t id) const = 0; + virtual uint64_t get_out_degree(uint64_t id) const = 0; + + // function to return a vector of ids of neighbors for BFS alg. + std::vector get_neighbors_ids(uint64_t id) const { + std::vector targetVertexIds; + for (auto edge_id : get_outgoing_edge_ids(id)) { + assert(edges->exists_edge(edge_id)); + targetVertexIds.push_back(edges->get_edge(edge_id).getTargetId()); + } + + return targetVertexIds; + }; + + std::vector add_edges(uint64_t sourceId, const std::vector edges_to_add) { + std::vector edge_ids; + + if (!vertices->exists_vertex(sourceId)) { + throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); + } + + for (auto edge : edges_to_add) { + if (!vertices->exists_vertex(edge.getTargetId())) { + throw std::runtime_error("Target not found :" + edge.to_string()); + } + edge_ids.push_back(edges->add_edge(edge)); + } + + add_to_vertex_edges_mapping(sourceId, edge_ids); + + return edge_ids; + }; + + std::vector add_edges(uint64_t sourceId, const std::vector edges_to_add) { + std::vector edge_ids; + + if (!vertices->exists_vertex(sourceId)) { + throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); + } + + for (auto edge_with_props : edges_to_add) { + if (auto edge = edge_with_props.getEdge(); !vertices->exists_vertex(edge.getTargetId())) { + throw std::runtime_error("Target not found :" + edge.to_string()); + } + edge_ids.push_back(edges->add_edge(edge_with_props)); + } + + add_to_vertex_edges_mapping(sourceId, edge_ids); + + return edge_ids; + }; virtual std::pair get_size_of_graph() const { // including vertices + its properties + its type dict @@ -203,8 +249,18 @@ namespace morphstore { // -------------------- debugging functions -------------------- - // for debugging - virtual void print_neighbors_of_vertex(uint64_t id) = 0; + void print_neighbors_of_vertex(uint64_t id) { + std::cout << std::endl << "Neighbours for Vertex with id " << id << std::endl; + auto edge_ids = get_outgoing_edge_ids(id); + + if (edge_ids.size() == 0) { + std::cout << " No outgoing edges for vertex with id: " << id << std::endl; + } else { + for (const auto edge_id : edge_ids) { + print_edge_by_id(edge_id); + } + } + } virtual void statistics() { std::cout << "---------------- Statistics ----------------" << std::endl; diff --git a/include/core/storage/graph/importer/ldbc_import.h b/include/core/storage/graph/importer/ldbc_import.h index 7dbcc9a3..0adf9af1 100644 --- a/include/core/storage/graph/importer/ldbc_import.h +++ b/include/core/storage/graph/importer/ldbc_import.h @@ -64,8 +64,7 @@ namespace morphstore { // unordered_map for lookup system-id and its in the graph (for further processing, e.g. filling the edge_array // in the right order) - std::unordered_map> vertexEdgesLookup; - std::unordered_map> edgeProperties; + std::unordered_map> vertexEdgesLookup; public: // directory including a static/ and dynamic/ directory like in /ldbc_snb_datagen/social_network/ @@ -303,7 +302,6 @@ namespace morphstore { edgeTypeLookup.clear(); vertexTypeLookup.clear(); edgesPaths.clear(); - edgeProperties.clear(); verticesPaths.clear(); vertexEdgesLookup.clear(); } @@ -597,7 +595,7 @@ namespace morphstore { // insert edge into vertexRealtionsLookup: vertexEdgesLookup[sourceVertexId].push_back( - morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber)); + EdgeWithProperties(sourceVertexId, targetVertexId, edgeTypeNumber)); } else { // with properties means: toID is until the next delimiter, and then the value // for the property @@ -607,10 +605,10 @@ namespace morphstore { value = row; // insert edge into vertexEdgesLookup with its edge-property: - auto edge = morphstore::Edge(sourceVertexId, targetVertexId, edgeTypeNumber); - vertexEdgesLookup[sourceVertexId].push_back(edge); // assuming all properties of an edge are defined in the same file - edgeProperties[edge.getId()] = {{propertyKey, value}}; + auto edge = EdgeWithProperties(sourceVertexId, targetVertexId, edgeTypeNumber, + {{propertyKey, value}}); + vertexEdgesLookup[sourceVertexId].push_back(edge); } } start = i; // set new starting point for buffer (otherwise it's concatenated) @@ -647,12 +645,6 @@ namespace morphstore { auto edges = vertexEdgesLookup[vertexID]; // add edge data: graph.add_edges(vertexID, edges); - for (auto edge : edges) { - auto entry = edgeProperties.find(edge.getId()); - if (entry != edgeProperties.end()) { - graph.set_edge_properties(entry->first, entry->second); - } - } } } diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 62a31b0b..cf6232fe 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -70,7 +70,7 @@ int main(void) { << std::endl; std::cout << "Compression-Format | compression-time | offset-column compr. ratio" << " | edgeId-column compr. ratio | access of edges of " - << std::to_string(number_of_random_access) + " random vertices | full edge-list iterate" << std::endl; + << std::to_string(number_of_random_access) + " random vertices" << std::endl; for (auto current_f : compr_formats) { for (int exec = 0; exec < number_of_executions; exec++) { @@ -101,6 +101,6 @@ int main(void) { return 0; #else - throw std::invalid_argument("Where are the ldbc files??"); + throw std::invalid_argument("Where are the ldbc files??"); #endif } diff --git a/src/microbenchmarks/graph/edge_storage_benchmark.cpp b/src/microbenchmarks/graph/edge_storage_benchmark.cpp index 6d60bfd2..4d359d6d 100644 --- a/src/microbenchmarks/graph/edge_storage_benchmark.cpp +++ b/src/microbenchmarks/graph/edge_storage_benchmark.cpp @@ -62,7 +62,7 @@ int main(void) { std::vector edges; for (int i = 0; i < edge_count; i++) { - edges.push_back(Edge(i, vertex_id, vertex_id, 0)); + edges.push_back(Edge(vertex_id, vertex_id, 0)); } auto start = highResClock::now(); diff --git a/test/core/storage/graph/simple/simple_graph_test.h b/test/core/storage/graph/simple/simple_graph_test.h index 997893ee..d9746575 100644 --- a/test/core/storage/graph/simple/simple_graph_test.h +++ b/test/core/storage/graph/simple/simple_graph_test.h @@ -49,18 +49,22 @@ template void simpleGraphFormatTest(void) { uint64_t v2 = graph->add_vertex(0); uint64_t v3 = graph->add_vertex(0); - auto e1 = morphstore::Edge(v1, v2, 1); - - graph->add_edges(v1, {e1}); - graph->set_edge_properties(e1.getId(), {{"rating", 42}, {"description", "has the answer to everything"}}); + auto v1_edge_ids = + graph->add_edges(v1, {morphstore::EdgeWithProperties( + v1, v2, 1, {{"rating", 42}, {"description", "has the answer to everything"}})}); graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v3, 1)}); // (DEBUG) graph->statistics(); + graph->print_neighbors_of_vertex(0); + /* graph->print_neighbors_of_vertex(v1); + graph->print_neighbors_of_vertex(v2); + graph->print_neighbors_of_vertex(v3); */ + assert(graph->getVertexCount() == 3); assert(graph->getEdgeCount() == 3); - assert((int)graph->get_edge(e1.getId()).getProperties().size() == 2); + assert((int)graph->get_edge(v1_edge_ids[0]).getProperties().size() == 2); assert(graph->get_out_degree(v3) == 0); assert(graph->get_out_degree(v1) == 1); assert(graph->get_out_degree(v2) == 2); @@ -69,13 +73,9 @@ template void simpleGraphFormatTest(void) { graph->statistics(); -/* graph->print_neighbors_of_vertex(v1); - graph->print_neighbors_of_vertex(v2); - graph->print_neighbors_of_vertex(v3); */ - assert(graph->get_out_degree(v3) == 0); assert(graph->get_out_degree(v1) == 1); assert(graph->get_out_degree(v2) == 2); - //assert(false); + // assert(false); } From 7c996bd876e4f32bd08973360a15417af64791e3 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 25 May 2020 16:59:08 +0200 Subject: [PATCH 176/216] Add dynamic_vbp as a graph compression format --- .../core/storage/graph/graph_compr_format.h | 35 ++++++++++++++----- .../adjList_graph_compression_benchmark.cpp | 9 +++-- .../graph/csr_graph_compression_benchmark.cpp | 3 +- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index 2b5a0d38..126c1ffb 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -40,12 +41,12 @@ namespace morphstore { // example layout: dynamic_vbp_f<512, 32, 8> using ve = vectorlib::scalar>; - // TODO use column_base (currently not working as template argument deduction/substitution fails) using column_uncompr = column; + using column_dyn_vbp = column; using column_delta = column; using column_for = column; - enum class GraphCompressionFormat { DELTA, FOR, UNCOMPRESSED }; + enum class GraphCompressionFormat { DELTA, FOR, UNCOMPRESSED, DYNAMIC_VBP }; std::string graph_compr_f_to_string(GraphCompressionFormat format) { std::string desc; @@ -60,6 +61,9 @@ namespace morphstore { case GraphCompressionFormat::FOR: desc = "Frame of Reference (Default)"; break; + case GraphCompressionFormat::DYNAMIC_VBP: + desc = "Dynamic vertical bitpacking (Default)"; + break; } return desc; @@ -85,6 +89,9 @@ namespace morphstore { case GraphCompressionFormat::FOR: result = morph(old_col); break; + case GraphCompressionFormat::DYNAMIC_VBP: + result = morph(old_col); + break; case GraphCompressionFormat::UNCOMPRESSED: // handled by src_f == trg_f break; @@ -97,10 +104,8 @@ namespace morphstore { result = morph(old_col); } else { // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); - result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f); - - delete uncompr_col; + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); } break; } @@ -110,14 +115,26 @@ namespace morphstore { result = morph(old_col); } else { // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col); - result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f); - delete uncompr_col; + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + case GraphCompressionFormat::DYNAMIC_VBP: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_dyn_vbp *old_col = dynamic_cast(column); + result = morph(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + // delete_in_col = true as temporary uncompr_col should always be deleted + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); } break; } } + // free input column if possible if (result != column && delete_in_col) { delete column; } diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index aa6557a4..b8c9feae 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -50,16 +50,17 @@ int main(void) { const int number_of_random_access = 1000; std::vector compr_formats = {GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR, + GraphCompressionFormat::DYNAMIC_VBP, GraphCompressionFormat::UNCOMPRESSED}; - std::vector min_compr_degrees = {1024, 500, 100}; + std::vector min_compr_degrees = {1024, 500, 100, 64, 1}; // Load ldbc graph std::unique_ptr graph = std::make_unique(); std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); ldbcImport->import(*graph); - // prepare random-access + // prepare random-access (TODO: makes only sense if column_ratio is high enough) --> also measure full iterate here std::random_device rd; std::uniform_int_distribution dist(0, graph->getVertexCount() - 1); std::vector random_accesses; @@ -73,6 +74,10 @@ int main(void) { for (auto min_compr_degree : min_compr_degrees) { for (auto current_f : compr_formats) { + if (min_compr_degree < 100 && (current_f != GraphCompressionFormat::DYNAMIC_VBP || current_f != GraphCompressionFormat::UNCOMPRESSED)) { + continue; + } + graph->set_min_compr_degree(min_compr_degree); for (int exec = 0; exec < number_of_executions; exec++) { diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index cf6232fe..92a88863 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -50,7 +50,8 @@ int main(void) { const int number_of_random_access = 1000; std::vector compr_formats = {GraphCompressionFormat::UNCOMPRESSED, - GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR}; + GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR, + GraphCompressionFormat::DYNAMIC_VBP}; // Load ldbc graph std::unique_ptr graph = std::make_unique(); From 4b4dd33c0a7a3baa986be726ef9b81e01b29d502 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 25 May 2020 17:00:24 +0200 Subject: [PATCH 177/216] Correct column-ratio by not counting empty adjacency lists --- include/core/storage/graph/formats/adjacencylist.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index dc967643..0d5f2a48 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -91,6 +91,9 @@ namespace morphstore { vectors_transformed++; } } + // TODO: higher-min compr degree -> transform columns back to vector using: + // new std::vector() + // adjacency_vector dest(src, src + n); } #if DEBUG std::cout << "Transformed " << vectors_transformed << " vectors into columns" << std::endl; @@ -252,9 +255,11 @@ namespace morphstore { return total_compr_ratio; } + // ratio of adjacency columns (rest would be vectors) double column_ratio() const { + // neither coloumns or vectors if (getEdgeCount() == 0) { - return 1; + return -1; } uint64_t column_count = 0; @@ -264,7 +269,7 @@ namespace morphstore { } } - return (double)column_count / getEdgeCount(); + return (double)column_count / adjacencylistPerVertex->size(); } // for measuring the size in bytes: From 406e54535a232576776f3dc0ded271bf62e5e23d Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 25 May 2020 19:00:00 +0200 Subject: [PATCH 178/216] Set min-compr-degree based on blocksize of compression format --- .../storage/graph/formats/adjacencylist.h | 23 ++++++++++++------- .../core/storage/graph/graph_compr_format.h | 23 ++++++++++++++++++- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 0d5f2a48..897180e1 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -62,9 +62,7 @@ namespace morphstore { std::unordered_map *adjacencylistPerVertex = new std::unordered_map(); - // as default formats allocate to much memory for small columns - // TODO: compress based on blocksize of format (as data smaller than blocksize gets not compressed?!) - // TODO: as function parameter f.i. in change_min_compr_degree -> recall finalize and morph to + // as formats allocate to much memory for small columns // current_compression uint64_t min_compr_degree = 1024; @@ -86,7 +84,7 @@ namespace morphstore { (*adjacencylistPerVertex)[id] = adj_col; - // as v is not needed anymore and allocated using new + // as vector is not needed anymore and allocated using new delete adj_vector; vectors_transformed++; } @@ -208,8 +206,17 @@ namespace morphstore { // morphes the adj-lists to the given target_format // !!! first time overhead: as convert each vector to a column (finalizing) !!! void morph(GraphCompressionFormat target_format) override { - // transform big enough vectors into columns - this->finalize(); + morph(target_format, false); + } + + // as if blocksize > size of adjlist -> stays uncompressed but still allocates a whole block + void morph(GraphCompressionFormat target_format, bool blocksize_based_min_degree) { + if (blocksize_based_min_degree) { + set_min_compr_degree(graph_compr_f_block_size(target_format)); + } else { + // transform big enough vectors into columns + this->finalize(); + } #if DEBUG std::cout << "Compressing graph format specific data structures using: " @@ -291,8 +298,8 @@ namespace morphstore { Graph::statistics(); std::cout << "Number of adjacency lists:" << adjacencylistPerVertex->size() << std::endl; std::cout << "Min. degree for compression: " << min_compr_degree << std::endl; - std::cout << "Column ratio:" << column_ratio() << std::endl; - std::cout << "Compression ratio:" << compr_ratio() << std::endl; + std::cout << "Column/Vector ratio: " << column_ratio() << std::endl; + std::cout << "Compression ratio: " << compr_ratio() << std::endl; std::cout << "--------------------------------------------" << std::endl; std::cout << std::endl << std::endl; } diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index 126c1ffb..ec881f59 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -38,7 +38,6 @@ namespace morphstore { // TODO: allow also other vector extensions (switch from safe_morph to morph) - // example layout: dynamic_vbp_f<512, 32, 8> using ve = vectorlib::scalar>; using column_uncompr = column; @@ -69,6 +68,28 @@ namespace morphstore { return desc; } + // !! assuming using ve = vectorlib::scalar> + size_t inline graph_compr_f_block_size(GraphCompressionFormat format) { + size_t block_size = 1; + + switch (format) { + case GraphCompressionFormat::DELTA: + block_size = 1024; + break; + case GraphCompressionFormat::UNCOMPRESSED: + block_size = 1; + break; + case GraphCompressionFormat::FOR: + block_size = 1024; + break; + case GraphCompressionFormat::DYNAMIC_VBP: + block_size = 64; + break; + } + + return block_size; + } + // casting the column to the actual column type before morphing (as compiler could not derive it) // delete_old_col -> delete input column after morphing (if the result is not the input column) const column_base *morph_graph_col(const column_base *column, const GraphCompressionFormat src_f, From c3e82c4a283afd14850fe618090690038f6cd8b3 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 25 May 2020 19:26:57 +0200 Subject: [PATCH 179/216] Benchmark full_iterate for adj-list graph .. as only a few adjacency lists can be columns --- .../adjList_graph_compression_benchmark.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index b8c9feae..3d455df1 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -35,11 +35,13 @@ struct CompressionBenchmarkEntry { double compression_ratio; double column_ratio; int64_t random_access_time; + int64_t full_iterate; std::string to_string() { return "|" + graph_compr_f_to_string(compr_format) + "|" + std::to_string(min_compr_degree) + "|" + std::to_string(compression_time) + "|" + std::to_string(compression_ratio) + "|" + - std::to_string(column_ratio) + "|" + std::to_string(random_access_time); + std::to_string(column_ratio) + "|" + std::to_string(random_access_time) + "|" + + std::to_string(full_iterate); } }; @@ -70,11 +72,12 @@ int main(void) { std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; std::cout << "Compression-Format | minimum degree for compression | compression-time | " - << "compr. ratio | column ratio | access of edges of 5000 random vertices" << std::endl; + << "compr. ratio | column ratio | access of edges of 5000 random vertices | full-iterate" << std::endl; for (auto min_compr_degree : min_compr_degrees) { for (auto current_f : compr_formats) { - if (min_compr_degree < 100 && (current_f != GraphCompressionFormat::DYNAMIC_VBP || current_f != GraphCompressionFormat::UNCOMPRESSED)) { + if (min_compr_degree < 100 && !(current_f == GraphCompressionFormat::DYNAMIC_VBP || + current_f == GraphCompressionFormat::UNCOMPRESSED)) { continue; } @@ -104,6 +107,15 @@ int main(void) { } current_try.random_access_time = get_duration(start); + // full iterate + auto vertex_count = graph->getVertexCount(); + start = highResClock::now(); + for (uint64_t id = 0; id < vertex_count; id++) { + graph->get_outgoing_edge_ids(id); + } + + current_try.full_iterate = get_duration(start); + std::cout << current_try.to_string() << std::endl; } } From 329261382b09951ecd5bdfa020be5bc04ebff762 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 25 May 2020 19:41:13 +0200 Subject: [PATCH 180/216] Fix compression benchmark output --- .../graph/adjList_graph_compression_benchmark.cpp | 6 +++--- .../graph/csr_graph_compression_benchmark.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index 3d455df1..104debf2 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -38,9 +38,9 @@ struct CompressionBenchmarkEntry { int64_t full_iterate; std::string to_string() { - return "|" + graph_compr_f_to_string(compr_format) + "|" + std::to_string(min_compr_degree) + "|" + + return graph_compr_f_to_string(compr_format) + "|" + std::to_string(min_compr_degree) + "|" + std::to_string(compression_time) + "|" + std::to_string(compression_ratio) + "|" + - std::to_string(column_ratio) + "|" + std::to_string(random_access_time) + "|" + + std::to_string(column_ratio) + "|" + std::to_string(random_access_time) + "|" + std::to_string(full_iterate); } }; @@ -72,7 +72,7 @@ int main(void) { std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; std::cout << "Compression-Format | minimum degree for compression | compression-time | " - << "compr. ratio | column ratio | access of edges of 5000 random vertices | full-iterate" << std::endl; + << "compr. ratio | column ratio | access of edges of 5000 random vertices | full-iterate " << std::endl; for (auto min_compr_degree : min_compr_degrees) { for (auto current_f : compr_formats) { diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 92a88863..263b61f9 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -37,7 +37,7 @@ struct CompressionBenchmarkEntry { int64_t full_iterate; std::string to_string() { - return "|" + graph_compr_f_to_string(compr_format) + "|" + std::to_string(compression_time) + "|" + + return graph_compr_f_to_string(compr_format) + "|" + std::to_string(compression_time) + "|" + std::to_string(offset_col_compression_ratio) + "|" + std::to_string(edgeId_col_compression_ratio) + "|" + std::to_string(random_access_time); } From ab6835c05356b2d35d54b03d17f68cfc1a4dba03 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 29 May 2020 16:37:27 +0200 Subject: [PATCH 181/216] Replace hard coded block-sizes --- include/core/storage/graph/graph_compr_format.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index ec881f59..e4e0e7f5 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -68,22 +68,21 @@ namespace morphstore { return desc; } - // !! assuming using ve = vectorlib::scalar> size_t inline graph_compr_f_block_size(GraphCompressionFormat format) { size_t block_size = 1; switch (format) { case GraphCompressionFormat::DELTA: - block_size = 1024; + block_size = DEFAULT_DELTA_DYNAMIC_VBP_F(ve)::m_BlockSize; break; case GraphCompressionFormat::UNCOMPRESSED: - block_size = 1; + block_size = uncompr_f::m_BlockSize; break; case GraphCompressionFormat::FOR: - block_size = 1024; + block_size = DEFAULT_FOR_DYNAMIC_VBP_F(ve)::m_BlockSize; break; case GraphCompressionFormat::DYNAMIC_VBP: - block_size = 64; + block_size = DEFAULT_DYNAMIC_VBP_F(ve)::m_BlockSize; break; } From 137ce369fabbea1ca58150a84bc3cff3cd5b1989 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 29 May 2020 17:06:45 +0200 Subject: [PATCH 182/216] Use format blocksize as min_compr_degree by default --- .../storage/graph/formats/adjacencylist.h | 21 ++++++++++--------- .../adjList_graph_compression_benchmark.cpp | 5 +++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 897180e1..f01b912f 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -91,7 +91,9 @@ namespace morphstore { } // TODO: higher-min compr degree -> transform columns back to vector using: // new std::vector() - // adjacency_vector dest(src, src + n); + // adjacency_vector adj_vec(src, src + n); + // (*adjacencylistPerVertex)[id] = adj_vec; + // delete old column } #if DEBUG std::cout << "Transformed " << vectors_transformed << " vectors into columns" << std::endl; @@ -152,7 +154,9 @@ namespace morphstore { void set_min_compr_degree(uint64_t new_min_compr_degree) { if (new_min_compr_degree > min_compr_degree) { // allowing this would need re-transforming finalized columns to vectors - throw std::runtime_error("Only supporting an decreasing minimum compression degree"); + throw std::runtime_error("Only supporting an decreasing minimum compression degree (new: " + + std::to_string(new_min_compr_degree) + + ", current: " + std::to_string(min_compr_degree) + ")"); } this->min_compr_degree = new_min_compr_degree; finalize(); @@ -204,14 +208,12 @@ namespace morphstore { } // morphes the adj-lists to the given target_format - // !!! first time overhead: as convert each vector to a column (finalizing) !!! - void morph(GraphCompressionFormat target_format) override { - morph(target_format, false); - } - - // as if blocksize > size of adjlist -> stays uncompressed but still allocates a whole block + void morph(GraphCompressionFormat target_format) override { morph(target_format, true); } + + // ! vector<->column conversion overhead if min_degree is different void morph(GraphCompressionFormat target_format, bool blocksize_based_min_degree) { if (blocksize_based_min_degree) { + // as if blocksize > size of adjlist -> stays uncompressed but still allocates a whole block set_min_compr_degree(graph_compr_f_block_size(target_format)); } else { // transform big enough vectors into columns @@ -231,8 +233,7 @@ namespace morphstore { } progress++; #endif - // currently min_compr_degree is final in adj_list and determines which adj-lists are - // are columns (and not a vector) + // adj. lists >= min_compr_degree are columns if (std::holds_alternative(adj_list)) { auto old_adj_col = std::get(adj_list); // const_cast needed as map-value is not constant diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index 104debf2..3ea421bb 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -89,10 +89,11 @@ int main(void) { current_try.min_compr_degree = graph->get_min_compr_degree(); // restore start state - graph->morph(GraphCompressionFormat::UNCOMPRESSED); + graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); auto start = highResClock::now(); - graph->morph(current_f); + // "false" as otherwise blocksize would be set based on format + graph->morph(current_f, false); // compression time current_try.compression_time = get_duration(start); From 6800fc0f6fa92bd2eeb440171101e8d28811f044 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 31 May 2020 22:31:07 +0200 Subject: [PATCH 183/216] Start with `morph_saving_offsets` --- include/core/morphing/morph_saving_offsets.h | 179 ++++++++++++++++++ include/core/storage/column.h | 22 +++ .../morphing/morph_saving_offsets_test.cpp | 53 ++++++ 3 files changed, 254 insertions(+) create mode 100644 include/core/morphing/morph_saving_offsets.h create mode 100644 test/core/morphing/morph_saving_offsets_test.cpp diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h new file mode 100644 index 00000000..b68e19d6 --- /dev/null +++ b/include/core/morphing/morph_saving_offsets.h @@ -0,0 +1,179 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file morph_saving_offset.h + * @brief based on morph.h, just calling morph_batch_t for every block (if blocksize > 1) + */ + +#ifndef MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H +#define MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace morphstore { + +// **************************************************************************** +// Column-level +// **************************************************************************** + +// ---------------------------------------------------------------------------- +// General interface +// ---------------------------------------------------------------------------- + +/** + * @brief A struct wrapping the actual morph_saving_offsets-operator. + * + * This is necessary to enable partial template specialization, which is + * required, since some compressed formats have their own template parameters. + */ +template struct morph_saving_offsets_t { + /** + * @brief Morph_with_offsets-operator. Changes the (compressed) format of the given + * column from the source format `t_src_f` to the destination format + * `t_dst_f` without logically changing the data. + * + * This function is deleted by default, to guarantee that using this struct + * with a format combination it is not specialized for causes a compiler + * error, not a linker error. + * + * @param inCol The data represented in the source format. + * @return The same data represented in the destination format. + */ + static column_with_blockoffsets *apply(const column *inCol) = delete; +}; + +/** + * A convenience function wrapping the morph-operator. + * + * Changes the (compressed) format of the given column from the source format + * `t_src_f` to the destination format `t_dst_f` without logically changing the + * data. + * + * @param inCol The data represented in the source format. + * @return The same data represented in the destination format. + */ +template +column_with_blockoffsets *morph_saving_offsets(const column *inCol) { + return morph_saving_offsets_t::apply(inCol); +} + +// ---------------------------------------------------------------------------- +// Partial specialization for morphing from a format to itself +// ---------------------------------------------------------------------------- + +/** + * @brief A template specialization of the morph-operator handling the case + * when the source and the destination format are the same. + * + * It merely returns the given column without doing any work. + */ +template struct morph_saving_offsets_t { + static column_with_blockoffsets *apply(const column *inCol) { + return column_with_blockoffsets(inCol); + }; +}; + +/** + * @brief A template specialization of the morph-operator handling the case + * when the source and the destination format are both uncompressed. + * + * We need to make this case explicit, since otherwise, the choice of the + * right partial template specialization is ambiguous for the compiler. + */ +template struct morph_saving_offsets_t { + static column_with_blockoffsets *apply(const column *inCol) { + return new column_with_blockoffsets(inCol); + }; +}; + +// ---------------------------------------------------------------------------- +// Partial specialization for all compressing morph operators +// ---------------------------------------------------------------------------- + +template +struct morph_saving_offsets_t { + using src_f = uncompr_f; + + static column_with_blockoffsets *apply(const column *inCol) { + if (src_f::m_BlockSize == 1) { + return new column_with_blockoffsets(morph(inCol)); + } + + std::vector* block_offsets = new std::vector(); + + const size_t countLog = inCol->get_count_values(); + const size_t outCountLogCompr = round_down_to_multiple(countLog, t_dst_f::m_BlockSize); + const size_t outSizeRestByte = uncompr_f::get_size_max_byte(countLog - outCountLogCompr); + block_offsets->reserve(outCountLogCompr + 1); + + const uint8_t * in8 = inCol->get_data(); + + auto outCol = new column( + get_size_max_byte_any_len(countLog) + ); + uint8_t * out8 = outCol->get_data(); + const uint8_t * const initOut8 = out8; + + // TODO: save block_offsets (call morph_batch_t based on blocksize) + morph_batch( + in8, out8, outCountLogCompr + ); + const size_t sizeComprByte = out8 - initOut8; + + // needed for last block + if(outSizeRestByte) { + out8 = column::create_data_uncompr_start(out8); + memcpy(out8, in8, outSizeRestByte); + } + + outCol->set_meta_data( + countLog, out8 - initOut8 + outSizeRestByte, sizeComprByte + ); + + return new column_with_blockoffsets(outCol, block_offsets); + } +}; + +// ---------------------------------------------------------------------------- +// Partial specialization for all decompressing morph operators +// ---------------------------------------------------------------------------- + +// as uncompressed has a blocksize of 1 --> no need to save blockoffsets +template +struct morph_saving_offsets_t { + using dst_f = uncompr_f; + + static + const column_with_blockoffsets * + apply(const column * inCol) { + return new column_with_blockoffsets(morph(inCol)); + } +}; + +} + +#endif //MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H diff --git a/include/core/storage/column.h b/include/core/storage/column.h index 4f2884e7..277cfd06 100644 --- a/include/core/storage/column.h +++ b/include/core/storage/column.h @@ -309,6 +309,28 @@ class column : public column_base { } }; +// used to only partial decompress column blocks (for random access) +// blockoffsets should only be saved, if blocksize > 1 +template< class F> +struct column_with_blockoffsets { + const column * col; + // TODO: use std::optional + std::vector* block_offsets; + + column_with_blockoffsets(const column * c) : column_with_blockoffsets(c, new std::vector()) {} + + column_with_blockoffsets(const column * c, std::vector* offsets) { + col = c; + block_offsets = offsets; + } + + ~column_with_blockoffsets() { + // ? deleting the column might be not always wanted + delete col; + delete block_offsets; + } +}; + } #endif //MORPHSTORE_CORE_STORAGE_COLUMN_H diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp new file mode 100644 index 00000000..4f94be67 --- /dev/null +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -0,0 +1,53 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file morph_saving_offsets_test.cpp + * @brief Tests morph_saving_offsets. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace morphstore; +using namespace vectorlib; + +using ve = scalar>; + +// **************************************************************************** +// Main program. +// **************************************************************************** + +int main(void) { + auto origCol = generate_sorted_unique(3000); + + auto col_with_offsets = morph_saving_offsets(origCol); + + assert(col_with_offsets->block_offsets->size() == 3); + + return false; +} \ No newline at end of file From 3c017aa44b722b231e8b533019dd0cec854c4da1 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 31 May 2020 22:31:33 +0200 Subject: [PATCH 184/216] Check if SSE was defined as SSE is necessary for these tests --- test/core/morphing/CMakeLists.txt | 39 ++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/test/core/morphing/CMakeLists.txt b/test/core/morphing/CMakeLists.txt index 87bd57df..60fdcde4 100644 --- a/test/core/morphing/CMakeLists.txt +++ b/test/core/morphing/CMakeLists.txt @@ -1,27 +1,38 @@ if ( CTEST_ALL OR CTEST_MORPHING ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/delta_test_app ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/k_wise_ns_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/morph_saving_offsets_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/vbp_test_app ) - add_executable( delta_test_app delta_test.cpp ) - add_executable( k_wise_ns_test_app k_wise_ns_test.cpp ) + add_executable( morph_saving_offsets_app morph_saving_offsets_test.cpp ) add_executable( vbp_test_app vbp_test.cpp ) + + if (SSE) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/delta_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/k_wise_ns_test_app ) + + add_executable( delta_test_app delta_test.cpp ) + add_executable( k_wise_ns_test_app k_wise_ns_test.cpp ) + + target_compile_options( delta_test_app PRIVATE + # space to add custom flags for THIS SPECIFIC TARGET + ) + target_compile_options( k_wise_ns_test_app PRIVATE + # space to add custom flags for THIS SPECIFIC TARGET + ) + + target_link_libraries( delta_test_app PRIVATE "-ldl" ) + target_link_libraries( k_wise_ns_test_app PRIVATE "-ldl" ) + + add_test( delta_test delta_test_app ) + add_test( k_wise_ns_test k_wise_ns_test_app ) + endif(SSE) - target_compile_options( delta_test_app PRIVATE - # space to add custom flags for THIS SPECIFIC TARGET - ) - target_compile_options( k_wise_ns_test_app PRIVATE - # space to add custom flags for THIS SPECIFIC TARGET - ) target_compile_options( vbp_test_app PRIVATE # space to add custom flags for THIS SPECIFIC TARGET ) - target_link_libraries( delta_test_app PRIVATE "-ldl" ) - target_link_libraries( k_wise_ns_test_app PRIVATE "-ldl" ) target_link_libraries( vbp_test_app PRIVATE "-ldl" ) + target_link_libraries( morph_saving_offsets_app PRIVATE "-ldl" ) - add_test( delta_test delta_test_app ) - add_test( k_wise_ns_test k_wise_ns_test_app ) + add_test( morph_saving_offsets_test morph_saving_offsets_app ) add_test( vbp_test vbp_test_app ) endif() \ No newline at end of file From 582aa5aa3f62e8bfed61d64833a94ca09de441c6 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 1 Jun 2020 16:46:11 +0200 Subject: [PATCH 185/216] Implement `morph_saving_offsets` --- include/core/morphing/morph_saving_offsets.h | 27 +++++++--- include/core/storage/column.h | 24 --------- .../core/storage/column_with_blockoffsets.h | 51 +++++++++++++++++++ .../morphing/morph_saving_offsets_test.cpp | 32 +++++++++--- 4 files changed, 97 insertions(+), 37 deletions(-) create mode 100644 include/core/storage/column_with_blockoffsets.h diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index b68e19d6..ffe8d251 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -119,14 +120,16 @@ struct morph_saving_offsets_t { using src_f = uncompr_f; static column_with_blockoffsets *apply(const column *inCol) { - if (src_f::m_BlockSize == 1) { + const size_t t_BlockSize = t_dst_f::m_BlockSize; + + if (t_BlockSize == 1) { return new column_with_blockoffsets(morph(inCol)); } - std::vector* block_offsets = new std::vector(); + std::vector *block_offsets = new std::vector(); const size_t countLog = inCol->get_count_values(); - const size_t outCountLogCompr = round_down_to_multiple(countLog, t_dst_f::m_BlockSize); + const size_t outCountLogCompr = round_down_to_multiple(countLog, t_BlockSize); const size_t outSizeRestByte = uncompr_f::get_size_max_byte(countLog - outCountLogCompr); block_offsets->reserve(outCountLogCompr + 1); @@ -136,12 +139,22 @@ struct morph_saving_offsets_t { get_size_max_byte_any_len(countLog) ); uint8_t * out8 = outCol->get_data(); + // so block_offset[x] is for x-th block + block_offsets->push_back(out8); const uint8_t * const initOut8 = out8; - // TODO: save block_offsets (call morph_batch_t based on blocksize) - morph_batch( - in8, out8, outCountLogCompr - ); + const size_t countBlocks = countLog / t_BlockSize; + + // morphing each block and save the offset + for(size_t blockIdx = 0; blockIdx < countBlocks; blockIdx++) { + // only t_BlockSizeLog as only on block at a time should be morphed + morph_batch( + in8, out8, t_BlockSize + ); + + block_offsets->push_back(out8); + } + const size_t sizeComprByte = out8 - initOut8; // needed for last block diff --git a/include/core/storage/column.h b/include/core/storage/column.h index 277cfd06..45a7ecc4 100644 --- a/include/core/storage/column.h +++ b/include/core/storage/column.h @@ -308,29 +308,5 @@ class column : public column_base { ); } }; - -// used to only partial decompress column blocks (for random access) -// blockoffsets should only be saved, if blocksize > 1 -template< class F> -struct column_with_blockoffsets { - const column * col; - // TODO: use std::optional - std::vector* block_offsets; - - column_with_blockoffsets(const column * c) : column_with_blockoffsets(c, new std::vector()) {} - - column_with_blockoffsets(const column * c, std::vector* offsets) { - col = c; - block_offsets = offsets; - } - - ~column_with_blockoffsets() { - // ? deleting the column might be not always wanted - delete col; - delete block_offsets; - } -}; - - } #endif //MORPHSTORE_CORE_STORAGE_COLUMN_H diff --git a/include/core/storage/column_with_blockoffsets.h b/include/core/storage/column_with_blockoffsets.h new file mode 100644 index 00000000..bf04bebb --- /dev/null +++ b/include/core/storage/column_with_blockoffsets.h @@ -0,0 +1,51 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file column_with_blockoffsets.h + * @brief Wrapper around column + its block-offsets + */ + +#ifndef MORPHSTORE_CORE_STORAGE_COLUMN_WITH_BLOCKOFFSETS_H +#define MORPHSTORE_CORE_STORAGE_COLUMN_WITH_BLOCKOFFSETS_H + +#include + +namespace morphstore { + +// used to allow only partial decompression of column blocks (for random access) +// blockoffsets should only be saved, if blocksize > 1 +template struct column_with_blockoffsets { + const column *col; + // TODO: use std::optional + std::vector *block_offsets; + + column_with_blockoffsets(const column *c) : column_with_blockoffsets(c, new std::vector()) {} + + column_with_blockoffsets(const column *c, std::vector *offsets) { + col = c; + block_offsets = offsets; + } + + ~column_with_blockoffsets() { + // ? deleting the column might be not always wanted + delete col; + delete block_offsets; + } +}; +} +#endif //MORPHSTORE_CORE_STORAGE_COLUMN_WITH_BLOCKOFFSETS_H diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp index 4f94be67..b0c77995 100644 --- a/test/core/morphing/morph_saving_offsets_test.cpp +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -24,12 +24,15 @@ #include #include #include +#include #include #include #include +#include #include #include #include +#include #include @@ -37,17 +40,34 @@ using namespace morphstore; using namespace vectorlib; using ve = scalar>; - -// **************************************************************************** -// Main program. -// **************************************************************************** +using compr_f = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); int main(void) { auto origCol = generate_sorted_unique(3000); - auto col_with_offsets = morph_saving_offsets(origCol); + auto col_with_offsets = morph_saving_offsets(origCol); assert(col_with_offsets->block_offsets->size() == 3); - return false; + const uint8_t * second_block_offset = col_with_offsets->block_offsets->at(1); + auto block_size = compr_f::m_BlockSize; + auto alloc_size = block_size * sizeof(uint64_t); + auto decompr_col_block = new column(alloc_size); + decompr_col_block->set_meta_data(block_size, alloc_size); + uint8_t * out8 = decompr_col_block->get_data(); + + // decompress a single block of a column + morph_batch( + second_block_offset, + out8, + block_size + ); + + // should start with 1024 + print_columns( + print_buffer_base::decimal, + decompr_col_block, + "single column block"); + + return 0; } \ No newline at end of file From 58a4f352e731f282de02e5b83f6b306987e7d766 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 2 Jun 2020 15:43:38 +0200 Subject: [PATCH 186/216] Set reasonable LDBC_DIR default value --- CMakeLists.txt | 4 ++-- .../graph/adjList_graph_compression_benchmark.cpp | 2 +- src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp | 2 +- test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h | 2 +- test/core/storage/graph/ldbc/ldbc_graph_test.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e21f82e..03b4fdf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,8 @@ morph_flag(-march=native) # remove build type to allow for custom flag handling set(CMAKE_BUILD_TYPE "") -# add resource directory for ldbc graph (something like "${HOME}/ldbc/ldbc_snb_datagen/social_network/") -#morph_flag(-DLDBC_DIR="") +# add resource directory for ldbc graph (something like "$ENV{HOME}/ldbc/ldbc_snb_datagen/social_network/") +morph_flag(-DLDBC_DIR="$ENV{HOME}/ldbc/ldbc_snb_datagen/social_network/") # general compiler settings, meant for all subdirectories and tests morph_flag(-Werror) diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index 3ea421bb..e7e1348e 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -124,6 +124,6 @@ int main(void) { return 0; #else - throw std::invalid_argument("Where are the ldbc files??"); + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif } diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 263b61f9..16585130 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -102,6 +102,6 @@ int main(void) { return 0; #else - throw std::invalid_argument("Where are the ldbc files??"); + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif } diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index fcf5f731..010f451d 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -59,6 +59,6 @@ template void bfs_ldbc_graph_test(void) { std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; // bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); #else - throw std::invalid_argument("Where are the ldbc files??"); + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif } \ No newline at end of file diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index 2811aaa1..803daff3 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -65,6 +65,6 @@ template void ldbcGraphFormatTest(void) { // std::cout << "Measure degree count" << std::endl; // graph->measure_degree_count(targetDir + "graph_degree_count_" + storageFormat + "SF1.csv"); #else - throw std::invalid_argument("Where are the ldbc files??"); + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif } \ No newline at end of file From b250e4a90df88681fe6acf4b58d8fbb95b9e6ca6 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 2 Jun 2020 15:44:39 +0200 Subject: [PATCH 187/216] Remove one evil `const` .. and comment a template specialization (which does not work here) --- include/core/morphing/morph_saving_offsets.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index ffe8d251..08469eda 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -91,12 +91,14 @@ column_with_blockoffsets *morph_saving_offsets(const column *i * when the source and the destination format are the same. * * It merely returns the given column without doing any work. + * @todo: reneable this (currently this would be invalid, as potential block_offsets are not saved) + * currently has to be catched beforehand */ -template struct morph_saving_offsets_t { +/* template struct morph_saving_offsets_t { static column_with_blockoffsets *apply(const column *inCol) { - return column_with_blockoffsets(inCol); + return new column_with_blockoffsets(inCol); }; -}; +}; */ /** * @brief A template specialization of the morph-operator handling the case @@ -180,13 +182,10 @@ template struct morph_saving_offsets_t { using dst_f = uncompr_f; - static - const column_with_blockoffsets * - apply(const column * inCol) { + static column_with_blockoffsets *apply(const column *inCol) { return new column_with_blockoffsets(morph(inCol)); } }; - } #endif //MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H From a02107ae46cdad491c8e522e815851e570e334cd Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 2 Jun 2020 16:06:05 +0200 Subject: [PATCH 188/216] Add template free column_with_blockoffsets_base not an ideal solution .. --- .../core/storage/column_with_blockoffsets.h | 64 +++++++++++++------ .../morphing/morph_saving_offsets_test.cpp | 6 +- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/include/core/storage/column_with_blockoffsets.h b/include/core/storage/column_with_blockoffsets.h index bf04bebb..70469b30 100644 --- a/include/core/storage/column_with_blockoffsets.h +++ b/include/core/storage/column_with_blockoffsets.h @@ -27,25 +27,53 @@ namespace morphstore { +// interface (needed as current Graph formats don't use templates) +class column_with_blockoffsets_base { + public: + virtual ~column_with_blockoffsets_base() {} + + virtual const std::vector *get_block_offsets() = 0; + virtual const uint8_t *get_block_offset(size_t pos) = 0; + virtual const column_base *get_column() = 0; + virtual size_t get_block_size() = 0; + virtual size_t get_size_used_byte() = 0; +}; + // used to allow only partial decompression of column blocks (for random access) // blockoffsets should only be saved, if blocksize > 1 -template struct column_with_blockoffsets { - const column *col; - // TODO: use std::optional - std::vector *block_offsets; - - column_with_blockoffsets(const column *c) : column_with_blockoffsets(c, new std::vector()) {} - - column_with_blockoffsets(const column *c, std::vector *offsets) { - col = c; - block_offsets = offsets; - } - - ~column_with_blockoffsets() { - // ? deleting the column might be not always wanted - delete col; - delete block_offsets; - } +template class column_with_blockoffsets : public column_with_blockoffsets_base { + static_assert(std::is_base_of::value, "column: template parameter F must be a subclass of format"); + + private: + const column *col; + // TODO: use std::optional + const std::vector *block_offsets; + + public: + column_with_blockoffsets(const column *c) + : column_with_blockoffsets(c, new std::vector()) {} + + column_with_blockoffsets(const column *c, std::vector *offsets) { + col = c; + block_offsets = offsets; + } + + ~column_with_blockoffsets() { + // ? deleting the column might be not always wanted + delete col; + delete block_offsets; + } + + const std::vector *get_block_offsets() { return block_offsets; } + const uint8_t *get_block_offset(size_t pos) { return block_offsets->at(pos); } + + const column *get_column() { return col; } + + size_t get_block_size() { return F::m_BlockSize; } + + size_t get_size_used_byte() { + return col->get_size_used_byte() + (block_offsets->size() * sizeof(uint8_t *)); + } }; -} +} // namespace morphstore #endif //MORPHSTORE_CORE_STORAGE_COLUMN_WITH_BLOCKOFFSETS_H diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp index b0c77995..fbb78334 100644 --- a/test/core/morphing/morph_saving_offsets_test.cpp +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -47,10 +47,10 @@ int main(void) { auto col_with_offsets = morph_saving_offsets(origCol); - assert(col_with_offsets->block_offsets->size() == 3); + assert(col_with_offsets->get_block_offsets()->size() == 3); - const uint8_t * second_block_offset = col_with_offsets->block_offsets->at(1); - auto block_size = compr_f::m_BlockSize; + const uint8_t * second_block_offset = col_with_offsets->get_block_offset(1); + auto block_size = col_with_offsets->get_block_size(); auto alloc_size = block_size * sizeof(uint64_t); auto decompr_col_block = new column(alloc_size); decompr_col_block->set_meta_data(block_size, alloc_size); From ab94a1447bd6b381d63fe3da80e340d094743afb Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 2 Jun 2020 22:54:17 +0200 Subject: [PATCH 189/216] Put morph_graph_col into seperate header --- include/core/morphing/graph/morph_graph_col.h | 135 ++++++++++++++++++ .../storage/graph/formats/adjacencylist.h | 1 + .../core/storage/graph/graph_compr_format.h | 107 ++------------ 3 files changed, 144 insertions(+), 99 deletions(-) create mode 100644 include/core/morphing/graph/morph_graph_col.h diff --git a/include/core/morphing/graph/morph_graph_col.h b/include/core/morphing/graph/morph_graph_col.h new file mode 100644 index 00000000..989c93f2 --- /dev/null +++ b/include/core/morphing/graph/morph_graph_col.h @@ -0,0 +1,135 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file morph_graph_col.h + * @brief helper for morphing graph columns (template-free columns). Basically need to cast to template column as it + * cannot be derieved + * @todo Remove this helper and make graph formats accept templates (can use normal morph() then) + */ + +#ifndef MORPHSTORE_GRAPH_MORPH_GRAPH_COL_H +#define MORPHSTORE_GRAPH_MORPH_GRAPH_COL_H + +#include +#include +#include + +#include + +namespace morphstore { + using column_uncompr = column; + using column_dyn_vbp = column; + using column_delta = column; + using column_for = column; + + // casting the column to the actual column type before morphing (as compiler could not derive it) + // delete_old_col -> delete input column after morphing (if the result is not the input column) + const column_base *morph_graph_col(const column_base *column, const GraphCompressionFormat src_f, + const GraphCompressionFormat trg_f, bool delete_in_col = false) { + if (src_f == trg_f) { + return column; + } + + const column_base *result = column; + + switch (src_f) { + case GraphCompressionFormat::UNCOMPRESSED: { + const column_uncompr *old_col = dynamic_cast(column); + switch (trg_f) { + case GraphCompressionFormat::DELTA: + result = morph(old_col); + break; + case GraphCompressionFormat::FOR: + result = morph(old_col); + break; + case GraphCompressionFormat::DYNAMIC_VBP: + result = morph(old_col); + break; + case GraphCompressionFormat::UNCOMPRESSED: + // handled by src_f == trg_f + break; + } + break; + } + case GraphCompressionFormat::DELTA: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_delta *old_col = dynamic_cast(column); + result = morph(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + case GraphCompressionFormat::FOR: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_for *old_col = dynamic_cast(column); + result = morph(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + case GraphCompressionFormat::DYNAMIC_VBP: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_dyn_vbp *old_col = dynamic_cast(column); + result = morph(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); + // delete_in_col = true as temporary uncompr_col should always be deleted + result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + } + + // free input column if possible + if (result != column && delete_in_col) { + delete column; + } + + if (result == nullptr) { + throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + + " trg: " + graph_compr_f_to_string(trg_f)); + } + + return result; + } + + const column_uncompr *decompress_graph_col(const column_base *column, const GraphCompressionFormat src_f) { + return static_cast( + morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false)); + } + + double compression_ratio(const column_base *col, GraphCompressionFormat col_format) { + auto uncompr_col = decompress_graph_col(col, col_format); + auto ratio = uncompr_col->get_size_used_byte() / (double)col->get_size_used_byte(); + + if (col != uncompr_col) { + delete uncompr_col; + } + + return ratio; + } +} // namespace morphstore + +#endif // MORPHSTORE_GRAPH_MORPH_GRAPH_COL_H \ No newline at end of file diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index f01b912f..40d2db74 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index e4e0e7f5..08a4c34b 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -28,10 +28,9 @@ #include #include #include -#include #include -#include -#include +#include +#include #include #include @@ -40,10 +39,9 @@ namespace morphstore { // TODO: allow also other vector extensions (switch from safe_morph to morph) using ve = vectorlib::scalar>; - using column_uncompr = column; - using column_dyn_vbp = column; - using column_delta = column; - using column_for = column; + using default_vbp = DEFAULT_DYNAMIC_VBP_F(ve); + using default_delta = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); + using default_for = DEFAULT_FOR_DYNAMIC_VBP_F(ve); enum class GraphCompressionFormat { DELTA, FOR, UNCOMPRESSED, DYNAMIC_VBP }; @@ -73,110 +71,21 @@ namespace morphstore { switch (format) { case GraphCompressionFormat::DELTA: - block_size = DEFAULT_DELTA_DYNAMIC_VBP_F(ve)::m_BlockSize; + block_size = default_delta::m_BlockSize; break; case GraphCompressionFormat::UNCOMPRESSED: block_size = uncompr_f::m_BlockSize; break; case GraphCompressionFormat::FOR: - block_size = DEFAULT_FOR_DYNAMIC_VBP_F(ve)::m_BlockSize; + block_size = default_for::m_BlockSize; break; case GraphCompressionFormat::DYNAMIC_VBP: - block_size = DEFAULT_DYNAMIC_VBP_F(ve)::m_BlockSize; + block_size = default_vbp::m_BlockSize; break; } return block_size; } - - // casting the column to the actual column type before morphing (as compiler could not derive it) - // delete_old_col -> delete input column after morphing (if the result is not the input column) - const column_base *morph_graph_col(const column_base *column, const GraphCompressionFormat src_f, - const GraphCompressionFormat trg_f, bool delete_in_col = false) { - if (src_f == trg_f) { - return column; - } - - const column_base *result = column; - - switch (src_f) { - case GraphCompressionFormat::UNCOMPRESSED: { - const column_uncompr *old_col = dynamic_cast(column); - switch (trg_f) { - case GraphCompressionFormat::DELTA: - result = morph(old_col); - break; - case GraphCompressionFormat::FOR: - result = morph(old_col); - break; - case GraphCompressionFormat::DYNAMIC_VBP: - result = morph(old_col); - break; - case GraphCompressionFormat::UNCOMPRESSED: - // handled by src_f == trg_f - break; - } - break; - } - case GraphCompressionFormat::DELTA: { - if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_delta *old_col = dynamic_cast(column); - result = morph(old_col); - } else { - // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); - result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); - } - break; - } - case GraphCompressionFormat::FOR: { - if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_for *old_col = dynamic_cast(column); - result = morph(old_col); - } else { - // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); - result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); - } - break; - } - case GraphCompressionFormat::DYNAMIC_VBP: { - if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_dyn_vbp *old_col = dynamic_cast(column); - result = morph(old_col); - } else { - // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, false); - // delete_in_col = true as temporary uncompr_col should always be deleted - result = morph_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); - } - break; - } - } - - // free input column if possible - if (result != column && delete_in_col) { - delete column; - } - - if (result == nullptr) { - throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + - " trg: " + graph_compr_f_to_string(trg_f)); - } - - return result; - } - - const column_uncompr *decompress_graph_col(const column_base *column, const GraphCompressionFormat src_f, - bool delete_in_col = false) { - return static_cast( - morph_graph_col(column, src_f, GraphCompressionFormat::UNCOMPRESSED, delete_in_col)); - } - - double compression_ratio(const column_base *column, GraphCompressionFormat col_format) { - // TODO: need to delete decompressed_col? - return decompress_graph_col(column, col_format)->get_size_used_byte() / (double)column->get_size_used_byte(); - } } // namespace morphstore #endif // MORPHSTORE_GRAPH_COMPR_FORMAT_H \ No newline at end of file From 17a996f92f3cc2f207ece5db44d33c17a30fd79e Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 2 Jun 2020 22:56:08 +0200 Subject: [PATCH 190/216] WIP add template free morph_saving_offsets to use for csr columns --- .../graph/morph_saving_offsets_graph_col.h | 147 ++++++++++++++++++ include/core/storage/graph/formats/csr.h | 140 +++++++++++------ 2 files changed, 240 insertions(+), 47 deletions(-) create mode 100644 include/core/morphing/graph/morph_saving_offsets_graph_col.h diff --git a/include/core/morphing/graph/morph_saving_offsets_graph_col.h b/include/core/morphing/graph/morph_saving_offsets_graph_col.h new file mode 100644 index 00000000..b6d61335 --- /dev/null +++ b/include/core/morphing/graph/morph_saving_offsets_graph_col.h @@ -0,0 +1,147 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file morph_saving_offsets_graph_col.h + * @brief helper for `morph_saving_offsets()` graph column (template-free column). Basically need to cast to template + * column as it cannot be derieved + * @todo Remove this helper and make graph formats accept templates (can use normal `morph_saving_offsets()` then) + */ + +#ifndef MORPHSTORE_GRAPH_MORPH_SAVING_OFFSETS_GRAPH_COL_H +#define MORPHSTORE_GRAPH_MORPH_SAVING_OFFSETS_GRAPH_COL_H + +#include +#include +#include + +#include +// for simple decompression (very likely removeable) +#include + +#include + +namespace morphstore { + + // casting the column to the actual column type before morphing (as compiler could not derive it) + // delete_old_col -> delete input column after morphing (if the result is not the input column) + column_with_blockoffsets_base *morph_saving_offsets_graph_col(column_with_blockoffsets_base *col_with_offsets, + const GraphCompressionFormat src_f, + const GraphCompressionFormat trg_f, + bool delete_in_col = false) { + if (src_f == trg_f) { + return col_with_offsets; + } + + column_with_blockoffsets_base *result = col_with_offsets; + + auto col = col_with_offsets->get_column(); + + switch (src_f) { + case GraphCompressionFormat::UNCOMPRESSED: { + const column_uncompr *old_col = dynamic_cast(col); + switch (trg_f) { + case GraphCompressionFormat::DELTA: + result = morph_saving_offsets(old_col); + break; + case GraphCompressionFormat::FOR: + result = morph_saving_offsets(old_col); + break; + case GraphCompressionFormat::DYNAMIC_VBP: + result = morph_saving_offsets(old_col); + break; + case GraphCompressionFormat::UNCOMPRESSED: + // handled by src_f == trg_f + break; + } + break; + } + case GraphCompressionFormat::DELTA: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_delta *old_col = dynamic_cast(col); + result = morph_saving_offsets(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, + GraphCompressionFormat::UNCOMPRESSED, false); + result = + morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + case GraphCompressionFormat::FOR: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_for *old_col = dynamic_cast(col); + result = morph_saving_offsets(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, + GraphCompressionFormat::UNCOMPRESSED, false); + result = + morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + case GraphCompressionFormat::DYNAMIC_VBP: { + if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { + const column_dyn_vbp *old_col = dynamic_cast(col); + result = morph_saving_offsets(old_col); + } else { + // as direct morphing is not yet supported .. go via decompressing first + auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, + GraphCompressionFormat::UNCOMPRESSED, false); + // delete_in_col = true as temporary uncompr_col should always be deleted + result = + morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); + } + break; + } + } + + // free input column if possible + if (result != col_with_offsets && delete_in_col) { + delete col_with_offsets; + } + + if (result == nullptr) { + throw std::runtime_error("Did not handle src: " + graph_compr_f_to_string(src_f) + + " trg: " + graph_compr_f_to_string(trg_f)); + } + + return result; + } + +/* const column_uncompr *decompress_part_of_graph_col(const column_base *col, const GraphCompressionFormat src_f) { + // TODO + throw std::runtime_error("Not implemented decompressing a single block"); + } */ + + // TODO: also consider size of blockoffset vector? + double compression_ratio(column_with_blockoffsets_base *col_with_offsets, GraphCompressionFormat col_format) { + auto col = col_with_offsets->get_column(); + auto uncompr_col = decompress_graph_col(col, col_format); + auto ratio = uncompr_col->get_size_used_byte() / (double)col->get_size_used_byte(); + + if (col != uncompr_col) { + delete uncompr_col; + } + + return ratio; + } +} // namespace morphstore + +#endif // MORPHSTORE_GRAPH_MORPH_SAVING_OFFSETS_GRAPH_COL_H \ No newline at end of file diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index ba1e9a82..a982ca25 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -25,6 +25,7 @@ #define MORPHSTORE_CSR_H #include +#include #include #include @@ -38,8 +39,8 @@ namespace morphstore { * offset column: index is vertex-id; column entry contains offset in edgeId array * edgeId column: contains edge id */ - column_base *offset_column; - column_base *edgeId_column; + column_with_blockoffsets_base *offset_column; + column_with_blockoffsets_base *edgeId_column; protected: // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC @@ -58,11 +59,11 @@ namespace morphstore { graph_compr_f_to_string(current_compression)); } - uint64_t *offset_data = offset_column->get_data(); + uint64_t *offset_data = offset_column->get_column()->get_data(); uint64_t offset = offset_data[sourceID]; uint64_t nextOffset = offset + edge_ids.size(); - uint64_t *edgeId_data = edgeId_column->get_data(); + uint64_t *edgeId_data = edgeId_column->get_column()->get_data(); // TODO: get copy to work (should be faster than loop) // std::copy(edge_ids.begin(), edge_ids.end(), edgeId_data); for (auto edge_id : edge_ids) { @@ -76,10 +77,29 @@ namespace morphstore { } } + // DEBUG function to look into column: + void print_column(const column_base *col, int start, int end) const { + // validate interval (fix otherwise) + int col_size = col->get_count_values(); + if (start < 0 || col_size < start) { + start = 0; + } + if (col_size <= end) { + end = col->get_count_values() - 1; + } + + std::cout << "Printing column from " << start << " to " << end << std::endl; + const uint64_t *data = col->get_data(); + + for (auto pos = start; pos <= end; pos++) { + std::cout << "Index: " << pos << " Value:" << data[pos] << std::endl; + } + } + public: ~CSR() { - free(offset_column); - free(edgeId_column); + delete offset_column; + delete edgeId_column; } CSR(EdgesContainerType edges_container_type) @@ -97,15 +117,19 @@ namespace morphstore { Graph::allocate_graph_structure(numberVertices, numberEdges); const size_t offset_size = numberVertices * sizeof(uint64_t); - offset_column = new column(offset_size); - offset_column->set_meta_data(numberVertices, offset_size); + auto offset_col = new column(offset_size); + offset_col->set_meta_data(numberVertices, offset_size); + // wrapping offset_column + offset_column = new column_with_blockoffsets(offset_col); const size_t edge_ids_size = numberEdges * sizeof(uint64_t); - edgeId_column = new column(edge_ids_size); - edgeId_column->set_meta_data(numberEdges, edge_ids_size); + auto edgeId_col = new column(edge_ids_size); + edgeId_col->set_meta_data(numberEdges, edge_ids_size); + // wrapping edgeId_column + edgeId_column = new column_with_blockoffsets(edgeId_col); // init node array: - uint64_t *offset_data = offset_column->get_data(); + uint64_t *offset_data = offset_col->get_data(); offset_data[0] = 0; } @@ -118,7 +142,8 @@ namespace morphstore { uint64_t get_out_degree(uint64_t id) const override { // decompressing offset_column in order to read correct offset // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read - auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); + // return only relevant block and than work on that + auto uncompr_offset_col = decompress_graph_col(offset_column->get_column(), current_compression); uint64_t *offset_data = uncompr_offset_col->get_data(); uint64_t offset = offset_data[id]; @@ -132,7 +157,7 @@ namespace morphstore { } // deleting temporary column - if (uncompr_offset_col != offset_column) { + if (uncompr_offset_col != offset_column->get_column()) { delete uncompr_offset_col; } @@ -144,6 +169,49 @@ namespace morphstore { } } + std::vector get_outgoing_edge_ids(uint64_t id) const override { + assert(vertices->exists_vertex(id)); + + std::vector out_edge_ids; + // TODO: only decompress relevant block + auto uncompr_offset_col = decompress_graph_col(offset_column->get_column(), current_compression); + uint64_t offset = ((uint64_t *)uncompr_offset_col->get_data())[id]; + + + // TODO: remove + print_column(uncompr_offset_col, id - 20, id + 20); + + if (uncompr_offset_col != offset_column->get_column()) { + delete uncompr_offset_col; + } + + // TODO: decompressing offset_column twice this way (should not be a problem if block cache exists) + uint64_t out_degree = get_out_degree(id); + + out_edge_ids.reserve(out_degree); + + // TODO: only decompress relevant blocks + auto uncompr_edgeId_col = decompress_graph_col(edgeId_column->get_column(), current_compression); + uint64_t *edgeId_data = uncompr_edgeId_col->get_data(); + + // TODO: remove + print_column(uncompr_edgeId_col, 1'000'000, 1'000'020); + std::cout << std::endl << "edge id column offset: " << offset; + std::cout << " vertex out degree: " << out_degree; + std::cout << " edge id column size: " << uncompr_edgeId_col->get_count_values(); + std::cout.flush(); + + assert(offset + out_degree < uncompr_edgeId_col->get_count_values()); + + out_edge_ids.insert(out_edge_ids.end(), edgeId_data + offset, edgeId_data + offset + out_degree); + + if (uncompr_edgeId_col != edgeId_column->get_column()) { + delete uncompr_edgeId_col; + } + + return out_edge_ids; + } + void morph(GraphCompressionFormat target_format) override { #if DEBUG std::cout << "Morphing graph format specific data structures from " @@ -157,10 +225,8 @@ namespace morphstore { return; } - offset_column = - const_cast(morph_graph_col(offset_column, current_compression, target_format, true)); - edgeId_column = - const_cast(morph_graph_col(edgeId_column, current_compression, target_format, true)); + offset_column = morph_saving_offsets_graph_col(offset_column, current_compression, target_format, true); + edgeId_column = morph_saving_offsets_graph_col(edgeId_column, current_compression, target_format, true); this->current_compression = target_format; } @@ -169,46 +235,26 @@ namespace morphstore { std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); - + + // column_meta_data, prepared_for_random_access, .. not included in get_size_used_byte; + index_size += 2 * sizeof(column); index_size += edgeId_column->get_size_used_byte(); index_size += offset_column->get_size_used_byte(); return {index_size, data_size}; } - std::vector get_outgoing_edge_ids(uint64_t id) const override { - assert(vertices->exists_vertex(id)); - - std::vector out_edge_ids; - auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); - uint64_t offset = ((uint64_t *)uncompr_offset_col->get_data())[id]; - - if (uncompr_offset_col != offset_column) { - delete uncompr_offset_col; - } - - // TODO: decompressing offset_column twice this way - uint64_t numberEdges = get_out_degree(id); - - auto uncompr_edgeId_col = decompress_graph_col(edgeId_column, current_compression); - uint64_t *edgeId_data = uncompr_edgeId_col->get_data(); - out_edge_ids.insert(out_edge_ids.end(), edgeId_data + offset, edgeId_data + offset + numberEdges); - - if (uncompr_edgeId_col != edgeId_column) { - delete uncompr_edgeId_col; - } - - return out_edge_ids; - } - double offset_column_compr_ratio() { return compression_ratio(offset_column, current_compression); } double edgeId_column_compr_ratio() { return compression_ratio(edgeId_column, current_compression); } - std::string get_column_info(const column_base *column) { - return " values: " + std::to_string(column->get_count_values()) + - " size in bytes: " + std::to_string(column->get_size_used_byte()) + - " compression ratio: " + std::to_string(compression_ratio(column, current_compression)); + std::string get_column_info(column_with_blockoffsets_base *col_with_offsets) { + auto col = col_with_offsets->get_column(); + + return " values: " + std::to_string(col->get_count_values()) + + " size in bytes: " + std::to_string(col->get_size_used_byte()) + + " compression ratio: " + std::to_string(compression_ratio(col_with_offsets, current_compression)) + + " number of blocks (if blocksize > 1): " + std::to_string(col_with_offsets->get_block_offsets()->size()); } void statistics() override { From 52dbb5c384054a8a0b933e4629b8b1ff39c6de46 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 5 Jun 2020 11:42:06 +0200 Subject: [PATCH 191/216] Improve morph_saving_offsets_test finding a current bug (fixed in the following) --- .../morphing/morph_saving_offsets_test.cpp | 80 ++++++++++++++----- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp index fbb78334..5394fc78 100644 --- a/test/core/morphing/morph_saving_offsets_test.cpp +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -43,31 +44,72 @@ using ve = scalar>; using compr_f = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); int main(void) { - auto origCol = generate_sorted_unique(3000); + // 3 whole blocks + // TODO: also check for partial block + // for last block only morph_batch if it is complete ... (as incomplete block are still uncompressed) + auto origCol = generate_sorted_unique(3072); + + // !! morph saving offsets needs to look if last block can be actually morphed (if not complet -> undefined behaviour?) auto col_with_offsets = morph_saving_offsets(origCol); assert(col_with_offsets->get_block_offsets()->size() == 3); - const uint8_t * second_block_offset = col_with_offsets->get_block_offset(1); - auto block_size = col_with_offsets->get_block_size(); - auto alloc_size = block_size * sizeof(uint64_t); - auto decompr_col_block = new column(alloc_size); - decompr_col_block->set_meta_data(block_size, alloc_size); - uint8_t * out8 = decompr_col_block->get_data(); - - // decompress a single block of a column - morph_batch( - second_block_offset, - out8, - block_size - ); - - // should start with 1024 - print_columns( + auto decompr_col = morph_saving_offsets(col_with_offsets->get_column())->get_column(); + + // asserting correctness of operator + equality_check ec0(decompr_col, origCol); + std::cout << ec0; + + assert(ec0.m_CountValuesEqual); + assert(ec0.m_SizeUsedByteEqual); + + uint64_t* expected = origCol->get_data(); + uint64_t* actual = decompr_col->get_data(); + + // for finding point of error + for (uint64_t i = 0; i < origCol->get_count_values(); i++) { + bool equals = expected[i] == actual[i]; + if (!equals) { + std::cout << "actual: " << actual[i] << " expected: " << expected[i] << std::endl; + std::cout.flush(); + } + } + + // findings: block 0 correctly decompressed, block 1 is off by 1023, block 2 is again correctly decompressed +/* print_columns( print_buffer_base::decimal, - decompr_col_block, - "single column block"); + decompr_col, + "whole decompressed column"); */ + + + //assert(ec0.good()); + + + // asserting correctness of decompressing a single block + auto block_size = col_with_offsets->get_block_size(); + auto alloc_size = block_size * sizeof(uint64_t); + for (uint64_t block = 0; block < col_with_offsets->get_block_offsets()->size(); block++) { + std::cout << "Checking block " << block << "range: " << block * block_size << " .. " << ((block +1) * block_size) - 1 << std::endl; + auto expected_col = generate_sorted_unique(1024, block * 1024); + + const uint8_t *block_offset = col_with_offsets->get_block_offset(block); + auto decompr_col_block = new column(alloc_size); + decompr_col_block->set_meta_data(block_size, alloc_size); + uint8_t *out8 = decompr_col_block->get_data(); + + // decompress a single block of a column + morph_batch(block_offset, out8, block_size); + + equality_check ec_block(expected_col, decompr_col_block); + + std::cout << ec_block; + + if (!ec_block.good()) { + print_columns(print_buffer_base::decimal, decompr_col_block, expected_col, "actual", "expected"); + assert(ec_block.good()); + } + } return 0; } \ No newline at end of file From fe9967b0d252eb3417608afc1f25438c9b022dc2 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 5 Jun 2020 17:18:59 +0200 Subject: [PATCH 192/216] Add test for morphing column blocks and fix bug regarding last block --- include/core/morphing/morph_saving_offsets.h | 10 +- .../core/storage/column_with_blockoffsets.h | 5 +- include/core/utils/equality_check.h | 28 ++++- test/core/morphing/CMakeLists.txt | 10 +- .../core/morphing/morph_column_block_test.cpp | 102 ++++++++++++++++++ .../morphing/morph_saving_offsets_test.cpp | 74 +++---------- 6 files changed, 158 insertions(+), 71 deletions(-) create mode 100644 test/core/morphing/morph_column_block_test.cpp diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index 08469eda..4ce1f06b 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -141,26 +141,26 @@ struct morph_saving_offsets_t { get_size_max_byte_any_len(countLog) ); uint8_t * out8 = outCol->get_data(); - // so block_offset[x] is for x-th block - block_offsets->push_back(out8); const uint8_t * const initOut8 = out8; const size_t countBlocks = countLog / t_BlockSize; // morphing each block and save the offset for(size_t blockIdx = 0; blockIdx < countBlocks; blockIdx++) { + // saving the start address of the block + block_offsets->push_back(out8); + // only t_BlockSizeLog as only on block at a time should be morphed morph_batch( in8, out8, t_BlockSize ); - - block_offsets->push_back(out8); } const size_t sizeComprByte = out8 - initOut8; - // needed for last block + // needed for last block (if incomplete data stays uncompressed) if(outSizeRestByte) { + block_offsets->push_back(out8); out8 = column::create_data_uncompr_start(out8); memcpy(out8, in8, outSizeRestByte); } diff --git a/include/core/storage/column_with_blockoffsets.h b/include/core/storage/column_with_blockoffsets.h index 70469b30..42b86380 100644 --- a/include/core/storage/column_with_blockoffsets.h +++ b/include/core/storage/column_with_blockoffsets.h @@ -37,6 +37,9 @@ class column_with_blockoffsets_base { virtual const column_base *get_column() = 0; virtual size_t get_block_size() = 0; virtual size_t get_size_used_byte() = 0; + bool last_block_compressed() { + return get_column()->get_count_values_uncompr() == 0; + } }; // used to allow only partial decompression of column blocks (for random access) @@ -69,7 +72,7 @@ template class column_with_blockoffsets : public column_with_blockoffs const column *get_column() { return col; } - size_t get_block_size() { return F::m_BlockSize; } + inline size_t get_block_size() { return F::m_BlockSize; } size_t get_size_used_byte() { return col->get_size_used_byte() + (block_offsets->size() * sizeof(uint8_t *)); diff --git a/include/core/utils/equality_check.h b/include/core/utils/equality_check.h index 8e2a7ab3..0c443584 100644 --- a/include/core/utils/equality_check.h +++ b/include/core/utils/equality_check.h @@ -31,6 +31,8 @@ #include #include +#include + namespace morphstore { struct equality_check { @@ -77,6 +79,9 @@ namespace morphstore { }; std::ostream & operator<<( std::ostream & os, const equality_check & ec ) { + const char *data_ok_str = + (ec.m_CountValuesEqual && ec.m_SizeUsedByteEqual) ? equality_check::ok_str(ec.m_DataEqual) : "undefined"; + os << "countValues: " << equality_check::ok_str( ec.m_CountValuesEqual ) << " (expected " << ec.m_CountValuesExp @@ -86,10 +91,29 @@ namespace morphstore { << " (expected " << ec.m_SizeUsedByteExp << ", found " << ec.m_SizeUsedByteFnd << ')' << std::endl - << "data: " << equality_check::ok_str( ec.m_DataEqual ) - << " (this check is only valid, if countValues and sizeUsedByte are ok)" + << "data: " << data_ok_str << std::endl; return os; } + + template void assert_columns_equal(const column *expected_col, const column *actual_col) { + equality_check ec(expected_col, actual_col); + std::cout << ec; + if (!ec.good()) { + uint64_t *expected = expected_col->get_data(); + uint64_t *actual = actual_col->get_data(); + + assert(expected_col->get_count_values() == actual_col->get_count_values()); + + // printing only different entries + for (uint64_t i = 0; i < expected_col->get_count_values(); i++) { + if (!(expected[i] == actual[i])) { + std::cout << "pos: " << i << " expected: " << expected[i] << " actual: " << actual[i] << std::endl; + } + } + // print_columns(print_buffer_base::decimal, actual_col, expected_col, "actual", "expected"); + assert(false); + } + } } #endif //MORPHSTORE_CORE_UTILS_EQUALITY_CHECK_H diff --git a/test/core/morphing/CMakeLists.txt b/test/core/morphing/CMakeLists.txt index 60fdcde4..f314130f 100644 --- a/test/core/morphing/CMakeLists.txt +++ b/test/core/morphing/CMakeLists.txt @@ -1,8 +1,10 @@ if ( CTEST_ALL OR CTEST_MORPHING ) - FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/morph_saving_offsets_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/morph_saving_offsets_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/morph_column_block_test_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/morphing/vbp_test_app ) - add_executable( morph_saving_offsets_app morph_saving_offsets_test.cpp ) + add_executable( morph_saving_offsets_test_app morph_saving_offsets_test.cpp ) + add_executable( morph_column_block_test_app morph_column_block_test.cpp ) add_executable( vbp_test_app vbp_test.cpp ) if (SSE) @@ -31,8 +33,8 @@ if ( CTEST_ALL OR CTEST_MORPHING ) ) target_link_libraries( vbp_test_app PRIVATE "-ldl" ) - target_link_libraries( morph_saving_offsets_app PRIVATE "-ldl" ) - add_test( morph_saving_offsets_test morph_saving_offsets_app ) + add_test( morph_column_block_test morph_column_block_test_app ) + add_test( morph_saving_offsets_test morph_saving_offsets_test_app ) add_test( vbp_test vbp_test_app ) endif() \ No newline at end of file diff --git a/test/core/morphing/morph_column_block_test.cpp b/test/core/morphing/morph_column_block_test.cpp new file mode 100644 index 00000000..fd730c75 --- /dev/null +++ b/test/core/morphing/morph_column_block_test.cpp @@ -0,0 +1,102 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file morph_saving_offsets_test.cpp + * @brief Tests morphing blocks based on morph_saving_offsets. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace morphstore; +using namespace vectorlib; + +using ve = scalar>; +using compr_f = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); + +int main(void) { + // 3 whole blocks + // TODO: also check for partial block .. 2 test variants (column_size 3000 and 3072) + // 3000 not working yet + auto column_size = 3000; + + auto orig_col = generate_sorted_unique(column_size); + + // !! morph saving offsets needs to look if last block can be actually morphed (if not complete -> undefined + // behaviour?) + + auto compr_col_with_offsets = morph_saving_offsets(orig_col); + assert(compr_col_with_offsets->get_block_offsets()->size() == 3); + assert(compr_col_with_offsets->last_block_compressed() == (column_size % compr_f::m_BlockSize == 0)); + + // asserting correctness of decompressing a single block + auto block_size = compr_col_with_offsets->get_block_size(); + + for (uint64_t block = 0; block < compr_col_with_offsets->get_block_offsets()->size(); block++) { + auto value_count = block_size; + + bool last_block = (block == (compr_col_with_offsets->get_block_offsets()->size() - 1)); + bool last_block_uncompressed = !compr_col_with_offsets->last_block_compressed(); + const uint8_t *block_offset = compr_col_with_offsets->get_block_offset(block); + + if (last_block && last_block_uncompressed) { + value_count = compr_col_with_offsets->get_column()->get_count_values() % block_size; + } + + std::cout << "Checking block " << block << " range: " << block * block_size << " .. " + << (block * block_size + value_count) - 1 << std::endl; + + auto alloc_size = value_count * sizeof(uint64_t); + + // TODO: refactor into general function: + // checking if block size == 1 (then direct mem_copy) + // checking if last block -> direct mem_copy + right meta data setting (value count < block_size) + // column morph_block(column_with_offset col_with_offsets) + + auto decompr_col_block = new column(alloc_size); + decompr_col_block->set_meta_data(value_count, alloc_size); + uint8_t *out8 = decompr_col_block->get_data(); + + + if (last_block && last_block_uncompressed) { + auto outSizeRestByte = uncompr_f::get_size_max_byte(value_count); + memcpy(out8, block_offset, outSizeRestByte); + } else { + morph_batch(block_offset, out8, block_size); + } + + auto expected_col = generate_sorted_unique(value_count, block * 1024); + assert_columns_equal(expected_col, decompr_col_block); + } + + return 0; +} \ No newline at end of file diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp index 5394fc78..25bb812b 100644 --- a/test/core/morphing/morph_saving_offsets_test.cpp +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -47,69 +47,25 @@ int main(void) { // 3 whole blocks // TODO: also check for partial block // for last block only morph_batch if it is complete ... (as incomplete block are still uncompressed) - auto origCol = generate_sorted_unique(3072); + auto orig_col = generate_sorted_unique(3072); - // !! morph saving offsets needs to look if last block can be actually morphed (if not complet -> undefined behaviour?) + auto compr_col_with_offsets = morph_saving_offsets(orig_col); + assert(compr_col_with_offsets->get_block_offsets()->size() == 3); + assert(compr_col_with_offsets->last_block_compressed()); - auto col_with_offsets = morph_saving_offsets(origCol); + std::cout << "Checking morph_saving_offset() result column equals the one from morph()" << std::endl; + // BUG: more bytes used with morph_saving_offsets + auto compr_col = morph(orig_col); + assert_columns_equal(compr_col, compr_col_with_offsets->get_column()); - assert(col_with_offsets->get_block_offsets()->size() == 3); - auto decompr_col = morph_saving_offsets(col_with_offsets->get_column())->get_column(); - - // asserting correctness of operator - equality_check ec0(decompr_col, origCol); - std::cout << ec0; - - assert(ec0.m_CountValuesEqual); - assert(ec0.m_SizeUsedByteEqual); - - uint64_t* expected = origCol->get_data(); - uint64_t* actual = decompr_col->get_data(); - - // for finding point of error - for (uint64_t i = 0; i < origCol->get_count_values(); i++) { - bool equals = expected[i] == actual[i]; - if (!equals) { - std::cout << "actual: " << actual[i] << " expected: " << expected[i] << std::endl; - std::cout.flush(); - } - } - - // findings: block 0 correctly decompressed, block 1 is off by 1023, block 2 is again correctly decompressed -/* print_columns( - print_buffer_base::decimal, - decompr_col, - "whole decompressed column"); */ - - - //assert(ec0.good()); - - - // asserting correctness of decompressing a single block - auto block_size = col_with_offsets->get_block_size(); - auto alloc_size = block_size * sizeof(uint64_t); - for (uint64_t block = 0; block < col_with_offsets->get_block_offsets()->size(); block++) { - std::cout << "Checking block " << block << "range: " << block * block_size << " .. " << ((block +1) * block_size) - 1 << std::endl; - auto expected_col = generate_sorted_unique(1024, block * 1024); - - const uint8_t *block_offset = col_with_offsets->get_block_offset(block); - auto decompr_col_block = new column(alloc_size); - decompr_col_block->set_meta_data(block_size, alloc_size); - uint8_t *out8 = decompr_col_block->get_data(); - - // decompress a single block of a column - morph_batch(block_offset, out8, block_size); - - equality_check ec_block(expected_col, decompr_col_block); - - std::cout << ec_block; - - if (!ec_block.good()) { - print_columns(print_buffer_base::decimal, decompr_col_block, expected_col, "actual", "expected"); - assert(ec_block.good()); - } - } + // TODO: get this one to work !! + // currently BUG: 0. block: ok , 1. block: +1023, 2. block: + 3070 + std::cout << "Checking morph_saving_offset() decompressed equals original column" << std::endl; + auto decompr_col = morph_saving_offsets(compr_col_with_offsets->get_column()); + // uncompr_f blocksize == 1 --> no need to save block offsets + assert(decompr_col->get_block_offsets()->size() == 0); + assert_columns_equal(orig_col, decompr_col->get_column()); return 0; } \ No newline at end of file From 0e754ff28b8bb1d8a4b69f293c86f6d8c511c8db Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 5 Jun 2020 17:47:16 +0200 Subject: [PATCH 193/216] Fix offset last block if it is uncompressed .. added the offset to early --- include/core/morphing/morph_saving_offsets.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index 4ce1f06b..20d94b40 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -160,8 +160,8 @@ struct morph_saving_offsets_t { // needed for last block (if incomplete data stays uncompressed) if(outSizeRestByte) { - block_offsets->push_back(out8); out8 = column::create_data_uncompr_start(out8); + block_offsets->push_back(out8); memcpy(out8, in8, outSizeRestByte); } From c10ad1d22b3ae7ff2a405b894f361d30508cff0a Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 8 Jun 2020 17:49:21 +0200 Subject: [PATCH 194/216] Fix bug decompressing column_with_blockoffsets at once actually need to decompress the blocks one by one .. (see `morph_saving_offsets_test` for reasoning) Lead to a couple of changes: * morph_saving_offsets now accepts column_with_offsets as input * dedicated morph_saving_offset for decompression --- .../graph/morph_saving_offsets_graph_col.h | 46 +-- include/core/morphing/morph_saving_offsets.h | 277 ++++++++++-------- .../core/storage/column_with_blockoffsets.h | 4 +- include/core/storage/graph/formats/csr.h | 23 +- include/core/utils/equality_check.h | 3 +- .../core/morphing/morph_column_block_test.cpp | 15 +- .../morphing/morph_saving_offsets_test.cpp | 48 +-- 7 files changed, 228 insertions(+), 188 deletions(-) diff --git a/include/core/morphing/graph/morph_saving_offsets_graph_col.h b/include/core/morphing/graph/morph_saving_offsets_graph_col.h index b6d61335..c2450baf 100644 --- a/include/core/morphing/graph/morph_saving_offsets_graph_col.h +++ b/include/core/morphing/graph/morph_saving_offsets_graph_col.h @@ -30,30 +30,31 @@ #include #include -// for simple decompression (very likely removeable) -#include #include namespace morphstore { + using column_uncompr = column; + using column_with_offsets_uncompr = column_with_blockoffsets; + using column__with_offsets_dyn_vbp = column_with_blockoffsets; + using column_with_offsets_delta = column_with_blockoffsets; + using column_with_offsets_for = column_with_blockoffsets; // casting the column to the actual column type before morphing (as compiler could not derive it) // delete_old_col -> delete input column after morphing (if the result is not the input column) - column_with_blockoffsets_base *morph_saving_offsets_graph_col(column_with_blockoffsets_base *col_with_offsets, + column_with_blockoffsets_base *morph_saving_offsets_graph_col(column_with_blockoffsets_base *col, const GraphCompressionFormat src_f, const GraphCompressionFormat trg_f, bool delete_in_col = false) { if (src_f == trg_f) { - return col_with_offsets; + return col; } - column_with_blockoffsets_base *result = col_with_offsets; - - auto col = col_with_offsets->get_column(); + auto result = col; switch (src_f) { case GraphCompressionFormat::UNCOMPRESSED: { - const column_uncompr *old_col = dynamic_cast(col); + auto old_col = dynamic_cast(col); switch (trg_f) { case GraphCompressionFormat::DELTA: result = morph_saving_offsets(old_col); @@ -72,12 +73,11 @@ namespace morphstore { } case GraphCompressionFormat::DELTA: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_delta *old_col = dynamic_cast(col); + auto old_col = dynamic_cast(col); result = morph_saving_offsets(old_col); } else { // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, - GraphCompressionFormat::UNCOMPRESSED, false); + auto uncompr_col = morph_saving_offsets_graph_col(col, src_f, GraphCompressionFormat::UNCOMPRESSED, false); result = morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); } @@ -85,12 +85,11 @@ namespace morphstore { } case GraphCompressionFormat::FOR: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_for *old_col = dynamic_cast(col); + auto old_col = dynamic_cast(col); result = morph_saving_offsets(old_col); } else { // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, - GraphCompressionFormat::UNCOMPRESSED, false); + auto uncompr_col = morph_saving_offsets_graph_col(col, src_f, GraphCompressionFormat::UNCOMPRESSED, false); result = morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); } @@ -98,12 +97,11 @@ namespace morphstore { } case GraphCompressionFormat::DYNAMIC_VBP: { if (trg_f == GraphCompressionFormat::UNCOMPRESSED) { - const column_dyn_vbp *old_col = dynamic_cast(col); + auto old_col = dynamic_cast(col); result = morph_saving_offsets(old_col); } else { // as direct morphing is not yet supported .. go via decompressing first - auto uncompr_col = morph_saving_offsets_graph_col(col_with_offsets, src_f, - GraphCompressionFormat::UNCOMPRESSED, false); + auto uncompr_col = morph_saving_offsets_graph_col(col, src_f, GraphCompressionFormat::UNCOMPRESSED, false); // delete_in_col = true as temporary uncompr_col should always be deleted result = morph_saving_offsets_graph_col(uncompr_col, GraphCompressionFormat::UNCOMPRESSED, trg_f, true); @@ -113,8 +111,8 @@ namespace morphstore { } // free input column if possible - if (result != col_with_offsets && delete_in_col) { - delete col_with_offsets; + if (result != col && delete_in_col) { + delete col; } if (result == nullptr) { @@ -125,15 +123,21 @@ namespace morphstore { return result; } -/* const column_uncompr *decompress_part_of_graph_col(const column_base *col, const GraphCompressionFormat src_f) { +/* const column_with_offsets_uncompr *decompress_part_of_graph_col(const column_base *col, const GraphCompressionFormat src_f) { // TODO throw std::runtime_error("Not implemented decompressing a single block"); } */ + column_with_offsets_uncompr *decompress_graph_col(column_with_blockoffsets_base *col, + const GraphCompressionFormat src_f) { + return static_cast( + morph_saving_offsets_graph_col(col, src_f, GraphCompressionFormat::UNCOMPRESSED, false)); + } + // TODO: also consider size of blockoffset vector? double compression_ratio(column_with_blockoffsets_base *col_with_offsets, GraphCompressionFormat col_format) { + auto uncompr_col = decompress_graph_col(col_with_offsets, col_format)->get_column(); auto col = col_with_offsets->get_column(); - auto uncompr_col = decompress_graph_col(col, col_format); auto ratio = uncompr_col->get_size_used_byte() / (double)col->get_size_used_byte(); if (col != uncompr_col) { diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index 20d94b40..ed303827 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -25,167 +25,206 @@ #include #include +#include #include #include #include #include #include -#include #include #include namespace morphstore { -// **************************************************************************** -// Column-level -// **************************************************************************** - -// ---------------------------------------------------------------------------- -// General interface -// ---------------------------------------------------------------------------- + // **************************************************************************** + // Column-level + // **************************************************************************** + + // ---------------------------------------------------------------------------- + // General interface + // ---------------------------------------------------------------------------- -/** - * @brief A struct wrapping the actual morph_saving_offsets-operator. - * - * This is necessary to enable partial template specialization, which is - * required, since some compressed formats have their own template parameters. - */ -template struct morph_saving_offsets_t { /** - * @brief Morph_with_offsets-operator. Changes the (compressed) format of the given - * column from the source format `t_src_f` to the destination format - * `t_dst_f` without logically changing the data. + * @brief A struct wrapping the actual morph_saving_offsets-operator. * - * This function is deleted by default, to guarantee that using this struct - * with a format combination it is not specialized for causes a compiler - * error, not a linker error. + * This is necessary to enable partial template specialization, which is + * required, since some compressed formats have their own template parameters. + */ + template struct morph_saving_offsets_t { + /** + * @brief Morph_with_offsets-operator. Changes the (compressed) format of the given + * column from the source format `t_src_f` to the destination format + * `t_dst_f` without logically changing the data. + * + * This function is deleted by default, to guarantee that using this struct + * with a format combination it is not specialized for causes a compiler + * error, not a linker error. + * + * @param inCol The data represented in the source format + previous block_offsets. + * @return The same data represented in the destination format. + */ + static column_with_blockoffsets *apply(column_with_blockoffsets *inCol) = delete; + }; + + /** + * A convenience function wrapping the morph-saving-offset-operator. + * ! Only works if the block-size = 1 (as otherwise invalid blockoffsets) + * + * Changes the (compressed) format of the given column from the source format + * `t_src_f` to the destination format `t_dst_f` without logically changing the + * data. * * @param inCol The data represented in the source format. * @return The same data represented in the destination format. */ - static column_with_blockoffsets *apply(const column *inCol) = delete; -}; + template + column_with_blockoffsets *morph_saving_offsets(const column *inCol) { + return morph_saving_offsets(new column_with_blockoffsets(inCol)); + } -/** - * A convenience function wrapping the morph-operator. - * - * Changes the (compressed) format of the given column from the source format - * `t_src_f` to the destination format `t_dst_f` without logically changing the - * data. - * - * @param inCol The data represented in the source format. - * @return The same data represented in the destination format. - */ -template -column_with_blockoffsets *morph_saving_offsets(const column *inCol) { - return morph_saving_offsets_t::apply(inCol); -} + /** + * A convenience function wrapping the morph-operator. + * + * Changes the (compressed) format of the given column from the source format + * `t_src_f` to the destination format `t_dst_f` without logically changing the + * data. + * + * @param inCol The data represented in the source format. + * @return The same data represented in the destination format. + */ + template + column_with_blockoffsets *morph_saving_offsets(column_with_blockoffsets *inCol) { + return morph_saving_offsets_t::apply(inCol); + } -// ---------------------------------------------------------------------------- -// Partial specialization for morphing from a format to itself -// ---------------------------------------------------------------------------- + // ---------------------------------------------------------------------------- + // Partial specialization for morphing from a format to itself + // ---------------------------------------------------------------------------- -/** - * @brief A template specialization of the morph-operator handling the case - * when the source and the destination format are the same. - * - * It merely returns the given column without doing any work. - * @todo: reneable this (currently this would be invalid, as potential block_offsets are not saved) - * currently has to be catched beforehand - */ -/* template struct morph_saving_offsets_t { - static column_with_blockoffsets *apply(const column *inCol) { - return new column_with_blockoffsets(inCol); + /** + * @brief A template specialization of the morph-operator handling the case + * when the source and the destination format are the same. + * + * It merely returns the given column without doing any work. + */ + template struct morph_saving_offsets_t { + static column_with_blockoffsets *apply(column_with_blockoffsets *inCol) { return inCol; }; }; -}; */ -/** - * @brief A template specialization of the morph-operator handling the case - * when the source and the destination format are both uncompressed. - * - * We need to make this case explicit, since otherwise, the choice of the - * right partial template specialization is ambiguous for the compiler. - */ -template struct morph_saving_offsets_t { - static column_with_blockoffsets *apply(const column *inCol) { - return new column_with_blockoffsets(inCol); + /** + * @brief A template specialization of the morph-operator handling the case + * when the source and the destination format are both uncompressed. + * + * We need to make this case explicit, since otherwise, the choice of the + * right partial template specialization is ambiguous for the compiler. + */ + template struct morph_saving_offsets_t { + static column_with_blockoffsets *apply(column_with_blockoffsets *inCol) { + return inCol; + }; }; -}; -// ---------------------------------------------------------------------------- -// Partial specialization for all compressing morph operators -// ---------------------------------------------------------------------------- + // ---------------------------------------------------------------------------- + // Partial specialization for all compressing morph operators + // ---------------------------------------------------------------------------- -template -struct morph_saving_offsets_t { - using src_f = uncompr_f; + template + struct morph_saving_offsets_t { + using src_f = uncompr_f; - static column_with_blockoffsets *apply(const column *inCol) { - const size_t t_BlockSize = t_dst_f::m_BlockSize; + static_assert(t_dst_f::m_BlockSize != 1, + "Blocksize of 1 is only expected for uncompr_f .. block-wise morph is useless in that case"); - if (t_BlockSize == 1) { - return new column_with_blockoffsets(morph(inCol)); - } + static column_with_blockoffsets *apply(column_with_blockoffsets *inCol_with_offsets) { + + const size_t t_BlockSize = t_dst_f::m_BlockSize; + + auto inCol = inCol_with_offsets->get_column(); - std::vector *block_offsets = new std::vector(); + std::vector *block_offsets = new std::vector(); - const size_t countLog = inCol->get_count_values(); - const size_t outCountLogCompr = round_down_to_multiple(countLog, t_BlockSize); - const size_t outSizeRestByte = uncompr_f::get_size_max_byte(countLog - outCountLogCompr); - block_offsets->reserve(outCountLogCompr + 1); + const size_t countLog = inCol->get_count_values(); + const size_t outCountLogCompr = round_down_to_multiple(countLog, t_BlockSize); + const size_t outSizeRestByte = uncompr_f::get_size_max_byte(countLog - outCountLogCompr); + block_offsets->reserve(outCountLogCompr + 1); - const uint8_t * in8 = inCol->get_data(); + const uint8_t *in8 = inCol->get_data(); - auto outCol = new column( - get_size_max_byte_any_len(countLog) - ); - uint8_t * out8 = outCol->get_data(); - const uint8_t * const initOut8 = out8; + auto outCol = new column(get_size_max_byte_any_len(countLog)); + uint8_t *out8 = outCol->get_data(); + const uint8_t *const initOut8 = out8; - const size_t countBlocks = countLog / t_BlockSize; + const size_t countBlocks = countLog / t_BlockSize; - // morphing each block and save the offset - for(size_t blockIdx = 0; blockIdx < countBlocks; blockIdx++) { + // morphing each block and save the offset + for (size_t blockIdx = 0; blockIdx < countBlocks; blockIdx++) { // saving the start address of the block block_offsets->push_back(out8); // only t_BlockSizeLog as only on block at a time should be morphed - morph_batch( - in8, out8, t_BlockSize - ); - } - - const size_t sizeComprByte = out8 - initOut8; + morph_batch(in8, out8, t_BlockSize); + } - // needed for last block (if incomplete data stays uncompressed) - if(outSizeRestByte) { - out8 = column::create_data_uncompr_start(out8); - block_offsets->push_back(out8); - memcpy(out8, in8, outSizeRestByte); - } + const size_t sizeComprByte = out8 - initOut8; - outCol->set_meta_data( - countLog, out8 - initOut8 + outSizeRestByte, sizeComprByte - ); + // needed for last block (if incomplete data stays uncompressed) + if (outSizeRestByte) { + out8 = column::create_data_uncompr_start(out8); + block_offsets->push_back(out8); + memcpy(out8, in8, outSizeRestByte); + } - return new column_with_blockoffsets(outCol, block_offsets); - } -}; + outCol->set_meta_data(countLog, out8 - initOut8 + outSizeRestByte, sizeComprByte); -// ---------------------------------------------------------------------------- -// Partial specialization for all decompressing morph operators -// ---------------------------------------------------------------------------- + return new column_with_blockoffsets(outCol, block_offsets); + } + }; -// as uncompressed has a blocksize of 1 --> no need to save blockoffsets -template -struct morph_saving_offsets_t { - using dst_f = uncompr_f; + // ---------------------------------------------------------------------------- + // Partial specialization for all decompressing morph operators + // ---------------------------------------------------------------------------- + + // as uncompressed has a blocksize of 1 --> no need to save blockoffsets + template + struct morph_saving_offsets_t { + using dst_f = uncompr_f; + + static column_with_blockoffsets *apply(column_with_blockoffsets *inCol_with_offset) { + // TODO: morph_batch each block independently (see above) + auto inCol = inCol_with_offset->get_column(); + auto block_offsets = inCol_with_offset->get_block_offsets(); + + const size_t countLog = inCol->get_count_values(); + + const size_t outSizeByte = dst_f::get_size_max_byte(countLog); + auto outCol = new column(outSizeByte); + uint8_t *out8 = outCol->get_data(); + + // !! need to decompress each block seperatly + // example problem: + // delta: morphing multi blocks at once -> block start value = diff to previous block + // morphing one block at a time -> block start value = first actual value of the block + // example: col 0..2047 + // --> morph(): start-values: 0 ; 1 + // --> morph_saving_offsets(): start-values: 0 ; 1024 + for (uint64_t i = 0; i < block_offsets->size(); i++) { + auto offset = block_offsets->at(i); + + // uncompressed last block + if ((i == block_offsets->size() - 1) && !inCol_with_offset->last_block_compressed()) { + memcpy(out8, offset, uncompr_f::get_size_max_byte(countLog % t_src_f::m_BlockSize)); + } else { + morph_batch(offset, out8, t_src_f::m_BlockSize); + } + } + + outCol->set_meta_data(countLog, outSizeByte); + + return new column_with_blockoffsets(outCol); + } + }; - static column_with_blockoffsets *apply(const column *inCol) { - return new column_with_blockoffsets(morph(inCol)); - } -}; -} +} // namespace morphstore -#endif //MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H +#endif // MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H diff --git a/include/core/storage/column_with_blockoffsets.h b/include/core/storage/column_with_blockoffsets.h index 42b86380..918a449a 100644 --- a/include/core/storage/column_with_blockoffsets.h +++ b/include/core/storage/column_with_blockoffsets.h @@ -54,7 +54,9 @@ template class column_with_blockoffsets : public column_with_blockoffs public: column_with_blockoffsets(const column *c) - : column_with_blockoffsets(c, new std::vector()) {} + : column_with_blockoffsets(c, new std::vector()) { + static_assert(F::m_BlockSize == 1, "need block offsets if block-size > 1"); + } column_with_blockoffsets(const column *c, std::vector *offsets) { col = c; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index a982ca25..e506ca96 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -143,8 +143,8 @@ namespace morphstore { // decompressing offset_column in order to read correct offset // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read // return only relevant block and than work on that - auto uncompr_offset_col = decompress_graph_col(offset_column->get_column(), current_compression); - uint64_t *offset_data = uncompr_offset_col->get_data(); + auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); + uint64_t *offset_data = uncompr_offset_col->get_column()->get_data(); uint64_t offset = offset_data[id]; uint64_t nextOffset; @@ -157,7 +157,7 @@ namespace morphstore { } // deleting temporary column - if (uncompr_offset_col != offset_column->get_column()) { + if (uncompr_offset_col != offset_column) { delete uncompr_offset_col; } @@ -174,13 +174,9 @@ namespace morphstore { std::vector out_edge_ids; // TODO: only decompress relevant block - auto uncompr_offset_col = decompress_graph_col(offset_column->get_column(), current_compression); + auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression)->get_column(); uint64_t offset = ((uint64_t *)uncompr_offset_col->get_data())[id]; - - // TODO: remove - print_column(uncompr_offset_col, id - 20, id + 20); - if (uncompr_offset_col != offset_column->get_column()) { delete uncompr_offset_col; } @@ -191,17 +187,10 @@ namespace morphstore { out_edge_ids.reserve(out_degree); // TODO: only decompress relevant blocks - auto uncompr_edgeId_col = decompress_graph_col(edgeId_column->get_column(), current_compression); + auto uncompr_edgeId_col = decompress_graph_col(edgeId_column, current_compression)->get_column(); uint64_t *edgeId_data = uncompr_edgeId_col->get_data(); - // TODO: remove - print_column(uncompr_edgeId_col, 1'000'000, 1'000'020); - std::cout << std::endl << "edge id column offset: " << offset; - std::cout << " vertex out degree: " << out_degree; - std::cout << " edge id column size: " << uncompr_edgeId_col->get_count_values(); - std::cout.flush(); - - assert(offset + out_degree < uncompr_edgeId_col->get_count_values()); + //assert(offset + out_degree < uncompr_edgeId_col->get_count_values()); out_edge_ids.insert(out_edge_ids.end(), edgeId_data + offset, edgeId_data + offset + out_degree); diff --git a/include/core/utils/equality_check.h b/include/core/utils/equality_check.h index 0c443584..80457c41 100644 --- a/include/core/utils/equality_check.h +++ b/include/core/utils/equality_check.h @@ -103,7 +103,8 @@ namespace morphstore { uint64_t *expected = expected_col->get_data(); uint64_t *actual = actual_col->get_data(); - assert(expected_col->get_count_values() == actual_col->get_count_values()); + assert(ec.m_CountValuesEqual); + assert(ec.m_SizeUsedByteEqual); // printing only different entries for (uint64_t i = 0; i < expected_col->get_count_values(); i++) { diff --git a/test/core/morphing/morph_column_block_test.cpp b/test/core/morphing/morph_column_block_test.cpp index fd730c75..851d699a 100644 --- a/test/core/morphing/morph_column_block_test.cpp +++ b/test/core/morphing/morph_column_block_test.cpp @@ -44,19 +44,17 @@ using ve = scalar>; using compr_f = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); int main(void) { - // 3 whole blocks - // TODO: also check for partial block .. 2 test variants (column_size 3000 and 3072) - // 3000 not working yet - auto column_size = 3000; + // TODO: 2 test variants (column_size 3000 and 3072) + auto orig_column_size = 3000; - auto orig_col = generate_sorted_unique(column_size); + auto orig_col = generate_sorted_unique(orig_column_size); // !! morph saving offsets needs to look if last block can be actually morphed (if not complete -> undefined // behaviour?) auto compr_col_with_offsets = morph_saving_offsets(orig_col); - assert(compr_col_with_offsets->get_block_offsets()->size() == 3); - assert(compr_col_with_offsets->last_block_compressed() == (column_size % compr_f::m_BlockSize == 0)); + assert(compr_col_with_offsets->get_block_offsets()->size() == round_up_div(orig_column_size, compr_f::m_BlockSize)); + assert(compr_col_with_offsets->last_block_compressed() == (orig_column_size % compr_f::m_BlockSize == 0)); // asserting correctness of decompressing a single block auto block_size = compr_col_with_offsets->get_block_size(); @@ -80,7 +78,8 @@ int main(void) { // TODO: refactor into general function: // checking if block size == 1 (then direct mem_copy) // checking if last block -> direct mem_copy + right meta data setting (value count < block_size) - // column morph_block(column_with_offset col_with_offsets) + // column morph_column_block(column_with_offset col_with_offsets, ?block_number) + // block then used for get_pos() inside of CSR graph (when this works -> caching of blocks in CSR) auto decompr_col_block = new column(alloc_size); decompr_col_block->set_meta_data(value_count, alloc_size); diff --git a/test/core/morphing/morph_saving_offsets_test.cpp b/test/core/morphing/morph_saving_offsets_test.cpp index 25bb812b..99384dac 100644 --- a/test/core/morphing/morph_saving_offsets_test.cpp +++ b/test/core/morphing/morph_saving_offsets_test.cpp @@ -21,48 +21,54 @@ */ #include +#include +#include +#include #include +#include +#include #include #include -#include #include +#include #include -#include -#include -#include -#include -#include -#include #include +#include #include using namespace morphstore; using namespace vectorlib; -using ve = scalar>; +using ve = scalar>; using compr_f = DEFAULT_DELTA_DYNAMIC_VBP_F(ve); int main(void) { - // 3 whole blocks - // TODO: also check for partial block - // for last block only morph_batch if it is complete ... (as incomplete block are still uncompressed) - auto orig_col = generate_sorted_unique(3072); + // 3 whole blocks + // TODO: also check for partial block (2 variants) + auto orig_column_size = 3000; + auto orig_col = generate_sorted_unique(orig_column_size); auto compr_col_with_offsets = morph_saving_offsets(orig_col); - assert(compr_col_with_offsets->get_block_offsets()->size() == 3); - assert(compr_col_with_offsets->last_block_compressed()); - std::cout << "Checking morph_saving_offset() result column equals the one from morph()" << std::endl; - // BUG: more bytes used with morph_saving_offsets - auto compr_col = morph(orig_col); - assert_columns_equal(compr_col, compr_col_with_offsets->get_column()); + assert(compr_col_with_offsets->get_block_offsets()->size() == + round_up_div(orig_column_size, compr_f::m_BlockSize)); + assert(compr_col_with_offsets->last_block_compressed() == (orig_column_size % compr_f::m_BlockSize == 0)); + // !! currently not equal, as morph_batch on delta block start values very likely depend on previous block + // delta: morphing multi blocks at once -> block start value = diff to previous block + // morphing one block at a time -> block start value = first value of the block + // example: col 0..2047 + // --> morph(): start-values: 0 ; 1 + // --> morph_saving_offsets(): start-values: 0 ; 1024 + /* std::cout << "Checking morph_saving_offset() result column equals the one from morph()" << std::endl; + auto compr_col = morph(orig_col); + assert_columns_equal(compr_col, compr_col_with_offsets->get_column()); */ - // TODO: get this one to work !! - // currently BUG: 0. block: ok , 1. block: +1023, 2. block: + 3070 + // TODO: get this one to work !! -> block-wise decompression needed + // currently BUG: 0. block: ok , 1. block: +1023, 2. block: + 3070 std::cout << "Checking morph_saving_offset() decompressed equals original column" << std::endl; - auto decompr_col = morph_saving_offsets(compr_col_with_offsets->get_column()); + auto decompr_col = morph_saving_offsets(compr_col_with_offsets); // uncompr_f blocksize == 1 --> no need to save block offsets assert(decompr_col->get_block_offsets()->size() == 0); assert_columns_equal(orig_col, decompr_col->get_column()); From be7abcfbb033856b82d6753ae4d2d3ed21b5d594 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 8 Jun 2020 22:54:09 +0200 Subject: [PATCH 195/216] Add decompress_column_block function --- .../core/morphing/decompress_column_block.h | 95 +++++++++++++++++++ .../graph/morph_saving_offsets_graph_col.h | 27 +++++- .../core/storage/column_with_blockoffsets.h | 14 +-- .../core/morphing/morph_column_block_test.cpp | 41 +++----- 4 files changed, 138 insertions(+), 39 deletions(-) create mode 100644 include/core/morphing/decompress_column_block.h diff --git a/include/core/morphing/decompress_column_block.h b/include/core/morphing/decompress_column_block.h new file mode 100644 index 00000000..45a0a899 --- /dev/null +++ b/include/core/morphing/decompress_column_block.h @@ -0,0 +1,95 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file decompress_column_block.h + * @brief Decompressing column blocks based on column_with_blockoffsets. + */ + +#ifndef MORPHSTORE_CORE_MORPHING_DECOMPRESS_COLUMN_BLOCK_H +#define MORPHSTORE_CORE_MORPHING_DECOMPRESS_COLUMN_BLOCK_H + +#include +#include +#include + +#include + +namespace morphstore { + + /** + * @brief Decompressing a range column blocks (inclusive range) + * + * @param inCol The column with block-offsets + * @param start index of blocks to be decompressed + * @param end index of blocks to be decompressed + * @return Specified blocks uncompressed in a new column + */ + template + const column *decompress_column_blocks(column_with_blockoffsets *inCol, uint64_t start, + uint64_t end) { + static_assert(compr_f::m_BlockSize != 1, "Decompressing column blocks of size 1 is not allowed"); + + auto block_size = compr_f::m_BlockSize; + auto block_count = inCol->get_block_offsets()->size(); + auto inCol_value_count = inCol->get_column()->get_count_values(); + + // validating range + assert(start <= end); + assert(start <= block_count); + assert(end <= block_count); + + bool last_block_uncompressed = !inCol->last_block_compressed(); + bool last_block_included = end == (block_count - 1); + + // pessimistic value_count (assuming all blocks are complete) + auto value_count = (end - start + 1) * block_size; + + if (last_block_included && last_block_uncompressed) { + // correcting value_count estimation + value_count -= block_size - inCol_value_count % block_size; + } + + // TODO: should actually be base_t? + auto alloc_size = value_count * sizeof(uint64_t); + + auto decompr_col_blocks = new column(alloc_size); + decompr_col_blocks->set_meta_data(value_count, alloc_size); + uint8_t *out8 = decompr_col_blocks->get_data(); + + for (uint64_t block = start; block <= end; block++) { + const uint8_t *block_offset = inCol->get_block_offset(block); + + if (block == end && last_block_included && last_block_uncompressed) { + // handle uncompressed part + morph_batch(block_offset, out8, inCol_value_count % block_size); + } else { + morph_batch(block_offset, out8, block_size); + } + } + + return decompr_col_blocks; + } + + template + const column *decompress_column_block(column_with_blockoffsets *inCol, uint64_t block_index) { + return decompress_column_blocks(inCol, block_index, block_index); + } + +} // namespace morphstore + +#endif // MORPHSTORE_CORE_MORPHING_DECOMPRESS_COLUMN_BLOCK_H diff --git a/include/core/morphing/graph/morph_saving_offsets_graph_col.h b/include/core/morphing/graph/morph_saving_offsets_graph_col.h index c2450baf..66a82d99 100644 --- a/include/core/morphing/graph/morph_saving_offsets_graph_col.h +++ b/include/core/morphing/graph/morph_saving_offsets_graph_col.h @@ -30,6 +30,7 @@ #include #include +#include #include @@ -123,10 +124,28 @@ namespace morphstore { return result; } -/* const column_with_offsets_uncompr *decompress_part_of_graph_col(const column_base *col, const GraphCompressionFormat src_f) { - // TODO - throw std::runtime_error("Not implemented decompressing a single block"); - } */ + const column_uncompr *decompress_column_block(column_with_blockoffsets_base *col, + const GraphCompressionFormat src_f, uint64_t block) { + switch (src_f) { + case GraphCompressionFormat::DELTA: { + auto casted_col = dynamic_cast(col); + return decompress_column_block(casted_col, block); + } + case GraphCompressionFormat::FOR: { + auto casted_col = dynamic_cast(col); + return decompress_column_block(casted_col, block); + } + case GraphCompressionFormat::DYNAMIC_VBP: { + auto casted_col = dynamic_cast(col); + return decompress_column_block(casted_col, block); + } + case GraphCompressionFormat::UNCOMPRESSED: { + throw std::runtime_error("Decompress a single block of size 1 is meaningless .. access directly"); + } + default: + throw std::runtime_error("Unexpected compression format" + graph_compr_f_to_string(src_f)); + } + } column_with_offsets_uncompr *decompress_graph_col(column_with_blockoffsets_base *col, const GraphCompressionFormat src_f) { diff --git a/include/core/storage/column_with_blockoffsets.h b/include/core/storage/column_with_blockoffsets.h index 918a449a..35aa20dc 100644 --- a/include/core/storage/column_with_blockoffsets.h +++ b/include/core/storage/column_with_blockoffsets.h @@ -33,10 +33,14 @@ class column_with_blockoffsets_base { virtual ~column_with_blockoffsets_base() {} virtual const std::vector *get_block_offsets() = 0; - virtual const uint8_t *get_block_offset(size_t pos) = 0; + virtual const uint8_t *get_block_offset(size_t block_number) = 0; virtual const column_base *get_column() = 0; virtual size_t get_block_size() = 0; - virtual size_t get_size_used_byte() = 0; + + size_t get_size_used_byte() { + return get_column()->get_size_used_byte() + (get_block_offsets()->size() * sizeof(uint8_t *)); + } + bool last_block_compressed() { return get_column()->get_count_values_uncompr() == 0; } @@ -70,15 +74,11 @@ template class column_with_blockoffsets : public column_with_blockoffs } const std::vector *get_block_offsets() { return block_offsets; } - const uint8_t *get_block_offset(size_t pos) { return block_offsets->at(pos); } + const uint8_t *get_block_offset(size_t block_number) { return block_offsets->at(block_number); } const column *get_column() { return col; } inline size_t get_block_size() { return F::m_BlockSize; } - - size_t get_size_used_byte() { - return col->get_size_used_byte() + (block_offsets->size() * sizeof(uint8_t *)); - } }; } // namespace morphstore #endif //MORPHSTORE_CORE_STORAGE_COLUMN_WITH_BLOCKOFFSETS_H diff --git a/test/core/morphing/morph_column_block_test.cpp b/test/core/morphing/morph_column_block_test.cpp index 851d699a..5d599c5e 100644 --- a/test/core/morphing/morph_column_block_test.cpp +++ b/test/core/morphing/morph_column_block_test.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -53,49 +53,34 @@ int main(void) { // behaviour?) auto compr_col_with_offsets = morph_saving_offsets(orig_col); - assert(compr_col_with_offsets->get_block_offsets()->size() == round_up_div(orig_column_size, compr_f::m_BlockSize)); + auto block_count = compr_col_with_offsets->get_block_offsets()->size(); + assert(block_count == round_up_div(orig_column_size, compr_f::m_BlockSize)); assert(compr_col_with_offsets->last_block_compressed() == (orig_column_size % compr_f::m_BlockSize == 0)); // asserting correctness of decompressing a single block + auto block_size = compr_col_with_offsets->get_block_size(); for (uint64_t block = 0; block < compr_col_with_offsets->get_block_offsets()->size(); block++) { auto value_count = block_size; - bool last_block = (block == (compr_col_with_offsets->get_block_offsets()->size() - 1)); - bool last_block_uncompressed = !compr_col_with_offsets->last_block_compressed(); - const uint8_t *block_offset = compr_col_with_offsets->get_block_offset(block); - - if (last_block && last_block_uncompressed) { + if (block == block_count -1 && !compr_col_with_offsets->last_block_compressed()) { value_count = compr_col_with_offsets->get_column()->get_count_values() % block_size; } std::cout << "Checking block " << block << " range: " << block * block_size << " .. " - << (block * block_size + value_count) - 1 << std::endl; - - auto alloc_size = value_count * sizeof(uint64_t); - - // TODO: refactor into general function: - // checking if block size == 1 (then direct mem_copy) - // checking if last block -> direct mem_copy + right meta data setting (value count < block_size) - // column morph_column_block(column_with_offset col_with_offsets, ?block_number) - // block then used for get_pos() inside of CSR graph (when this works -> caching of blocks in CSR) - - auto decompr_col_block = new column(alloc_size); - decompr_col_block->set_meta_data(value_count, alloc_size); - uint8_t *out8 = decompr_col_block->get_data(); - - - if (last_block && last_block_uncompressed) { - auto outSizeRestByte = uncompr_f::get_size_max_byte(value_count); - memcpy(out8, block_offset, outSizeRestByte); - } else { - morph_batch(block_offset, out8, block_size); - } + << (block * block_size + block_size) - 1 << std::endl; + + auto decompr_col_block = decompress_column_block(compr_col_with_offsets, block); auto expected_col = generate_sorted_unique(value_count, block * 1024); assert_columns_equal(expected_col, decompr_col_block); } + // checking decompressing multiple sequentiell blocks + std::cout << "Checking decompressing multiple blocks " << std::endl; + auto multiple_col_blocks = decompress_column_blocks(compr_col_with_offsets, 0, block_count - 1); + assert_columns_equal(orig_col, multiple_col_blocks); + return 0; } \ No newline at end of file From be9e51180dae968dcd97ed90eeb693f6b86db6d5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 9 Jun 2020 17:00:09 +0200 Subject: [PATCH 196/216] Use decompress_column_block in csr format --- .../core/morphing/decompress_column_block.h | 4 +- .../graph/morph_saving_offsets_graph_col.h | 15 ++- include/core/storage/graph/formats/csr.h | 120 +++++++++++++----- .../graph/csr_graph_compression_benchmark.cpp | 14 +- 4 files changed, 113 insertions(+), 40 deletions(-) diff --git a/include/core/morphing/decompress_column_block.h b/include/core/morphing/decompress_column_block.h index 45a0a899..f67d2db7 100644 --- a/include/core/morphing/decompress_column_block.h +++ b/include/core/morphing/decompress_column_block.h @@ -50,8 +50,8 @@ namespace morphstore { // validating range assert(start <= end); - assert(start <= block_count); - assert(end <= block_count); + assert(start < block_count); + assert(end < block_count); bool last_block_uncompressed = !inCol->last_block_compressed(); bool last_block_included = end == (block_count - 1); diff --git a/include/core/morphing/graph/morph_saving_offsets_graph_col.h b/include/core/morphing/graph/morph_saving_offsets_graph_col.h index 66a82d99..68c51167 100644 --- a/include/core/morphing/graph/morph_saving_offsets_graph_col.h +++ b/include/core/morphing/graph/morph_saving_offsets_graph_col.h @@ -124,20 +124,20 @@ namespace morphstore { return result; } - const column_uncompr *decompress_column_block(column_with_blockoffsets_base *col, - const GraphCompressionFormat src_f, uint64_t block) { + const column_uncompr *decompress_column_blocks(column_with_blockoffsets_base *col, + const GraphCompressionFormat src_f, uint64_t start, uint64_t end) { switch (src_f) { case GraphCompressionFormat::DELTA: { auto casted_col = dynamic_cast(col); - return decompress_column_block(casted_col, block); + return decompress_column_blocks(casted_col, start, end); } case GraphCompressionFormat::FOR: { auto casted_col = dynamic_cast(col); - return decompress_column_block(casted_col, block); + return decompress_column_blocks(casted_col, start, end); } case GraphCompressionFormat::DYNAMIC_VBP: { auto casted_col = dynamic_cast(col); - return decompress_column_block(casted_col, block); + return decompress_column_blocks(casted_col, start, end); } case GraphCompressionFormat::UNCOMPRESSED: { throw std::runtime_error("Decompress a single block of size 1 is meaningless .. access directly"); @@ -147,6 +147,11 @@ namespace morphstore { } } + const column_uncompr *decompress_column_block(column_with_blockoffsets_base *col, + const GraphCompressionFormat src_f, uint64_t block) { + return decompress_column_blocks(col, src_f, block, block); + } + column_with_offsets_uncompr *decompress_graph_col(column_with_blockoffsets_base *col, const GraphCompressionFormat src_f) { return static_cast( diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index e506ca96..32612de9 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -77,6 +77,33 @@ namespace morphstore { } } + uint64_t get_offset(uint64_t id) const { + // TODO: use cache + + auto block_size = offset_column->get_block_size(); + + if (current_compression == GraphCompressionFormat::UNCOMPRESSED) { + uint64_t *col_data = offset_column->get_column()->get_data(); + return col_data[id]; + } else { + auto block_number = id / block_size; + auto block_pos = id % block_size; + + assert(block_number < offset_column->get_block_offsets()->size()); + + auto uncompr_block = decompress_column_block(offset_column, current_compression, block_number); + + uint64_t *block_data = uncompr_block->get_data(); + + auto offset = block_data[block_pos]; + + // deleting temporary column + delete uncompr_block; + + return offset; + } + } + // DEBUG function to look into column: void print_column(const column_base *col, int start, int end) const { // validate interval (fix otherwise) @@ -140,28 +167,18 @@ namespace morphstore { // get number of edges of vertex with id uint64_t get_out_degree(uint64_t id) const override { - // decompressing offset_column in order to read correct offset - // TODO: only decompress part of the column as only offset_column[id] and offset_column[id+1] will be read - // return only relevant block and than work on that - auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression); - uint64_t *offset_data = uncompr_offset_col->get_column()->get_data(); - - uint64_t offset = offset_data[id]; + uint64_t offset = get_offset(id); uint64_t nextOffset; // special case: last vertex id has no next offset if (id == getVertexCount() - 1) { nextOffset = getEdgeCount(); } else { - nextOffset = offset_data[id + 1]; - } - - // deleting temporary column - if (uncompr_offset_col != offset_column) { - delete uncompr_offset_col; + nextOffset = get_offset(id + 1); } // compute out_degree + // TODO: simplify this line if (offset == nextOffset) return 0; else { @@ -172,33 +189,74 @@ namespace morphstore { std::vector get_outgoing_edge_ids(uint64_t id) const override { assert(vertices->exists_vertex(id)); - std::vector out_edge_ids; - // TODO: only decompress relevant block - auto uncompr_offset_col = decompress_graph_col(offset_column, current_compression)->get_column(); - uint64_t offset = ((uint64_t *)uncompr_offset_col->get_data())[id]; + std::vector result; - if (uncompr_offset_col != offset_column->get_column()) { - delete uncompr_offset_col; - } + uint64_t start = get_offset(id); + uint64_t degree = get_out_degree(id); - // TODO: decompressing offset_column twice this way (should not be a problem if block cache exists) - uint64_t out_degree = get_out_degree(id); + // TODO: use cache + result.reserve(degree); - out_edge_ids.reserve(out_degree); + // end is not included in the result + auto end = start + degree; - // TODO: only decompress relevant blocks - auto uncompr_edgeId_col = decompress_graph_col(edgeId_column, current_compression)->get_column(); - uint64_t *edgeId_data = uncompr_edgeId_col->get_data(); + assert(start <= end); + assert(getEdgeCount() >= end); - //assert(offset + out_degree < uncompr_edgeId_col->get_count_values()); + if (degree == 0) { + return result; + } - out_edge_ids.insert(out_edge_ids.end(), edgeId_data + offset, edgeId_data + offset + out_degree); + auto block_size = edgeId_column->get_block_size(); - if (uncompr_edgeId_col != edgeId_column->get_column()) { - delete uncompr_edgeId_col; + if (current_compression == GraphCompressionFormat::UNCOMPRESSED) { + uint64_t *col_data = edgeId_column->get_column()->get_data(); + result.insert(result.end(), col_data + start, col_data + end); + } else { + // getting one block at a time as most of the time only one block is needed + // also allows to use block cache (would need to inject cache into decompress_block otherwise) + auto start_block = start / block_size; + auto start_block_pos = start % block_size; + auto end_block = end / block_size; + auto end_block_pos = end % block_size; + + assert(start_block < edgeId_column->get_block_offsets()->size()); + assert(end_block < edgeId_column->get_block_offsets()->size()); + + // case that end is the first value of another block (should not decompress that block than) + if (end_block_pos == 0) { + end_block--; + end_block_pos = block_size + 1; + } + + for (auto block_number = start_block; block_number <= end_block; block_number++) { + auto uncompr_block = decompress_column_block(edgeId_column, current_compression, block_number); + uint64_t *block_data = uncompr_block->get_data(); + + // all edge ids in the same block + if (start_block == end_block) { + result.insert(result.end(), block_data + start_block_pos, block_data + end_block_pos); + } else if (block_number == end_block) { + // only insert until end_pos + result.insert(result.end(), block_data, block_data + end_block_pos); + } else if (block_number == start_block) { + // don't insert values before start + auto block_end = block_data + block_size; + result.insert(result.end(), block_data + start_block_pos, block_end); + } else { + // insert whole block (should be very rare) + auto block_end = block_data + block_size; + result.insert(result.end(), block_data, block_end); + } + + // deleting temporary column + delete uncompr_block; + } } - return out_edge_ids; + assert(result.size() == degree); + + return result; } void morph(GraphCompressionFormat target_format) override { diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 16585130..7354b4e1 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -39,7 +39,7 @@ struct CompressionBenchmarkEntry { std::string to_string() { return graph_compr_f_to_string(compr_format) + "|" + std::to_string(compression_time) + "|" + std::to_string(offset_col_compression_ratio) + "|" + std::to_string(edgeId_col_compression_ratio) + - "|" + std::to_string(random_access_time); + "|" + std::to_string(random_access_time) + "|" + std::to_string(full_iterate); } }; @@ -71,7 +71,8 @@ int main(void) { << std::endl; std::cout << "Compression-Format | compression-time | offset-column compr. ratio" << " | edgeId-column compr. ratio | access of edges of " - << std::to_string(number_of_random_access) + " random vertices" << std::endl; + << std::to_string(number_of_random_access) + " random vertices" + << " | full iterate" << std::endl; for (auto current_f : compr_formats) { for (int exec = 0; exec < number_of_executions; exec++) { @@ -96,6 +97,15 @@ int main(void) { } current_try.random_access_time = get_duration(start); + // full iterate + auto vertex_count = graph->getVertexCount(); + start = highResClock::now(); + for (uint64_t id = 0; id < vertex_count; id++) { + graph->get_outgoing_edge_ids(id); + } + + current_try.full_iterate = get_duration(start); + std::cout << current_try.to_string() << std::endl; } } From d64d01ca567539142ab24b0a46fae01e96fefb1c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 9 Jun 2020 18:25:50 +0200 Subject: [PATCH 197/216] Cache decompressed column block * changing cache in csr -> not const function anymore --- .../storage/graph/formats/adjacencylist.h | 4 +- include/core/storage/graph/formats/csr.h | 73 +++++++++++++++---- include/core/storage/graph/graph.h | 7 +- 3 files changed, 66 insertions(+), 18 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 40d2db74..2c10adaa 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -172,7 +172,7 @@ namespace morphstore { uint64_t get_min_compr_degree() { return min_compr_degree; } // get number of neighbors of vertex with id - uint64_t get_out_degree(uint64_t id) const override { + uint64_t get_out_degree(uint64_t id) override { auto entry = adjacencylistPerVertex->find(id); if (entry == adjacencylistPerVertex->end()) { return 0; @@ -181,7 +181,7 @@ namespace morphstore { } } - std::vector get_outgoing_edge_ids(uint64_t id) const override { + std::vector get_outgoing_edge_ids(uint64_t id) override { // basically column -> vector (as convinient to use in other methods) // maybe better idea would be to return a uint64_t* instead (together with a size value) std::vector edge_ids; diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 32612de9..e81ed10a 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -24,14 +24,41 @@ #ifndef MORPHSTORE_CSR_H #define MORPHSTORE_CSR_H -#include #include +#include + +#include #include #include namespace morphstore { + // simple cache of size 1 (to avoid decompressing the same block multiple times .. f.i. for getting the degree of a + // vertex) + class ColumnBlockCache { + private: + uint64_t block_number; + const column *decompressed_block; + + public: + ColumnBlockCache(uint64_t block_number, const column *decompressed_block) { + this->block_number = block_number; + this->decompressed_block = decompressed_block; + } + + ~ColumnBlockCache() { + // always valid, as columns in uncompressed format should not use the cache + delete decompressed_block; + } + + uint64_t get_block_number() const { return block_number; } + + const column * get() const { + return decompressed_block; + } + }; + class CSR : public Graph { private: @@ -42,6 +69,10 @@ namespace morphstore { column_with_blockoffsets_base *offset_column; column_with_blockoffsets_base *edgeId_column; + // for faster sequentiell access (not respected in memory usage yet) + std::unique_ptr offset_block_cache = nullptr; + std::unique_ptr edgeIds_block_cache = nullptr; + protected: // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) override { @@ -77,7 +108,7 @@ namespace morphstore { } } - uint64_t get_offset(uint64_t id) const { + uint64_t get_offset(uint64_t id) { // TODO: use cache auto block_size = offset_column->get_block_size(); @@ -91,15 +122,23 @@ namespace morphstore { assert(block_number < offset_column->get_block_offsets()->size()); - auto uncompr_block = decompress_column_block(offset_column, current_compression, block_number); + const column_uncompr* uncompr_block; + if (offset_block_cache && offset_block_cache->get_block_number() == block_number) { + //std::cout << "cache hit" << std::endl; + uncompr_block = offset_block_cache->get(); + } + else { + //std::cout << "cache miss" << std::endl; + uncompr_block = decompress_column_block(offset_column, current_compression, block_number); + + // update cache + offset_block_cache = std::make_unique(block_number, uncompr_block); + } uint64_t *block_data = uncompr_block->get_data(); auto offset = block_data[block_pos]; - // deleting temporary column - delete uncompr_block; - return offset; } } @@ -107,7 +146,7 @@ namespace morphstore { // DEBUG function to look into column: void print_column(const column_base *col, int start, int end) const { // validate interval (fix otherwise) - int col_size = col->get_count_values(); + int col_size = col->get_count_values(); if (start < 0 || col_size < start) { start = 0; } @@ -166,7 +205,7 @@ namespace morphstore { } // get number of edges of vertex with id - uint64_t get_out_degree(uint64_t id) const override { + uint64_t get_out_degree(uint64_t id) override { uint64_t offset = get_offset(id); uint64_t nextOffset; @@ -186,7 +225,7 @@ namespace morphstore { } } - std::vector get_outgoing_edge_ids(uint64_t id) const override { + std::vector get_outgoing_edge_ids(uint64_t id) override { assert(vertices->exists_vertex(id)); std::vector result; @@ -220,8 +259,8 @@ namespace morphstore { auto end_block = end / block_size; auto end_block_pos = end % block_size; - assert(start_block < edgeId_column->get_block_offsets()->size()); - assert(end_block < edgeId_column->get_block_offsets()->size()); + assert(start_block < edgeId_column->get_block_offsets()->size()); + assert(end_block < edgeId_column->get_block_offsets()->size()); // case that end is the first value of another block (should not decompress that block than) if (end_block_pos == 0) { @@ -275,6 +314,13 @@ namespace morphstore { offset_column = morph_saving_offsets_graph_col(offset_column, current_compression, target_format, true); edgeId_column = morph_saving_offsets_graph_col(edgeId_column, current_compression, target_format, true); + if (offset_block_cache) { + offset_block_cache = nullptr; + } + if (edgeIds_block_cache) { + edgeIds_block_cache = nullptr; + } + this->current_compression = target_format; } @@ -282,7 +328,7 @@ namespace morphstore { std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); - + // column_meta_data, prepared_for_random_access, .. not included in get_size_used_byte; index_size += 2 * sizeof(column); index_size += edgeId_column->get_size_used_byte(); @@ -301,7 +347,8 @@ namespace morphstore { return " values: " + std::to_string(col->get_count_values()) + " size in bytes: " + std::to_string(col->get_size_used_byte()) + " compression ratio: " + std::to_string(compression_ratio(col_with_offsets, current_compression)) + - " number of blocks (if blocksize > 1): " + std::to_string(col_with_offsets->get_block_offsets()->size()); + " number of blocks (if blocksize > 1): " + + std::to_string(col_with_offsets->get_block_offsets()->size()); } void statistics() override { diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index a0dc535b..9a3e9f0f 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -51,6 +51,7 @@ namespace morphstore { protected: GraphCompressionFormat current_compression = GraphCompressionFormat::UNCOMPRESSED; + // TODO: actually just needed for CSR format (could be moved) uint64_t expectedVertexCount; uint64_t expectedEdgeCount; @@ -175,11 +176,11 @@ namespace morphstore { virtual std::string get_storage_format() const = 0; virtual uint64_t add_edge(uint64_t from, uint64_t to, unsigned short int type) = 0; virtual void morph(GraphCompressionFormat target_format) = 0; - virtual std::vector get_outgoing_edge_ids(uint64_t id) const = 0; - virtual uint64_t get_out_degree(uint64_t id) const = 0; + virtual std::vector get_outgoing_edge_ids(uint64_t id) = 0; + virtual uint64_t get_out_degree(uint64_t id) = 0; // function to return a vector of ids of neighbors for BFS alg. - std::vector get_neighbors_ids(uint64_t id) const { + std::vector get_neighbors_ids(uint64_t id) { std::vector targetVertexIds; for (auto edge_id : get_outgoing_edge_ids(id)) { assert(edges->exists_edge(edge_id)); From 4a4ea87cb2b5a8d83ff256496d7afe975ae04fbb Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 9 Jun 2020 22:54:44 +0200 Subject: [PATCH 198/216] Cache edgeId_column in CSR + bug fix for getting edge-ids --- include/core/storage/graph/formats/csr.h | 56 +++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index e81ed10a..845cd521 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -69,8 +69,10 @@ namespace morphstore { column_with_blockoffsets_base *offset_column; column_with_blockoffsets_base *edgeId_column; - // for faster sequentiell access (not respected in memory usage yet) + // for faster sequentiell access (not respected in memory usage yet) .. ideally encapsulated in an iterator + // as already for getting edge-ids the same block is decompressed 3x otherwise std::unique_ptr offset_block_cache = nullptr; + // assuming most degrees are << block-size std::unique_ptr edgeIds_block_cache = nullptr; protected: @@ -122,6 +124,7 @@ namespace morphstore { assert(block_number < offset_column->get_block_offsets()->size()); + // TODO refactor this cache logic into a method const column_uncompr* uncompr_block; if (offset_block_cache && offset_block_cache->get_block_number() == block_number) { //std::cout << "cache hit" << std::endl; @@ -176,8 +179,7 @@ namespace morphstore { std::string get_storage_format() const override { return "CSR"; } - // this function gets the number of vertices/edges and allocates memory for the vertices-map and the graph - // topology arrays + // this function gets the number of vertices/edges and allocates memory for the graph-topology arrays // TODO: test that no data exists before (as this will get overwritten) void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); @@ -265,31 +267,62 @@ namespace morphstore { // case that end is the first value of another block (should not decompress that block than) if (end_block_pos == 0) { end_block--; - end_block_pos = block_size + 1; + // setting it one step further than actually possible to read from (vector.insert excludes the end) + end_block_pos = block_size; } + // most of the case only one block accessed -> might be worth to seperate from for loop (start_block == end_block) for (auto block_number = start_block; block_number <= end_block; block_number++) { - auto uncompr_block = decompress_column_block(edgeId_column, current_compression, block_number); + const column_uncompr *uncompr_block; + // to avoid wrongly deleting a cached block (which could be used by the next access) + // by creating new unique ptr or direct delete (alternatively use a shared_pointer .. might be a + // very good idea) + bool cache_hit = false; + // only looking at cache for first block (as we assume sequential read) + if (block_number == start_block && edgeIds_block_cache && + edgeIds_block_cache->get_block_number() == block_number) { + // std::cout << "edgeId_col cache hit" << std::endl; + uncompr_block = edgeIds_block_cache->get(); + cache_hit = true; + } else { + // std::cout << "edgeId_col cache miss" << std::endl; + uncompr_block = decompress_column_block(edgeId_column, current_compression, block_number); + } + uint64_t *block_data = uncompr_block->get_data(); - // all edge ids in the same block + // all edge ids in the same block (implicitly end_block == block_number == start_block) if (start_block == end_block) { result.insert(result.end(), block_data + start_block_pos, block_data + end_block_pos); + // update cache + if (!cache_hit) { + edgeIds_block_cache = std::make_unique(block_number, uncompr_block); + } } else if (block_number == end_block) { // only insert until end_pos result.insert(result.end(), block_data, block_data + end_block_pos); + // update cache + if (!cache_hit) { + edgeIds_block_cache = std::make_unique(block_number, uncompr_block); + } } else if (block_number == start_block) { // don't insert values before start auto block_end = block_data + block_size; result.insert(result.end(), block_data + start_block_pos, block_end); + + // deleting temporary column if not cached + if (!cache_hit) { + delete uncompr_block; + } } else { // insert whole block (should be very rare) auto block_end = block_data + block_size; result.insert(result.end(), block_data, block_end); - } - // deleting temporary column - delete uncompr_block; + // deleting temporary column (does not matter if cached as following block will overwrite the + // cache) + delete uncompr_block; + } } } @@ -314,11 +347,12 @@ namespace morphstore { offset_column = morph_saving_offsets_graph_col(offset_column, current_compression, target_format, true); edgeId_column = morph_saving_offsets_graph_col(edgeId_column, current_compression, target_format, true); + // invalidating caches if (offset_block_cache) { - offset_block_cache = nullptr; + offset_block_cache.reset(); } if (edgeIds_block_cache) { - edgeIds_block_cache = nullptr; + edgeIds_block_cache.reset(); } this->current_compression = target_format; From 853db0d92b85e16ff6d4179716099219ca785c78 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 12:54:52 +0200 Subject: [PATCH 199/216] Add instructions for LDBC_DIR flag --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 03b4fdf3..30ab5bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,9 @@ morph_flag(-march=native) # remove build type to allow for custom flag handling set(CMAKE_BUILD_TYPE "") -# add resource directory for ldbc graph (something like "$ENV{HOME}/ldbc/ldbc_snb_datagen/social_network/") -morph_flag(-DLDBC_DIR="$ENV{HOME}/ldbc/ldbc_snb_datagen/social_network/") +# add resource directory for ldbc graph +# (see https://github.com/ldbc/ldbc_snb_datagen for further instructions) +morph_flag(-DLDBC_DIR="$ENV{HOME}/ldbc_snb_datagen/social_network/") # general compiler settings, meant for all subdirectories and tests morph_flag(-Werror) From 68b544a583a7f1110a00237d06a4ea1f97c1c01a Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 20:11:18 +0200 Subject: [PATCH 200/216] Fix delete error in AdjacencyList format and correct size_of_graph func --- include/core/storage/graph/formats/adjacencylist.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index 2c10adaa..ab78ac9c 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -18,8 +18,8 @@ /** * @file adjacencylist.h * @brief Derived adj. list storage format class. Base: graph.h - * @todo Adjust get_size_of_graph(), ?replace unordered_map with a fixed sized array - */ + * @todo try replacing unordered_map with a fixed sized array; read more into std::variant not allowing references (seems to work though) +*/ #ifndef MORPHSTORE_ADJACENCYLIST_H #define MORPHSTORE_ADJACENCYLIST_H @@ -133,9 +133,9 @@ namespace morphstore { } else { delete std::get(adj_list); } - - delete adjacencylistPerVertex; } + + delete this->adjacencylistPerVertex; } AdjacencyList(EdgesContainerType edges_container_type) @@ -284,6 +284,9 @@ namespace morphstore { // for measuring the size in bytes: std::pair get_size_of_graph() const override { auto [index_size, data_size] = Graph::get_size_of_graph(); + + // min_compr_degree + index_size += sizeof(uint64_t); // adjacencyListPerVertex index_size += sizeof(std::unordered_map); From 0b07069adf90e021f4ba072c5631022761d0edef Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 20:13:00 +0200 Subject: [PATCH 201/216] Assert offsets in csr make sense catching a current bug regarding the offset_column (not fixing it yet) Failure case: last vertex has no edges -> no edges added -> no offset set --- include/core/storage/graph/formats/csr.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 845cd521..4ac2bd09 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -80,6 +80,7 @@ namespace morphstore { void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) override { // TODO: throw error if not in order of vertex-ids ASC inserted (currently will only produce rubbish data) // TODO: handle if sourceIDs are skipped + // TODO: !!! handle if last vertex has no edges (wrong offset currently) // potential solution: add last_seen_vertex_id as class field .. check based on that .. assert order and // insert offsets for skipped vertices @@ -218,13 +219,11 @@ namespace morphstore { nextOffset = get_offset(id + 1); } + // if this fails, than alloc_graph has probably the wrong values + assert(offset <= nextOffset); + // compute out_degree - // TODO: simplify this line - if (offset == nextOffset) - return 0; - else { - return nextOffset - offset; - } + return nextOffset - offset; } std::vector get_outgoing_edge_ids(uint64_t id) override { From 252a2b2beabeee4d57695527184aa87f670993ee Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 20:13:22 +0200 Subject: [PATCH 202/216] Fix title of benchmark --- .../graph/adjList_graph_compression_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp index e7e1348e..14719e81 100644 --- a/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/adjList_graph_compression_benchmark.cpp @@ -70,7 +70,7 @@ int main(void) { random_accesses.push_back(dist(rd)); } - std::cout << "Test vertex storage structure (median of 5 for full_iterate and random access)" << std::endl; + std::cout << "Test compression of adjacency-list format" << std::endl; std::cout << "Compression-Format | minimum degree for compression | compression-time | " << "compr. ratio | column ratio | access of edges of 5000 random vertices | full-iterate " << std::endl; From 0e96431e8a4ed69a098e4ef3cc04b401fbcc6c0f Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 20:15:13 +0200 Subject: [PATCH 203/216] Move degree measurement into a seperate operator and use shared ptr slimming down graph.h unique ptr didn't make sense, as the algorithm does not own the graph here --- .../core/operators/graph/degree_measurement.h | 87 +++++++++++++++++++ include/core/operators/graph/top_down_bfs.h | 66 +++++++------- include/core/storage/graph/graph.h | 47 ---------- .../graph/ldbc/bfs_ldbc_graph_test.h | 5 +- .../graph/simple/bfs_simple_graph_test.h | 10 ++- 5 files changed, 125 insertions(+), 90 deletions(-) create mode 100644 include/core/operators/graph/degree_measurement.h diff --git a/include/core/operators/graph/degree_measurement.h b/include/core/operators/graph/degree_measurement.h new file mode 100644 index 00000000..886a235d --- /dev/null +++ b/include/core/operators/graph/degree_measurement.h @@ -0,0 +1,87 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file degree_measurement.h + * @brief computing a degree distribution of a given graph + * @todo multi-threaded impl? ; add tests + */ + +#ifndef MORPHSTORE_DEGREE_MEASUREMENT +#define MORPHSTORE_DEGREE_MEASUREMENT + +#include + +#include + +namespace morphstore { + + class DegreeMeasurement { + + public: + // function to return a list of pair < vertex id, degree > DESC: + static std::vector> get_list_of_degree_DESC(std::shared_ptr &graph) { + std::vector> vertexDegreeList; + auto vertex_count = graph->getVertexCount(); + vertexDegreeList.reserve(vertex_count); + + // fill the vector with every vertex key and his degree + for (uint64_t i = 0; i < vertex_count; ++i) { +#if DEBUG + if (i % 10000 == 0) { + std::cout << "Degree-List - Current Progress" << i << "/" << vertex_count << std::endl; + } +#endif + vertexDegreeList.push_back({i, graph->get_out_degree(i)}); + } + + // sort the vector on degree DESC + std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), + [](const std::pair &left, const std::pair &right) { + return left.second > right.second; + }); + + return vertexDegreeList; + } + + // function to measure graph characteristics (degree and count) and write the result to a given file: + static void measure_degree_count(std::shared_ptr graph, std::string filePath) { + std::vector> verticesDegree = get_list_of_degree_DESC(graph); + // unordered map for mapping degree to count: + std::unordered_map results; + + for (uint64_t i = 0; i < verticesDegree.size(); ++i) { + // increment count in results for a given degree: + results[verticesDegree[i].second]++; + } + + // write to file: + std::ofstream fs; + std::stringstream ss; + // open file for writing and delete existing stuff: + fs.open(filePath, std::fstream::out | std::ofstream::trunc); + + for (auto const &m : results) { + ss << m.first << "," << m.second << "\n"; + } + fs << ss.str(); + fs.close(); + } + }; +} // namespace morphstore + +#endif // MORPHSTORE_DEGREE_MEASUREMENT diff --git a/include/core/operators/graph/top_down_bfs.h b/include/core/operators/graph/top_down_bfs.h index 6529edba..94e7715a 100644 --- a/include/core/operators/graph/top_down_bfs.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2019-2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -24,52 +24,44 @@ #ifndef MORPHSTORE_TOP_DOWN_BFS #define MORPHSTORE_TOP_DOWN_BFS -#include "../../storage/graph/graph.h" +#include +#include #include -namespace morphstore{ - - class BFS{ +namespace morphstore { + class BFS { private: - std::unique_ptr graph; - uint64_t graphSize; - - public: + std::shared_ptr graph; + public: // constructor with smart pointer to graph as parameter/reference - BFS(std::unique_ptr& g) : graph(std::move(g)){ - graphSize = graph->getVertexCount(); - } - - uint64_t get_graph_size(){ - return graphSize; - } + BFS(std::shared_ptr &g) : graph(g) {} // ------------------------------------------ BFS algorithm ------------------------------------------ // actual BFS algorithm: takes the start-node id and returns the number of explored vertices - uint64_t do_BFS(uint64_t startVertex){ + uint64_t do_BFS(uint64_t startVertex) { std::vector frontier; std::vector next; - std::vector visited(graphSize, false); + std::vector visited(graph->getVertexCount(), false); uint64_t exploredVertices = 0; frontier.push_back(startVertex); visited[startVertex] = true; - while(!frontier.empty()){ + while (!frontier.empty()) { // Loop through current layer of vertices in the frontier - for(uint64_t i = 0; i < frontier.size(); ++i){ + for (uint64_t i = 0; i < frontier.size(); ++i) { uint64_t currentVertex = frontier[i]; // get list of a vertex's adjacency - std::vector neighbors = graph->get_neighbors_ids(currentVertex); + std::vector neighbors = graph->get_neighbors_ids(currentVertex); // Loop through all of neighbors of current vertex - for(uint64_t j = 0; j < neighbors.size(); ++j){ - // check if neighbor has been visited, if not -> put into frontier and mark as visit = true - if(!visited[neighbors[j]]){ + for (uint64_t j = 0; j < neighbors.size(); ++j) { + // check if neighbor has been visited, if not -> put into frontier and mark as visit = true + if (!visited[neighbors[j]]) { next.push_back(neighbors[j]); visited[neighbors[j]] = true; ++exploredVertices; @@ -88,7 +80,7 @@ namespace morphstore{ // function that measures the number of explored vertices and time in ms: // results are written into a file; cycle determines the ith vertex from list - void do_measurements(uint64_t cycle, std::string pathToFile){ + void do_measurements(uint64_t cycle, std::string pathToFile) { // list of measurement candidates: the parameter means the ith vertex in total std::vector candidates = get_list_of_every_ith_vertex(cycle); @@ -96,15 +88,15 @@ namespace morphstore{ std::vector> results; results.reserve(candidates.size()); - - for(uint64_t i = 0; i < candidates.size(); ++i){ + for (uint64_t i = 0; i < candidates.size(); ++i) { // start measuring bfs time: auto startBFSTime = std::chrono::high_resolution_clock::now(); uint64_t exploredVertices = do_BFS(candidates[i]); auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time - auto elapsedBFSTime = std::chrono::duration_cast< std::chrono::milliseconds >( finishBFSTime - startBFSTime ).count(); + auto elapsedBFSTime = + std::chrono::duration_cast(finishBFSTime - startBFSTime).count(); // write to intermediate array: results.push_back({exploredVertices, elapsedBFSTime}); @@ -117,26 +109,26 @@ namespace morphstore{ // open file for writing and delete existing stuff: fs.open(filename, std::fstream::out | std::ofstream::trunc); - for(uint64_t j = 0; j < results.size(); ++j){ + for (uint64_t j = 0; j < results.size(); ++j) { ss << results[j].first << "," << results[j].second << "\n"; ++j; } - fs << ss.str() ; + fs << ss.str(); fs.close(); } // function which returns a list of every ith vertex which is sorted by degree DESC - std::vector< uint64_t > get_list_of_every_ith_vertex(uint64_t cycle){ - std::vector< uint64_t > measurementCandidates; - std::vector< std::pair > totalListOfVertices = graph->get_list_of_degree_DESC(); - for(uint64_t i = 0; i < totalListOfVertices.size(); i = i + cycle){ + std::vector get_list_of_every_ith_vertex(uint64_t cycle) { + std::vector measurementCandidates; + std::vector> totalListOfVertices = + DegreeMeasurement::get_list_of_degree_DESC(graph); + for (uint64_t i = 0; i < totalListOfVertices.size(); i = i + cycle) { measurementCandidates.push_back(totalListOfVertices[i].first); } return measurementCandidates; } }; -} - -#endif //MORPHSTORE_TOP_DOWN_BFS +} // namespace morphstore +#endif // MORPHSTORE_TOP_DOWN_BFS diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index 9a3e9f0f..e6195852 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -118,53 +118,6 @@ namespace morphstore { EdgeWithIdAndProperties get_edge(uint64_t id) { return edges->get_edge_with_properties(id); } - // function to return a list of pair < vertex id, degree > DESC: - // TODO: move into seperate header and use graph as input parameter - std::vector> get_list_of_degree_DESC() { - std::vector> vertexDegreeList; - vertexDegreeList.reserve(getVertexCount()); - // fill the vector with every vertex key and his degree - for (uint64_t i = 0; i < getVertexCount(); ++i) { - /* if (i % 1000 == 0) { - std::cout << "Degree-List - Current Progress" << i << "/" << getVertexCount() << - std::endl; - } */ - vertexDegreeList.push_back({i, this->get_out_degree(i)}); - } - // sort the vector on degree DESC - std::sort(vertexDegreeList.begin(), vertexDegreeList.end(), - [](const std::pair &left, const std::pair &right) { - return left.second > right.second; - }); - - return vertexDegreeList; - } - - // function to measure graph characteristics (degree and count): - // TODO: move into seperate header and use graph as input parameter - void measure_degree_count(std::string filePath) { - std::vector> verticesDegree = get_list_of_degree_DESC(); - // unordered map for mapping degree to count: - std::unordered_map results; - - for (uint64_t i = 0; i < verticesDegree.size(); ++i) { - // increment count in results for a given degree: - results[verticesDegree[i].second]++; - } - - // write to file: - std::ofstream fs; - std::stringstream ss; - // open file for writing and delete existing stuff: - fs.open(filePath, std::fstream::out | std::ofstream::trunc); - - for (auto const &m : results) { - ss << m.first << "," << m.second << "\n"; - } - fs << ss.str(); - fs.close(); - } - void add_property_to_vertex(uint64_t id, const std::pair property) { vertices->add_property_to_vertex(id, property); }; diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index 010f451d..c93a9425 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -39,13 +39,13 @@ template void bfs_ldbc_graph_test(void) { static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - std::unique_ptr graph = std::make_unique(); + std::shared_ptr graph = std::make_shared(); std::string storageFormat = graph->get_storage_format(); print_header(storageFormat); // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) - std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); + std::shared_ptr ldbcImport = std::make_shared(LDBC_DIR); // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); @@ -57,7 +57,6 @@ template void bfs_ldbc_graph_test(void) { auto bfs = std::make_unique(graph); // for scale factor 1 and including static as well as dynamic part of the graph std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; - // bfs->do_measurements(10000, targetDir + "bfs_" + storageFormat); #else throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif diff --git a/test/core/operators/graph/simple/bfs_simple_graph_test.h b/test/core/operators/graph/simple/bfs_simple_graph_test.h index 66bf9ccb..bdb7260e 100644 --- a/test/core/operators/graph/simple/bfs_simple_graph_test.h +++ b/test/core/operators/graph/simple/bfs_simple_graph_test.h @@ -39,7 +39,7 @@ void bfs_simple_graph_test (void) { static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); - std::unique_ptr graph = std::make_unique(); + std::shared_ptr graph = std::make_shared(); print_header(graph->get_storage_format()); graph->allocate_graph_structure(4, 4); @@ -61,9 +61,13 @@ void bfs_simple_graph_test (void) { std::cout << "Some statistics" << std::endl; graph->statistics(); + assert(graph->getVertexCount() == 4); + assert(graph->getEdgeCount() == 4); + auto bfs = std::make_unique(graph); - //assert(graph->getVertexCount() == 4); - //assert(graph->getEdgeCount() == 3); + assert(bfs->do_BFS(v1) == 2); + assert(bfs->do_BFS(v2) == 1); + assert(bfs->do_BFS(v3) == 1); } \ No newline at end of file From b124286748062793f776db9a6db30bc43dd132b3 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 21:09:39 +0200 Subject: [PATCH 204/216] Do not save graph in BFS class as volatile graphs dont work then use-case: bfs_benchmark --- include/core/operators/graph/top_down_bfs.h | 23 +++++++------------ .../graph/ldbc/bfs_ldbc_graph_test.h | 3 +-- .../graph/simple/bfs_simple_graph_test.h | 11 ++++----- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/include/core/operators/graph/top_down_bfs.h b/include/core/operators/graph/top_down_bfs.h index 94e7715a..56b62dfe 100644 --- a/include/core/operators/graph/top_down_bfs.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -32,17 +32,9 @@ namespace morphstore { class BFS { - private: - std::shared_ptr graph; - public: - // constructor with smart pointer to graph as parameter/reference - BFS(std::shared_ptr &g) : graph(g) {} - - // ------------------------------------------ BFS algorithm ------------------------------------------ - // actual BFS algorithm: takes the start-node id and returns the number of explored vertices - uint64_t do_BFS(uint64_t startVertex) { + static uint64_t compute(std::shared_ptr graph, uint64_t startVertex) { std::vector frontier; std::vector next; std::vector visited(graph->getVertexCount(), false); @@ -80,9 +72,9 @@ namespace morphstore { // function that measures the number of explored vertices and time in ms: // results are written into a file; cycle determines the ith vertex from list - void do_measurements(uint64_t cycle, std::string pathToFile) { + static void do_measurements(std::shared_ptr graph, uint64_t cycle, std::string pathToFile) { // list of measurement candidates: the parameter means the ith vertex in total - std::vector candidates = get_list_of_every_ith_vertex(cycle); + std::vector candidates = get_list_of_every_ith_vertex(graph, cycle); // Intermediate data structure: (explored vertices, time in ms) std::vector> results; @@ -92,7 +84,7 @@ namespace morphstore { // start measuring bfs time: auto startBFSTime = std::chrono::high_resolution_clock::now(); - uint64_t exploredVertices = do_BFS(candidates[i]); + uint64_t exploredVertices = compute(graph, candidates[i]); auto finishBFSTime = std::chrono::high_resolution_clock::now(); // For measuring the execution time auto elapsedBFSTime = @@ -109,9 +101,10 @@ namespace morphstore { // open file for writing and delete existing stuff: fs.open(filename, std::fstream::out | std::ofstream::trunc); - for (uint64_t j = 0; j < results.size(); ++j) { + ss << "explored vertices | time in ms \n"; + + for (uint64_t j = 0; j < results.size(); j++) { ss << results[j].first << "," << results[j].second << "\n"; - ++j; } fs << ss.str(); @@ -119,7 +112,7 @@ namespace morphstore { } // function which returns a list of every ith vertex which is sorted by degree DESC - std::vector get_list_of_every_ith_vertex(uint64_t cycle) { + static std::vector get_list_of_every_ith_vertex(std::shared_ptr graph, uint64_t cycle) { std::vector measurementCandidates; std::vector> totalListOfVertices = DegreeMeasurement::get_list_of_degree_DESC(graph); diff --git a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h index c93a9425..0deebf8d 100644 --- a/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h +++ b/test/core/operators/graph/ldbc/bfs_ldbc_graph_test.h @@ -54,9 +54,8 @@ template void bfs_ldbc_graph_test(void) { std::cout << "Some statistics" << std::endl; graph->statistics(); - auto bfs = std::make_unique(graph); // for scale factor 1 and including static as well as dynamic part of the graph - std::cout << "Based on Vertex with id 0: " << bfs->do_BFS(0) << " vertices could be explored via BFS"; + std::cout << "Based on Vertex with id 0: " << morphstore::BFS::compute(graph, 0) << " vertices could be explored via BFS"; #else throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); #endif diff --git a/test/core/operators/graph/simple/bfs_simple_graph_test.h b/test/core/operators/graph/simple/bfs_simple_graph_test.h index bdb7260e..3d737d55 100644 --- a/test/core/operators/graph/simple/bfs_simple_graph_test.h +++ b/test/core/operators/graph/simple/bfs_simple_graph_test.h @@ -63,11 +63,8 @@ void bfs_simple_graph_test (void) { assert(graph->getVertexCount() == 4); assert(graph->getEdgeCount() == 4); - - auto bfs = std::make_unique(graph); - - - assert(bfs->do_BFS(v1) == 2); - assert(bfs->do_BFS(v2) == 1); - assert(bfs->do_BFS(v3) == 1); + + assert(morphstore::BFS::compute(graph, v1) == 2); + assert(morphstore::BFS::compute(graph, v2) == 1); + assert(morphstore::BFS::compute(graph, v3) == 1); } \ No newline at end of file From bcec970f91acd25ab31d22e4f4d905547b5021b8 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Fri, 12 Jun 2020 21:10:38 +0200 Subject: [PATCH 205/216] Add bfs benchmark for evaluating impact of graph compressions --- src/microbenchmarks/graph/CMakeLists.txt | 4 + src/microbenchmarks/graph/bfs_benchmark.cpp | 105 ++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 src/microbenchmarks/graph/bfs_benchmark.cpp diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index 8867f945..f057b57e 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -3,11 +3,15 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/edge_storage_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_csr_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_adjList_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/bfs_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) add_executable( edge_storage_benchmark_app edge_storage_benchmark.cpp) add_executable( compress_csr_benchmark_app csr_graph_compression_benchmark.cpp) add_executable( compress_adjList_benchmark_app adjList_graph_compression_benchmark.cpp) + add_executable( bfs_benchmark_app bfs_benchmark.cpp) + target_link_libraries(compress_csr_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(compress_adjList_benchmark_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(bfs_benchmark_app PRIVATE "-ldl" stdc++fs) endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/bfs_benchmark.cpp b/src/microbenchmarks/graph/bfs_benchmark.cpp new file mode 100644 index 00000000..b5b67525 --- /dev/null +++ b/src/microbenchmarks/graph/bfs_benchmark.cpp @@ -0,0 +1,105 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_benchmark.cpp + * @brief A benchmark of the csr-graph compression (using the ldbc graph) + * @todo allow different compression formats for the two csr columns; add full_iterate + */ + +#include "benchmark_helper.h" +#include +#include +#include +#include + +#include + +using namespace morphstore; + +struct CompressionBenchmarkEntry { + std::string graph_format; + std::string compr_format; + int64_t bfs_time; + int64_t visited_vertices; + + std::string to_string() { + return graph_format + "|" + compr_format + "|" + std::to_string(bfs_time) + "|" + + std::to_string(visited_vertices); + } +}; + +template void benchmark() { + + static_assert(std::is_base_of::value, + "type parameter of this method must be a graph format"); + +#ifdef LDBC_DIR + // could be also build parameters? + const int number_of_executions = 5; + const int number_of_start_vertices = 10; + + // order based on block-size (as adj-list format currently only supports decreasing blocksizes at `morph()`) + std::vector compr_formats = {GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR, + GraphCompressionFormat::DYNAMIC_VBP, + GraphCompressionFormat::UNCOMPRESSED}; + + // Load ldbc graph + // blank lines for easier deletion of progress prints + std::cout << std::endl << std::endl; + std::shared_ptr graph = std::make_shared(); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); + ldbcImport->import(*graph); + std::cout << std::endl << std::endl; + + const int cycle_size = graph->getVertexCount() / number_of_start_vertices; + auto start_vertex_ids = BFS::get_list_of_every_ith_vertex(graph, cycle_size); + + std::cout << "Test impact of compression on BFS" << std::endl; + std::cout << "Graph-Format | Compression-Format | bfs-time | visited vertices" << std::endl; + + for (auto current_f : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + current_try.compr_format = graph_compr_f_to_string(current_f); + + // restore start state (not needed as this will be not timed and morphing internally goes via uncompr) + //graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); + // morphing into desired format + graph->morph(current_f); + + for (auto id : start_vertex_ids) { + auto start = highResClock::now(); + current_try.visited_vertices = morphstore::BFS::compute(graph, id); + current_try.bfs_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + } + + } + } +#else + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); +#endif +} + +int main(void) { + benchmark(); + benchmark(); +} From 61d9938e0d0d6594ecd294771fa9078948ea88f0 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 14 Jun 2020 17:59:34 +0200 Subject: [PATCH 206/216] Add naive page rank --- include/core/operators/graph/page_rank.h | 116 +++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 include/core/operators/graph/page_rank.h diff --git a/include/core/operators/graph/page_rank.h b/include/core/operators/graph/page_rank.h new file mode 100644 index 00000000..a03f0fef --- /dev/null +++ b/include/core/operators/graph/page_rank.h @@ -0,0 +1,116 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank.h + * @brief naive page-rank implementation (based on https://en.wikipedia.org/wiki/PageRank) + * @todo multi-threaded impl? ; add tests; weighted implementation + */ + +#ifndef MORPHSTORE_PAGE_RANK +#define MORPHSTORE_PAGE_RANK + +#include + +// for equal with tolerance +#include +// for std::abs +#include + +namespace morphstore { + + struct PageRankResult { + // input parameters + uint64_t max_iterations; + float damping_factor, tolerance; + + uint64_t ran_iterations = 0; + // terminated as scores converged? + bool converged; + // i-th entry for vertex with id i + std::vector scores; + + // leaving out the scores + std::string describe() { + std::string converged_str = converged ? "True" : "False"; + return "Input-Parameters: { damping-factor: " + std::to_string(damping_factor) + + ", max-iterations: " + std::to_string(max_iterations) + + ", tolerance: " + std::to_string(tolerance) + "} \n\t\t" + + "Computed: { converged: " + converged_str + ", ran_iterations: " + std::to_string(ran_iterations) + + "}"; + } + }; + + class PageRank { + + public: + // assuming a consecutive vertex id-space + static PageRankResult compute(std::shared_ptr graph, const uint64_t max_iterations = 20, + const float damping_factor = 0.85, const float tolerance = 0.0001) { + // init score vector with 1/vertex_count; + const uint64_t vertex_count = graph->getVertexCount(); + std::vector scores(vertex_count, 1.0 / vertex_count); + + uint64_t iteration = 0; + bool converged = false; + + for (; iteration < max_iterations; iteration++) { + // init scores of current iteration + std::vector new_scores(vertex_count, (1.0 - damping_factor) / vertex_count); + + // loop over all vertices + for (uint64_t i = 0; i < vertex_count; ++i) { + const auto neighbors = graph->get_neighbors_ids(i); + + // damping_factor * (prev-it-PR(i) / degr(i)) + const auto value_to_propagate = damping_factor * (scores[i] / neighbors.size()); + + // propagate score to its neighbours + for (auto neighbor_id : neighbors) { + new_scores[neighbor_id] += value_to_propagate; + } + } + + // TODO: add a tolerance (as another condition to terminate besides max_iterations) + // would check if scores - new_scores > tolerance (break otherwise) + if (std::equal(scores.begin(), scores.end(), new_scores.begin(), new_scores.end(), + [tolerance](float score, float other_score) { + return std::abs(score - other_score) < tolerance; + })) { + converged = true; + break; + } + + scores = new_scores; + } + + // build result; + PageRankResult result; + result.damping_factor = damping_factor; + result.max_iterations = max_iterations; + result.tolerance = tolerance; + + result.converged = converged; + result.ran_iterations = iteration; + result.scores = scores; + + return result; + } + }; +} // namespace morphstore + +#endif // MORPHSTORE_PAGE_RANK From a7b787e8e9d7462e761a681870a2251522edac35 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 14 Jun 2020 18:30:17 +0200 Subject: [PATCH 207/216] Add tests for page rank --- include/core/operators/graph/page_rank.h | 8 +- test/core/operators/graph/ldbc/CMakeLists.txt | 5 ++ .../graph/ldbc/page_rank_ldbc_graph_test.cpp | 68 ++++++++++++++++ .../operators/graph/simple/CMakeLists.txt | 5 +- .../page_rank_simple_csr_graph_test.cpp | 29 +++++++ .../simple/page_rank_simple_graph_test.h | 78 +++++++++++++++++++ 6 files changed, 186 insertions(+), 7 deletions(-) create mode 100644 test/core/operators/graph/ldbc/page_rank_ldbc_graph_test.cpp create mode 100644 test/core/operators/graph/simple/page_rank_simple_csr_graph_test.cpp create mode 100644 test/core/operators/graph/simple/page_rank_simple_graph_test.h diff --git a/include/core/operators/graph/page_rank.h b/include/core/operators/graph/page_rank.h index a03f0fef..8512897b 100644 --- a/include/core/operators/graph/page_rank.h +++ b/include/core/operators/graph/page_rank.h @@ -49,7 +49,7 @@ namespace morphstore { std::string converged_str = converged ? "True" : "False"; return "Input-Parameters: { damping-factor: " + std::to_string(damping_factor) + ", max-iterations: " + std::to_string(max_iterations) + - ", tolerance: " + std::to_string(tolerance) + "} \n\t\t" + + ", tolerance: " + std::to_string(tolerance) + "} \n\t\t\t" + "Computed: { converged: " + converged_str + ", ran_iterations: " + std::to_string(ran_iterations) + "}"; } @@ -65,10 +65,10 @@ namespace morphstore { const uint64_t vertex_count = graph->getVertexCount(); std::vector scores(vertex_count, 1.0 / vertex_count); - uint64_t iteration = 0; + uint64_t iteration; bool converged = false; - for (; iteration < max_iterations; iteration++) { + for (iteration = 0; iteration < max_iterations; iteration++) { // init scores of current iteration std::vector new_scores(vertex_count, (1.0 - damping_factor) / vertex_count); @@ -85,8 +85,6 @@ namespace morphstore { } } - // TODO: add a tolerance (as another condition to terminate besides max_iterations) - // would check if scores - new_scores > tolerance (break otherwise) if (std::equal(scores.begin(), scores.end(), new_scores.begin(), new_scores.end(), [tolerance](float score, float other_score) { return std::abs(score - other_score) < tolerance; diff --git a/test/core/operators/graph/ldbc/CMakeLists.txt b/test/core/operators/graph/ldbc/CMakeLists.txt index ca06cc54..b32395e3 100644 --- a/test/core/operators/graph/ldbc/CMakeLists.txt +++ b/test/core/operators/graph/ldbc/CMakeLists.txt @@ -1,12 +1,17 @@ if ( CTEST_ALL OR CTEST_OPERATORS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/ldbc/bfs_ldbc_csr_graph_test_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/ldbc/bfs_ldbc_adj_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/ldbc/page_rank_ldbc_graph_test_app ) add_executable( bfs_ldbc_csr_graph_test_app bfs_ldbc_csr_graph_test.cpp) add_executable( bfs_ldbc_adj_graph_test_app bfs_ldbc_adj_graph_test.cpp) + add_executable( page_rank_ldbc_graph_test_app page_rank_ldbc_graph_test.cpp) + target_link_libraries(bfs_ldbc_csr_graph_test_app PRIVATE "-ldl" stdc++fs) target_link_libraries(bfs_ldbc_adj_graph_test_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(page_rank_ldbc_graph_test_app PRIVATE "-ldl" stdc++fs) add_test( bfs_ldbc_csr_graph_test_app bfs_ldbc_csr_graph_test_app ) add_test( bfs_ldbc_adj_graph_test_app bfs_ldbc_adj_graph_test_app ) + add_test( page_rank_ldbc_graph_test_app page_rank_ldbc_graph_test_app ) endif() \ No newline at end of file diff --git a/test/core/operators/graph/ldbc/page_rank_ldbc_graph_test.cpp b/test/core/operators/graph/ldbc/page_rank_ldbc_graph_test.cpp new file mode 100644 index 00000000..065edab5 --- /dev/null +++ b/test/core/operators/graph/ldbc/page_rank_ldbc_graph_test.cpp @@ -0,0 +1,68 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank_ldbc_graph_test.cpp + * @brief Test methods for PageRank on the ldbc graph (only testing csr out of simplicity) + * @todo + */ +#include +#include +#include +#include + +void print_header(std::string storageFormat) { + + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Operator-Test: LDBC " << storageFormat << " Page-Rank Test *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; +} + +template +void page_rank_ldbc_graph_test (void) { + + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + + std::shared_ptr graph = std::make_shared(); + std::string storageFormat = graph->get_storage_format(); + + print_header(storageFormat); + + // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) + std::shared_ptr ldbcImport = std::make_shared(LDBC_DIR); + + // generate vertices & edges from LDBC files and insert into graph structure + ldbcImport->import(*graph); + + // some statistics (DEBUG) + std::cout << "Some statistics" << std::endl; + graph->statistics(); + + + auto result = morphstore::PageRank::compute(graph, 30); + + std::cout << result.describe() << std::endl; + + // TODO: some assertions? +} + +int main() { + page_rank_ldbc_graph_test(); + return 0; +} \ No newline at end of file diff --git a/test/core/operators/graph/simple/CMakeLists.txt b/test/core/operators/graph/simple/CMakeLists.txt index 81a6bd93..459f107a 100644 --- a/test/core/operators/graph/simple/CMakeLists.txt +++ b/test/core/operators/graph/simple/CMakeLists.txt @@ -1,12 +1,13 @@ if ( CTEST_ALL OR CTEST_OPERATORS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/simple/bfs_simple_csr_graph_test_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/simple/bfs_simple_adj_graph_test_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/test/core/operators/graph/simple/page_rank_simple_csr_graph_test_app ) add_executable( bfs_simple_csr_graph_test_app bfs_simple_csr_graph_test.cpp) add_executable( bfs_simple_adj_graph_test_app bfs_simple_adj_graph_test.cpp) - target_link_libraries(bfs_simple_csr_graph_test_app PRIVATE "-ldl" stdc++fs) - target_link_libraries(bfs_simple_adj_graph_test_app PRIVATE "-ldl" stdc++fs) + add_executable( page_rank_simple_csr_graph_test_app page_rank_simple_csr_graph_test.cpp) add_test( bfs_simple_csr_graph_test_app bfs_simple_csr_graph_test_app ) add_test( bfs_simple_adj_graph_test_app bfs_simple_adj_graph_test_app ) + add_test( page_rank_simple_csr_graph_test_app page_rank_simple_csr_graph_test_app ) endif() \ No newline at end of file diff --git a/test/core/operators/graph/simple/page_rank_simple_csr_graph_test.cpp b/test/core/operators/graph/simple/page_rank_simple_csr_graph_test.cpp new file mode 100644 index 00000000..ca6a92ea --- /dev/null +++ b/test/core/operators/graph/simple/page_rank_simple_csr_graph_test.cpp @@ -0,0 +1,29 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank_simple_csr_graph_test.cpp + * @brief Test for page-rank on a simple graph in csr format + * @todo + */ +#include +#include "page_rank_simple_graph_test.h" + +int main( void ){ + page_rank_simple_graph_test(); + return 0; +} diff --git a/test/core/operators/graph/simple/page_rank_simple_graph_test.h b/test/core/operators/graph/simple/page_rank_simple_graph_test.h new file mode 100644 index 00000000..273bac76 --- /dev/null +++ b/test/core/operators/graph/simple/page_rank_simple_graph_test.h @@ -0,0 +1,78 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank_simple_graph_test.cpp + * @brief Test methods for PageRank on a simple graph + * @todo + */ +#include +#include + +void print_header(std::string storageFormat) { + + std::cout << "\n"; + std::cout << "**********************************************************" << std::endl; + std::cout << "* MorphStore-Operator-Test: Simple " << storageFormat << " Page-Rank Test *" << std::endl; + std::cout << "**********************************************************" << std::endl; + std::cout << "\n"; +} + +template +void page_rank_simple_graph_test (void) { + + static_assert(std::is_base_of::value, "type parameter of this method must be a graph format"); + + std::shared_ptr graph = std::make_shared(); + print_header(graph->get_storage_format()); + + graph->allocate_graph_structure(4, 4); + + std::map edgeTypeMap = {{1, "knows"}, {2, "likes"}}; + std::map vertexTypeMap = {{0, "Person"}}; + graph->setEdgeTypeDictionary(edgeTypeMap); + graph->set_vertex_type_dictionary(vertexTypeMap); + + uint64_t v1 = graph->add_vertex(); + uint64_t v2 = graph->add_vertex(); + uint64_t v3 = graph->add_vertex(); + graph->add_vertex(); + + + // + graph->add_edges(v1, {morphstore::Edge(v1, v2, 1)}); + graph->add_edges(v2, {morphstore::Edge(v2, v3, 2), morphstore::Edge(v2, v1, 1)}); + graph->add_edges(v3, {morphstore::Edge(v3, v2, 1)}); + + std::cout << "Some statistics" << std::endl; + graph->statistics(); + + assert(graph->getVertexCount() == 4); + assert(graph->getEdgeCount() == 4); + + auto result = morphstore::PageRank::compute(graph, 100); + + std::cout << result.describe() << std::endl; + + for(uint64_t i = 0; i < result.scores.size(); i++) { + std::cout << "id: " << i << " score: " << result.scores.at(i) << std::endl; + } + + assert(result.scores.at(1) > result.scores.at(0)); + assert(result.scores.at(0) == result.scores.at(2)); + assert(result.scores.at(2) > result.scores.at(3)); +} \ No newline at end of file From 448cea4b9ab91b69013a5dd95a95a0d5b0ed0bb5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 14 Jun 2020 22:41:18 +0200 Subject: [PATCH 208/216] Add page-rank benchmark --- src/microbenchmarks/graph/CMakeLists.txt | 3 + src/microbenchmarks/graph/bfs_benchmark.cpp | 3 +- .../graph/csr_graph_compression_benchmark.cpp | 2 +- .../graph/page_rank_benchmark.cpp | 98 +++++++++++++++++++ 4 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 src/microbenchmarks/graph/page_rank_benchmark.cpp diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index f057b57e..a32633a2 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -4,14 +4,17 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_csr_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_adjList_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/bfs_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/page_rank_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) add_executable( edge_storage_benchmark_app edge_storage_benchmark.cpp) add_executable( compress_csr_benchmark_app csr_graph_compression_benchmark.cpp) add_executable( compress_adjList_benchmark_app adjList_graph_compression_benchmark.cpp) add_executable( bfs_benchmark_app bfs_benchmark.cpp) + add_executable( page_rank_benchmark_app page_rank_benchmark.cpp) target_link_libraries(compress_csr_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(compress_adjList_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(bfs_benchmark_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(page_rank_benchmark_app PRIVATE "-ldl" stdc++fs) endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/bfs_benchmark.cpp b/src/microbenchmarks/graph/bfs_benchmark.cpp index b5b67525..294eee96 100644 --- a/src/microbenchmarks/graph/bfs_benchmark.cpp +++ b/src/microbenchmarks/graph/bfs_benchmark.cpp @@ -17,8 +17,7 @@ /** * @file bfs_benchmark.cpp - * @brief A benchmark of the csr-graph compression (using the ldbc graph) - * @todo allow different compression formats for the two csr columns; add full_iterate + * @brief A benchmark evaluating the impact of graph compression on breadth first search (using the ldbc graph) */ #include "benchmark_helper.h" diff --git a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp index 7354b4e1..bd932cef 100644 --- a/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp +++ b/src/microbenchmarks/graph/csr_graph_compression_benchmark.cpp @@ -18,7 +18,7 @@ /** * @file csr_graph_compression_benchmark.cpp * @brief A benchmark of the csr-graph compression (using the ldbc graph) - * @todo allow different compression formats for the two csr columns; add full_iterate + * @todo allow different compression formats for the two csr columns */ #include "benchmark_helper.h" diff --git a/src/microbenchmarks/graph/page_rank_benchmark.cpp b/src/microbenchmarks/graph/page_rank_benchmark.cpp new file mode 100644 index 00000000..e6b3fe9e --- /dev/null +++ b/src/microbenchmarks/graph/page_rank_benchmark.cpp @@ -0,0 +1,98 @@ +/********************************************************************************************** + * Copyright (C) 2020 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank_benchmark.cpp + * @brief A benchmark evaluating the impact of graph compression on PageRank (using the ldbc graph) + */ + +#include "benchmark_helper.h" +#include +#include +#include +#include + +#include + +using namespace morphstore; + +struct CompressionBenchmarkEntry { + std::string graph_format; + std::string compr_format; + uint64_t page_rank_time, ran_iterations; + + std::string to_string() { + return graph_format + "|" + compr_format + "|" + std::to_string(page_rank_time) + "|" + std::to_string(ran_iterations); + } +}; + +template void benchmark() { + + static_assert(std::is_base_of::value, + "type parameter of this method must be a graph format"); + +#ifdef LDBC_DIR + // could be also build parameters? + const int number_of_executions = 5; + + // order based on block-size (as adj-list format currently only supports decreasing blocksizes at `morph()`) + std::vector compr_formats = {GraphCompressionFormat::DELTA, GraphCompressionFormat::FOR, + GraphCompressionFormat::DYNAMIC_VBP, + GraphCompressionFormat::UNCOMPRESSED}; + + // Load ldbc graph + // blank lines for easier deletion of progress prints + std::cout << std::endl << std::endl; + std::shared_ptr graph = std::make_shared(); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); + ldbcImport->import(*graph); + std::cout << std::endl << std::endl; + + + std::cout << "Test impact of compression on BFS" << std::endl; + std::cout << "Graph-Format | Compression-Format | page_rank-time in ms | iterations ran" << std::endl; + + for (auto current_f : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + current_try.compr_format = graph_compr_f_to_string(current_f); + + // restore start state (not needed as this will be not timed and morphing internally goes via uncompr) + //graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); + // morphing into desired format + graph->morph(current_f); + + auto start = highResClock::now(); + // current default values for PageRank: max_iterations = 20, damping_factor = 0.85, tolerance = 0.0001 + current_try.ran_iterations = morphstore::PageRank::compute(graph).ran_iterations; + current_try.page_rank_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + + } + } +#else + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); +#endif +} + +int main(void) { + benchmark(); + benchmark(); +} From 6639a510b378de74ac0fd19a11a8ec81c794b602 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Sun, 14 Jun 2020 23:37:34 +0200 Subject: [PATCH 209/216] Comment more code --- doc/doxygen/pages/tutorials/quick_start.md | 2 +- include/core/operators/graph/top_down_bfs.h | 3 +- include/core/storage/graph/edge/edge.h | 8 ++-- .../core/storage/graph/edge/edges_container.h | 7 +++- .../graph/edge/edges_hashmap_container.h | 9 ++++- .../graph/edge/edges_vectorarray_container.h | 14 +++++-- .../storage/graph/formats/adjacencylist.h | 30 +++++++++----- include/core/storage/graph/formats/csr.h | 19 +++++---- include/core/storage/graph/graph.h | 40 +++++++++++++++++-- .../core/storage/graph/graph_compr_format.h | 8 ++-- .../core/storage/graph/importer/ldbc_schema.h | 2 +- include/core/storage/graph/property_type.h | 4 +- .../storage/graph/vertex/vertices_container.h | 2 +- .../graph/vertex/vertices_hashmap_container.h | 2 +- .../vertex/vertices_vectorarray_container.h | 2 +- src/microbenchmarks/graph/bfs_benchmark.cpp | 23 ++++++++++- .../graph/page_rank_benchmark.cpp | 20 +++++++++- .../simple/bfs_simple_adj_graph_test.cpp | 2 +- .../simple/bfs_simple_csr_graph_test.cpp | 2 +- .../graph/simple/bfs_simple_graph_test.h | 4 +- .../core/storage/graph/ldbc/ldbc_graph_test.h | 11 ++++- 21 files changed, 157 insertions(+), 57 deletions(-) diff --git a/doc/doxygen/pages/tutorials/quick_start.md b/doc/doxygen/pages/tutorials/quick_start.md index 958c1415..89d591c5 100644 --- a/doc/doxygen/pages/tutorials/quick_start.md +++ b/doc/doxygen/pages/tutorials/quick_start.md @@ -23,7 +23,7 @@ Ensure that you have the following tools installed before trying to build: - g++ >= version 8.2 - cmake >= version 3.10 -Older versions may not build all test cases. Note that C++14 is necessary. +Older versions may not build all test cases. Note that C++17 is necessary. To facilitate building and testing MorphStore, there is a script build.sh in the root folder. diff --git a/include/core/operators/graph/top_down_bfs.h b/include/core/operators/graph/top_down_bfs.h index 56b62dfe..3da7129d 100644 --- a/include/core/operators/graph/top_down_bfs.h +++ b/include/core/operators/graph/top_down_bfs.h @@ -18,7 +18,7 @@ /** * @file top_down_bfs.h * @brief top down BFS implementation to traverse graph - * @todo implement vectorized BFS (AVX2, AVX-512) + * @todo implement vectorized BFS (AVX2, AVX-512) ; return list of visited nodes + visiting depth maybe */ #ifndef MORPHSTORE_TOP_DOWN_BFS @@ -112,6 +112,7 @@ namespace morphstore { } // function which returns a list of every ith vertex which is sorted by degree DESC + // TODO: could be seen as a generell helper function -> move into seperate header static std::vector get_list_of_every_ith_vertex(std::shared_ptr graph, uint64_t cycle) { std::vector measurementCandidates; std::vector> totalListOfVertices = diff --git a/include/core/storage/graph/edge/edge.h b/include/core/storage/graph/edge/edge.h index 2657d52e..b6d4e60a 100644 --- a/include/core/storage/graph/edge/edge.h +++ b/include/core/storage/graph/edge/edge.h @@ -35,7 +35,7 @@ namespace morphstore { - // for loading + // for loading a graph class Edge { protected: @@ -79,13 +79,13 @@ namespace morphstore { } }; - // for internal usage + // for internal usage (inside the edges-container) class EdgeWithId : public Edge { private: uint64_t id; // delete flag - // TODO put as a std::bitset in vectorarray_container + // TODO: put as a std::bitset in vectorarray_container (as hashmap-container does not need the valid flag) bool valid = false; public: @@ -158,7 +158,7 @@ namespace morphstore { bool operator<(const EdgeWithProperties &e) const { return edge.getTargetId() < e.getEdge().getTargetId(); } }; - // for returning to user + // for returning an edge to the user class EdgeWithIdAndProperties { private: std::unordered_map properties; diff --git a/include/core/storage/graph/edge/edges_container.h b/include/core/storage/graph/edge/edges_container.h index 19a43b66..62f4897b 100644 --- a/include/core/storage/graph/edge/edges_container.h +++ b/include/core/storage/graph/edge/edges_container.h @@ -18,7 +18,7 @@ /** * @file edges_container.h * @brief abstract class for storing edges - * @todo an EntityContainer abstraction (reduce duplicated code) + * @todo an EntityContainer abstraction (reduce duplicated code to vertices_container.h) */ #ifndef MORPHSTORE_EDGES_CONTAINER_H @@ -38,21 +38,24 @@ namespace morphstore { class EdgesContainer { protected: uint64_t expected_edge_count = 0; + // ! this should be an atomic one, if multi-threaded insertion is of interest uint64_t current_max_edge_id = 0; std::map edge_type_dictionary; - // TODO: try other property storage formats than per vertex .. (triple-store or per property) + // TODO: try other property storage formats than per edge .. (triple-store or per property) std::unordered_map> edge_properties; std::string get_edge_type(unsigned short int type) const { if (edge_type_dictionary.find(type) != edge_type_dictionary.end()) { return edge_type_dictionary.at(type); } else { + // could also throw an error here return "No Matching of type-number in the database! For type " + std::to_string(type); } } + // for assigning ids uint64_t get_next_edge_id() { return current_max_edge_id++; } public: diff --git a/include/core/storage/graph/edge/edges_hashmap_container.h b/include/core/storage/graph/edge/edges_hashmap_container.h index 60cc43a8..24f71d35 100644 --- a/include/core/storage/graph/edge/edges_hashmap_container.h +++ b/include/core/storage/graph/edge/edges_hashmap_container.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -18,7 +18,7 @@ /** * @file edges__hashmap_container.h * @brief storing edges using a hashmap - * @todo an EntityHashMapContainer abstraction (reduce duplicated code) + * @todo an EntityHashMapContainer abstraction (reduce duplicated code to VertexHashMapContainer) */ #ifndef MORPHSTORE_EDGES_HASHMAP_CONTAINER_H @@ -34,6 +34,8 @@ namespace morphstore { class EdgesHashMapContainer : public EdgesContainer { protected: + // mapping edge id -> edge + // currently saving the id twice std::unordered_map edges; public: @@ -44,6 +46,7 @@ namespace morphstore { this->edges.reserve(expected_edges); } + // TODO: unpack EdgeWithId to just Edge (avoid saving edge-id twice) void insert_edge(const EdgeWithId e) override { edges[e.getId()] = e; } bool exists_edge(const uint64_t id) const override { @@ -57,6 +60,8 @@ namespace morphstore { uint64_t edge_count() const { return edges.size(); } + // memory estimation + // returns a pair of index-size, data-size std::pair get_size() const override { auto [index_size, data_size] = EdgesContainer::get_size(); diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index 04c4165b..36231862 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -17,7 +17,7 @@ /** * @file edges__vectorarray_container.h - * @brief storing edges using a vector of arrays + * @brief storing edges using a vector of arrays; assuming a consecutive id space * @todo */ @@ -33,7 +33,8 @@ #include namespace morphstore { - // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! + // very different to VerticesVectorArrayContainer as edge ids are not given at insertion time! + // (not anymore, but not considered in current implementation) // and using std::array as aligned_alloc did not set invalid flag to false (could be solveable) class EdgesVectorArrayContainer : public EdgesContainer { protected: @@ -65,7 +66,7 @@ namespace morphstore { void allocate(const uint64_t expected_edges) override { EdgesContainer::allocate(expected_edges); - + // rounding up .. only whole arrays can be allocated auto array_count = std::ceil(expected_edges / (float)edges_per_array); this->edges.reserve(array_count); @@ -75,9 +76,12 @@ namespace morphstore { } void insert_edge(EdgeWithId e) { + // not assuming sequentiell insertion (could be changed to just insert at a given position) + // and only assert that the given position matches auto array_number = get_edge_array_number(e.getId()); auto array_pos = get_pos_in_array(e.getId()); - + + // second time to assert that expected edge count is not exceeded ? if (array_number >= edges.size()) { throw std::runtime_error("Exceeded edge id limit: Edge id " + std::to_string(e.getId()) + " > " + std::to_string(edges_per_array * edges.size() - 1)); @@ -112,6 +116,8 @@ namespace morphstore { uint64_t edge_count() const override { return number_of_edges; } + // memory estimation + // returns a pair of index-size, data-size std::pair get_size() const override { auto [index_size, data_size] = EdgesContainer::get_size(); diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index ab78ac9c..f7bbdfe7 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -42,8 +42,10 @@ namespace morphstore { // const column as after finalized only read_only using adjacency_column = column_base *; using adjacency_vector = std::vector *; + // an adjacency-list can be either a column or a vector using adjacency_list_variant = std::variant; + // visitor for accessing std::variant struct Adjacency_List_Size_Visitor { size_t operator()(const adjacency_column c) const { return c->get_size_used_byte(); } size_t operator()(const adjacency_vector v) const { @@ -51,23 +53,25 @@ namespace morphstore { } }; + // visitor for accessing std::variant struct Adjacency_List_OutDegree_Visitor { uint64_t operator()(const adjacency_column c) const { - // assuming compressed col has the same value count (would not work for RLE) + // assuming compressed col has the same value count (would not work for RLE?) return c->get_count_values(); } uint64_t operator()(const adjacency_vector v) const { return v->size(); } }; - // maps the a list of outgoing edges (ids) to a vertex-id + // maps the a list of outgoing edges (ids) to a vertex-id (representing the graph topology) + // TODO: try using a vector instead an unordered_map (? faster access, but needs more memory for empty adj.-lists ?) std::unordered_map *adjacencylistPerVertex = new std::unordered_map(); // as formats allocate to much memory for small columns - // current_compression + // adj-lists with a lower degree, are stored as vectors (others are columns) uint64_t min_compr_degree = 1024; - // convert big-enough adj-vector to a (read-only) adj-column + // convert big-enough adj-vector to a (read-only) adj-column (based on the min_compr_degree) void finalize() { int vectors_transformed = 0; for (auto [id, adj_list] : *adjacencylistPerVertex) { @@ -114,7 +118,6 @@ namespace morphstore { if (std::holds_alternative(entry->second)) { throw std::runtime_error("Not implemented to add edges, if adj. list is a (compressed) column"); } - adjacencyVector = std::get(entry->second); } else { adjacencyVector = new std::vector(); @@ -127,6 +130,7 @@ namespace morphstore { public: ~AdjacencyList() { + // as vectors and columns are allocated using new -> need to delete them manually for (auto [id, adj_list] : *this->adjacencylistPerVertex) { if (std::holds_alternative(adj_list)) { delete std::get(adj_list); @@ -150,20 +154,24 @@ namespace morphstore { void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); adjacencylistPerVertex->reserve(numberVertices); - } + } + // currently new_min_compr_degree must be smaller or equal than the current min_compr_degree void set_min_compr_degree(uint64_t new_min_compr_degree) { if (new_min_compr_degree > min_compr_degree) { // allowing this would need re-transforming finalized columns to vectors + // when this is allowed, the min_compr_degree should be enough as an function parameter for finalize throw std::runtime_error("Only supporting an decreasing minimum compression degree (new: " + std::to_string(new_min_compr_degree) + ", current: " + std::to_string(min_compr_degree) + ")"); } this->min_compr_degree = new_min_compr_degree; + // applying the new min_compr_degree finalize(); } // adding a single edge to vertex: + // graph-format specific, as CSR is currently limited to bulk inserts uint64_t add_edge(uint64_t sourceId, uint64_t targetId, unsigned short int type) override { Edge e = Edge(sourceId, targetId, type); return add_edges(sourceId, {e})[0]; @@ -171,7 +179,7 @@ namespace morphstore { uint64_t get_min_compr_degree() { return min_compr_degree; } - // get number of neighbors of vertex with id + // get number of outgoing edges for the vertex with the given id uint64_t get_out_degree(uint64_t id) override { auto entry = adjacencylistPerVertex->find(id); if (entry == adjacencylistPerVertex->end()) { @@ -199,12 +207,10 @@ namespace morphstore { if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { delete uncompr_col; } - } else { edge_ids = *std::get(adj_list); } } - return edge_ids; } @@ -212,12 +218,13 @@ namespace morphstore { void morph(GraphCompressionFormat target_format) override { morph(target_format, true); } // ! vector<->column conversion overhead if min_degree is different + // ! morphing to UNCOMPRESSED results in all adj-lists being columns (instead of vectors) void morph(GraphCompressionFormat target_format, bool blocksize_based_min_degree) { if (blocksize_based_min_degree) { // as if blocksize > size of adjlist -> stays uncompressed but still allocates a whole block set_min_compr_degree(graph_compr_f_block_size(target_format)); } else { - // transform big enough vectors into columns + // transform big enough vectors into columns (based on the min_compr_degree) this->finalize(); } @@ -265,6 +272,7 @@ namespace morphstore { } // ratio of adjacency columns (rest would be vectors) + // depends on the min_compr_degree double column_ratio() const { // neither coloumns or vectors if (getEdgeCount() == 0) { @@ -283,6 +291,7 @@ namespace morphstore { // for measuring the size in bytes: std::pair get_size_of_graph() const override { + // graph-format agnostic memory usage (like storage for entities) auto [index_size, data_size] = Graph::get_size_of_graph(); // min_compr_degree @@ -290,6 +299,7 @@ namespace morphstore { // adjacencyListPerVertex index_size += sizeof(std::unordered_map); + // overhead for each map-entry index_size += adjacencylistPerVertex->size() * (sizeof(uint64_t) + sizeof(adjacency_list_variant)); for (const auto [id, adj_list] : *adjacencylistPerVertex) { diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index 4ac2bd09..f0f25cc4 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2020 by MorphStore-Team * + * Copyright (C) 2019 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -36,6 +36,8 @@ namespace morphstore { // simple cache of size 1 (to avoid decompressing the same block multiple times .. f.i. for getting the degree of a // vertex) + // ! poor mans approach and should be changed, if multi-threaded executions are done + // (cache per thread maybe .. inside a `for_all` iterator maybe) class ColumnBlockCache { private: uint64_t block_number; @@ -64,7 +66,7 @@ namespace morphstore { private: /* graph topology: * offset column: index is vertex-id; column entry contains offset in edgeId array - * edgeId column: contains edge id + * edgeId column: contains edge ids */ column_with_blockoffsets_base *offset_column; column_with_blockoffsets_base *edgeId_column; @@ -88,6 +90,7 @@ namespace morphstore { assert(expectedEdgeCount >= getEdgeCount()); // currently only read-only if compressed + // for write it would be necessary to decompress the last block; write and compress again if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + graph_compr_f_to_string(current_compression)); @@ -112,8 +115,6 @@ namespace morphstore { } uint64_t get_offset(uint64_t id) { - // TODO: use cache - auto block_size = offset_column->get_block_size(); if (current_compression == GraphCompressionFormat::UNCOMPRESSED) { @@ -140,14 +141,12 @@ namespace morphstore { } uint64_t *block_data = uncompr_block->get_data(); - auto offset = block_data[block_pos]; - return offset; } } - // DEBUG function to look into column: + // DEBUG function to look into a column: void print_column(const column_base *col, int start, int end) const { // validate interval (fix otherwise) int col_size = col->get_count_values(); @@ -180,8 +179,8 @@ namespace morphstore { std::string get_storage_format() const override { return "CSR"; } - // this function gets the number of vertices/edges and allocates memory for the graph-topology arrays - // TODO: test that no data exists before (as this will get overwritten) + // this function gets the number of vertices/edges and allocates memory for the graph-topology columns + // TODO: test that no data exists before (as this gets overwritten) void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); @@ -346,7 +345,7 @@ namespace morphstore { offset_column = morph_saving_offsets_graph_col(offset_column, current_compression, target_format, true); edgeId_column = morph_saving_offsets_graph_col(edgeId_column, current_compression, target_format, true); - // invalidating caches + // invalidating caches (as block-size may differ) if (offset_block_cache) { offset_block_cache.reset(); } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index e6195852..d2cdfd53 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -17,7 +17,7 @@ /** * @file graph.h - * @brief base graph class for any storage format --> CSR,ADJ + * @brief base graph class for any storage format --> CSR,ADJ (allowing multi-graphs) * @todo */ @@ -58,6 +58,7 @@ namespace morphstore { std::unique_ptr vertices; std::unique_ptr edges; + // graph format specific (CSR and Adj only differ in their graph topology representation) virtual void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) = 0; public: @@ -66,6 +67,7 @@ namespace morphstore { Graph(VerticesContainerType vertices_container_type = VerticesContainerType::VectorArrayContainer, EdgesContainerType edges_container_type = EdgesContainerType::VectorArrayContainer) { + // could be encapsulated in a VerticesContainer builder switch (vertices_container_type) { case VerticesContainerType::VectorArrayContainer: vertices = std::make_unique(); @@ -75,6 +77,7 @@ namespace morphstore { break; } + // could be encapsulated in a EdgesContainer builder switch (edges_container_type) { case EdgesContainerType::VectorArrayContainer: edges = std::make_unique(); @@ -85,28 +88,38 @@ namespace morphstore { } } + // human-readable form of the container (f.i. for benchmark) std::string vertices_container_description() { return vertices->container_description(); } - + + // human-readable form of the container (f.i. for benchmark) std::string edges_container_description() { return edges->container_description(); } // -------------------- Setters & Getters -------------------- + // each vertex has a type represented by a number (in Neo4j terms this would be a node label) + // this provides the semantics behind that number void set_vertex_type_dictionary(const std::map &types) { assert(types.size() != 0); this->vertices->set_vertex_type_dictionary(types); } + // each edge has a type represented by a number (in Neo4j terms this would be a relationship type) + // this provides the semantics behind that number void setEdgeTypeDictionary(const std::map &types) { assert(types.size() != 0); this->edges->set_edge_type_dictionary(types); } + // expected count provided by allocate_graph_structure uint64_t getExpectedVertexCount() const { return expectedVertexCount; } + // count of actually stored vertices uint64_t getVertexCount() const { return vertices->vertex_count(); } + // expected count provided by allocate_graph_structure uint64_t getExpectedEdgeCount() const { return expectedEdgeCount; } + // count of actually stored edges uint64_t getEdgeCount() const { return edges->edge_count(); } uint64_t add_vertex(const unsigned short int type = 0, @@ -122,19 +135,25 @@ namespace morphstore { vertices->add_property_to_vertex(id, property); }; + // only setting whole edge_properties, as adding an edge property was not needed yet void set_edge_properties(uint64_t id, const std::unordered_map properties) { edges->set_edge_properties(id, properties); }; + // human-readable form of the graph storage format virtual std::string get_storage_format() const = 0; virtual uint64_t add_edge(uint64_t from, uint64_t to, unsigned short int type) = 0; + // changing the compression format virtual void morph(GraphCompressionFormat target_format) = 0; + // outgoing, as they are only indexed in the outgoing direction virtual std::vector get_outgoing_edge_ids(uint64_t id) = 0; + // get the out_degree of a vertex (size of the adjacency list) virtual uint64_t get_out_degree(uint64_t id) = 0; - // function to return a vector of ids of neighbors for BFS alg. + // convenience method to returning the target vertex-ids of the outgoing edges std::vector get_neighbors_ids(uint64_t id) { std::vector targetVertexIds; + // guess this could be easily parallelized (using std::foreach f.i.) for (auto edge_id : get_outgoing_edge_ids(id)) { assert(edges->exists_edge(edge_id)); targetVertexIds.push_back(edges->get_edge(edge_id).getTargetId()); @@ -143,13 +162,17 @@ namespace morphstore { return targetVertexIds; }; + // returning a vector of edge-ids (order based on input edges) std::vector add_edges(uint64_t sourceId, const std::vector edges_to_add) { std::vector edge_ids; + // assertion, which are shared by all graph formats if (!vertices->exists_vertex(sourceId)) { throw std::runtime_error("Source-id not found " + std::to_string(sourceId)); } + // (multi)-graph specific and storage-format agnostic + // changes, if other formats store target ids instead of edge ids (because non multi graphs do not need edge ids) for (auto edge : edges_to_add) { if (!vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found :" + edge.to_string()); @@ -162,6 +185,8 @@ namespace morphstore { return edge_ids; }; + // looks very similar to above but for ! EdgeWithProperties ! + // extra method, as runtime polymorphism seemed ugly in C++ here (but very likely there is a better way for this) std::vector add_edges(uint64_t sourceId, const std::vector edges_to_add) { std::vector edge_ids; @@ -173,6 +198,7 @@ namespace morphstore { if (auto edge = edge_with_props.getEdge(); !vertices->exists_vertex(edge.getTargetId())) { throw std::runtime_error("Target not found :" + edge.to_string()); } + // this calls a different methods on the edges-container edge_ids.push_back(edges->add_edge(edge_with_props)); } @@ -181,6 +207,8 @@ namespace morphstore { return edge_ids; }; + // memory estimation + // returns a pair of index-size, data-size virtual std::pair get_size_of_graph() const { // including vertices + its properties + its type dict auto [index_size, data_size] = vertices->get_size(); @@ -190,9 +218,12 @@ namespace morphstore { index_size += edges_size.first; data_size += edges_size.second; - return std::make_pair(index_size, data_size); + return {index_size, data_size}; }; + + // mainly needed to allocate CSR columns + // also containers can reserve expected size virtual void allocate_graph_structure(uint64_t expected_vertices, uint64_t expected_edges) { this->expectedVertexCount = expected_vertices; this->expectedEdgeCount = expected_edges; @@ -216,6 +247,7 @@ namespace morphstore { } } + // basic statistics to be extended by graph formats virtual void statistics() { std::cout << "---------------- Statistics ----------------" << std::endl; std::cout << "Number of vertices: " << getVertexCount() << std::endl; diff --git a/include/core/storage/graph/graph_compr_format.h b/include/core/storage/graph/graph_compr_format.h index 08a4c34b..22c6a381 100644 --- a/include/core/storage/graph/graph_compr_format.h +++ b/include/core/storage/graph/graph_compr_format.h @@ -18,7 +18,7 @@ /** * @file graph_compr_format.h * @brief helper for specifying compression of graph format specific columns - * @todo + * @todo remove need for extra graph-compression format */ #ifndef MORPHSTORE_GRAPH_COMPR_FORMAT_H @@ -36,7 +36,7 @@ #include namespace morphstore { - // TODO: allow also other vector extensions (switch from safe_morph to morph) + // TODO: allow also other vector extensions (regard build flag) using ve = vectorlib::scalar>; using default_vbp = DEFAULT_DYNAMIC_VBP_F(ve); @@ -65,7 +65,9 @@ namespace morphstore { return desc; } - + + // gets m_BlockSize using the corresponding format + // as GraphCompressionFormat is just a simple enum size_t inline graph_compr_f_block_size(GraphCompressionFormat format) { size_t block_size = 1; diff --git a/include/core/storage/graph/importer/ldbc_schema.h b/include/core/storage/graph/importer/ldbc_schema.h index d8c1cac6..314c7384 100644 --- a/include/core/storage/graph/importer/ldbc_schema.h +++ b/include/core/storage/graph/importer/ldbc_schema.h @@ -19,7 +19,7 @@ * @file lbc_schema.h * @brief Schema of the LDBC graph based on * https://raw.githubusercontent.com/ldbc/ldbc_snb_docs/dev/figures/schema-comfortable.png - * @todo + * @todo search for an existing Graph-Schema language (graph schemas should be stored in the resource folder) */ #ifndef MORPHSTORE_LDBC_SCHEMA_H diff --git a/include/core/storage/graph/property_type.h b/include/core/storage/graph/property_type.h index 63ad922c..3da8c7bf 100644 --- a/include/core/storage/graph/property_type.h +++ b/include/core/storage/graph/property_type.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * @@ -17,7 +17,7 @@ /** * @file property_type.h - * @brief variant of supported data types as a property + * @brief variant of supported data types for a property (vertex or edge property) * @todo Move into dedicated sub-folder (when different property mappings exists) */ diff --git a/include/core/storage/graph/vertex/vertices_container.h b/include/core/storage/graph/vertex/vertices_container.h index 2f3f62a7..6ac9280e 100644 --- a/include/core/storage/graph/vertex/vertices_container.h +++ b/include/core/storage/graph/vertex/vertices_container.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index 97c9568d..af8a16a1 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index fece9847..0e6c94db 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -1,5 +1,5 @@ /********************************************************************************************** - * Copyright (C) 2019 by MorphStore-Team * + * Copyright (C) 2020 by MorphStore-Team * * * * This file is part of MorphStore - a compression aware vectorized column store. * * * diff --git a/src/microbenchmarks/graph/bfs_benchmark.cpp b/src/microbenchmarks/graph/bfs_benchmark.cpp index 294eee96..b5dc8d50 100644 --- a/src/microbenchmarks/graph/bfs_benchmark.cpp +++ b/src/microbenchmarks/graph/bfs_benchmark.cpp @@ -68,8 +68,27 @@ template void benchmark() { const int cycle_size = graph->getVertexCount() / number_of_start_vertices; auto start_vertex_ids = BFS::get_list_of_every_ith_vertex(graph, cycle_size); - std::cout << "Test impact of compression on BFS" << std::endl; - std::cout << "Graph-Format | Compression-Format | bfs-time | visited vertices" << std::endl; + std::cout << "Test impact of compression on BFS (10 start-nodes (evenly distributed regarding degree); 5x excutions)" << std::endl; + std::cout << "Graph-Format | Compression-Format | bfs-time in ms| visited vertices" << std::endl; + + // for AdjacencyList format a version, where all lists are stored as vectors (not morphed -> nothing finalized) + if (std::is_same::value) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + current_try.compr_format = + graph_compr_f_to_string(GraphCompressionFormat::UNCOMPRESSED) + " (all vectors)"; + + for (auto id : start_vertex_ids) { + auto start = highResClock::now(); + current_try.visited_vertices = morphstore::BFS::compute(graph, id); + current_try.bfs_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + } + } + } for (auto current_f : compr_formats) { for (int exec = 0; exec < number_of_executions; exec++) { diff --git a/src/microbenchmarks/graph/page_rank_benchmark.cpp b/src/microbenchmarks/graph/page_rank_benchmark.cpp index e6b3fe9e..39455430 100644 --- a/src/microbenchmarks/graph/page_rank_benchmark.cpp +++ b/src/microbenchmarks/graph/page_rank_benchmark.cpp @@ -63,9 +63,27 @@ template void benchmark() { std::cout << std::endl << std::endl; - std::cout << "Test impact of compression on BFS" << std::endl; + std::cout << "Test impact of compression on PageRank (5x executions)" << std::endl; std::cout << "Graph-Format | Compression-Format | page_rank-time in ms | iterations ran" << std::endl; + // for adj-list a version, where all lists are stored as vectors (not morphed -> nothing finalized) + if (std::is_same::value) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + current_try.compr_format = + graph_compr_f_to_string(GraphCompressionFormat::UNCOMPRESSED) + " (all vectors)"; + + auto start = highResClock::now(); + // current default values for PageRank: max_iterations = 20, damping_factor = 0.85, tolerance = 0.0001 + current_try.ran_iterations = morphstore::PageRank::compute(graph).ran_iterations; + current_try.page_rank_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + } + } + for (auto current_f : compr_formats) { for (int exec = 0; exec < number_of_executions; exec++) { CompressionBenchmarkEntry current_try; diff --git a/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp b/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp index 50248273..89ebf7e0 100644 --- a/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp +++ b/test/core/operators/graph/simple/bfs_simple_adj_graph_test.cpp @@ -17,7 +17,7 @@ /** * @file bfs_simple__adj_graph_test.cpp - * @brief Test for bfs of social network graph in adj list format + * @brief Test bfs on adj-list graph format * @todo */ #include diff --git a/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp b/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp index e324eb58..9d7b0ae3 100644 --- a/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp +++ b/test/core/operators/graph/simple/bfs_simple_csr_graph_test.cpp @@ -17,7 +17,7 @@ /** * @file bfs_simple_csr_graph_test.cpp - * @brief Test for bfs of social network graph in csr list format + * @brief Test for bfs on csr graph format * @todo */ #include diff --git a/test/core/operators/graph/simple/bfs_simple_graph_test.h b/test/core/operators/graph/simple/bfs_simple_graph_test.h index 3d737d55..cfa094fb 100644 --- a/test/core/operators/graph/simple/bfs_simple_graph_test.h +++ b/test/core/operators/graph/simple/bfs_simple_graph_test.h @@ -17,11 +17,9 @@ /** * @file bfs_simple_graph_test.cpp - * @brief Test methods for bfs on social network graph + * @brief Test methods for bfs on simple graph * @todo */ - -#include #include #include diff --git a/test/core/storage/graph/ldbc/ldbc_graph_test.h b/test/core/storage/graph/ldbc/ldbc_graph_test.h index 803daff3..1a942fe2 100644 --- a/test/core/storage/graph/ldbc/ldbc_graph_test.h +++ b/test/core/storage/graph/ldbc/ldbc_graph_test.h @@ -44,22 +44,29 @@ template void ldbcGraphFormatTest(void) { print_header(storageFormat); - // ldbc importer: path to csv files as parameter: (don't forget the last '/' in adress path) std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); // generate vertices & edges from LDBC files and insert into graph structure ldbcImport->import(*graph); graph->statistics(); + graph->print_vertex_by_id(1035174); + graph->print_edge_by_id(10); + graph->print_neighbors_of_vertex(1035174); + graph->morph(morphstore::GraphCompressionFormat::DELTA); graph->statistics(); - // (DEBUG) Test Vertex, which contains edges with properties (SERVER): graph->print_vertex_by_id(1035174); graph->print_edge_by_id(10); graph->print_neighbors_of_vertex(1035174); + // DEBUGGING + //for(uint64_t id = 0; id < graph->getEdgeCount(); id++) { + // graph->get_outgoing_edge_ids(id); + //} + // measure degree distribution and write to file (file path as parameter): // TODO: but this into benchmark or so .. not actual test // std::cout << "Measure degree count" << std::endl; From 974c7561de49dd0ce27ceebb626ebb6d242a7f4f Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 15 Jun 2020 19:26:19 +0200 Subject: [PATCH 210/216] Fix measurement unit description in benchmark --- src/microbenchmarks/graph/bfs_benchmark.cpp | 2 +- src/microbenchmarks/graph/page_rank_benchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/microbenchmarks/graph/bfs_benchmark.cpp b/src/microbenchmarks/graph/bfs_benchmark.cpp index b5dc8d50..4f0123c2 100644 --- a/src/microbenchmarks/graph/bfs_benchmark.cpp +++ b/src/microbenchmarks/graph/bfs_benchmark.cpp @@ -69,7 +69,7 @@ template void benchmark() { auto start_vertex_ids = BFS::get_list_of_every_ith_vertex(graph, cycle_size); std::cout << "Test impact of compression on BFS (10 start-nodes (evenly distributed regarding degree); 5x excutions)" << std::endl; - std::cout << "Graph-Format | Compression-Format | bfs-time in ms| visited vertices" << std::endl; + std::cout << "Graph-Format | Compression-Format | bfs-time in micro seconds| visited vertices" << std::endl; // for AdjacencyList format a version, where all lists are stored as vectors (not morphed -> nothing finalized) if (std::is_same::value) { diff --git a/src/microbenchmarks/graph/page_rank_benchmark.cpp b/src/microbenchmarks/graph/page_rank_benchmark.cpp index 39455430..8d9d40d1 100644 --- a/src/microbenchmarks/graph/page_rank_benchmark.cpp +++ b/src/microbenchmarks/graph/page_rank_benchmark.cpp @@ -64,7 +64,7 @@ template void benchmark() { std::cout << "Test impact of compression on PageRank (5x executions)" << std::endl; - std::cout << "Graph-Format | Compression-Format | page_rank-time in ms | iterations ran" << std::endl; + std::cout << "Graph-Format | Compression-Format | page_rank-time in micro seconds | iterations ran" << std::endl; // for adj-list a version, where all lists are stored as vectors (not morphed -> nothing finalized) if (std::is_same::value) { From 11dedfbb1d662daec0e7138e02a5e01e2a2b8460 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 15 Jun 2020 19:32:31 +0200 Subject: [PATCH 211/216] Fix documented file names --- include/core/storage/graph/edge/edges_hashmap_container.h | 2 +- include/core/storage/graph/edge/edges_vectorarray_container.h | 2 +- include/core/storage/graph/importer/ldbc_schema.h | 2 +- include/core/storage/graph/vertex/vertices_hashmap_container.h | 2 +- .../core/storage/graph/vertex/vertices_vectorarray_container.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/core/storage/graph/edge/edges_hashmap_container.h b/include/core/storage/graph/edge/edges_hashmap_container.h index 24f71d35..dfbaace7 100644 --- a/include/core/storage/graph/edge/edges_hashmap_container.h +++ b/include/core/storage/graph/edge/edges_hashmap_container.h @@ -16,7 +16,7 @@ **********************************************************************************************/ /** - * @file edges__hashmap_container.h + * @file edges_hashmap_container.h * @brief storing edges using a hashmap * @todo an EntityHashMapContainer abstraction (reduce duplicated code to VertexHashMapContainer) */ diff --git a/include/core/storage/graph/edge/edges_vectorarray_container.h b/include/core/storage/graph/edge/edges_vectorarray_container.h index 36231862..00ee32f1 100644 --- a/include/core/storage/graph/edge/edges_vectorarray_container.h +++ b/include/core/storage/graph/edge/edges_vectorarray_container.h @@ -16,7 +16,7 @@ **********************************************************************************************/ /** - * @file edges__vectorarray_container.h + * @file edges_vectorarray_container.h * @brief storing edges using a vector of arrays; assuming a consecutive id space * @todo */ diff --git a/include/core/storage/graph/importer/ldbc_schema.h b/include/core/storage/graph/importer/ldbc_schema.h index 314c7384..aa093e32 100644 --- a/include/core/storage/graph/importer/ldbc_schema.h +++ b/include/core/storage/graph/importer/ldbc_schema.h @@ -16,7 +16,7 @@ **********************************************************************************************/ /** - * @file lbc_schema.h + * @file ldbc_schema.h * @brief Schema of the LDBC graph based on * https://raw.githubusercontent.com/ldbc/ldbc_snb_docs/dev/figures/schema-comfortable.png * @todo search for an existing Graph-Schema language (graph schemas should be stored in the resource folder) diff --git a/include/core/storage/graph/vertex/vertices_hashmap_container.h b/include/core/storage/graph/vertex/vertices_hashmap_container.h index af8a16a1..c3fc6696 100644 --- a/include/core/storage/graph/vertex/vertices_hashmap_container.h +++ b/include/core/storage/graph/vertex/vertices_hashmap_container.h @@ -16,7 +16,7 @@ **********************************************************************************************/ /** - * @file vertices__hashmap_container.h + * @file vertices_hashmap_container.h * @brief storing vertices using a hashmap * @todo */ diff --git a/include/core/storage/graph/vertex/vertices_vectorarray_container.h b/include/core/storage/graph/vertex/vertices_vectorarray_container.h index 0e6c94db..0a89042c 100644 --- a/include/core/storage/graph/vertex/vertices_vectorarray_container.h +++ b/include/core/storage/graph/vertex/vertices_vectorarray_container.h @@ -16,7 +16,7 @@ **********************************************************************************************/ /** - * @file vertices__vectorarray_container.h + * @file vertices_vectorarray_container.h * @brief storing vertices using a vector of arrays * @todo */ From 4943d76eec8712c44e724d5aa275a65a42b47956 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 15 Jun 2020 22:29:33 +0200 Subject: [PATCH 212/216] Add a getting started section for the graph module --- doc/doxygen/pages/tutorials/quick_start.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/doxygen/pages/tutorials/quick_start.md b/doc/doxygen/pages/tutorials/quick_start.md index 89d591c5..8c2cd1c5 100644 --- a/doc/doxygen/pages/tutorials/quick_start.md +++ b/doc/doxygen/pages/tutorials/quick_start.md @@ -42,9 +42,29 @@ build/src/examples/example_query ~~~ This builds some example queries in debug mode and runs them. The source code of these queries can be found in the folder src/examples. -They are runnig in scalar mode. Thus, every system providing C++14 support should be able to build and run them regardless of any (not) +They are runnig in scalar mode. Thus, every system providing C++17 support should be able to build and run them regardless of any (not) available vector extensions. + +The Graph Module +====================== + +The graph module mainly contains the two different graph storage formats, which differ in their representation of the graph topology `Compressed Sparse Row (CSR)` and `Adjacency-List`. +These underlying graph model is a multi-graph, with properties for vertices and edges as well as types for both. +The model is very similar to the Property-Graph model, except that vertices can only have one type (instead of multiple labels). + +The columns describing the graph topology can be compressed using formats from the MorphStore. +Besides there exists simple implementations of the graph algorithms `breadth-first search (bfs)` and `PageRank`. + + +To run all the test and micro-benchmarks, a LDBC graph has to be generated. +Instructions of how to generate the graph, can be found at `https://github.com/ldbc/ldbc_snb_datagen`. +By default the ldbc graph is expected to be at `"$HOME/ldbc_snb_datagen/social_network/"`. +This can be changed at `/Morphstore/Engine/CMakeLists.txt`. + + + + Test Vector Extensions ====================== From 4ce2b7eb41d55530e644d1a42bfc52ff156a2bd5 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 16 Jun 2020 10:12:43 +0200 Subject: [PATCH 213/216] Comment new morph operators and correcting the number of block-offsets to be reserved --- doc/doxygen/pages/tutorials/quick_start.md | 2 +- include/core/morphing/decompress_column_block.h | 2 +- include/core/morphing/morph_saving_offsets.h | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/doxygen/pages/tutorials/quick_start.md b/doc/doxygen/pages/tutorials/quick_start.md index 8c2cd1c5..a4809f83 100644 --- a/doc/doxygen/pages/tutorials/quick_start.md +++ b/doc/doxygen/pages/tutorials/quick_start.md @@ -49,7 +49,7 @@ available vector extensions. The Graph Module ====================== -The graph module mainly contains the two different graph storage formats, which differ in their representation of the graph topology `Compressed Sparse Row (CSR)` and `Adjacency-List`. +The graph module mainly contains the two different graph storage formats `Compressed Sparse Row (CSR)` and `Adjacency-List`, which differ in their representation of the graph topology . These underlying graph model is a multi-graph, with properties for vertices and edges as well as types for both. The model is very similar to the Property-Graph model, except that vertices can only have one type (instead of multiple labels). diff --git a/include/core/morphing/decompress_column_block.h b/include/core/morphing/decompress_column_block.h index f67d2db7..5b244181 100644 --- a/include/core/morphing/decompress_column_block.h +++ b/include/core/morphing/decompress_column_block.h @@ -17,7 +17,7 @@ /** * @file decompress_column_block.h - * @brief Decompressing column blocks based on column_with_blockoffsets. + * @brief Decompressing single blocks of a column based on column_with_blockoffsets. */ #ifndef MORPHSTORE_CORE_MORPHING_DECOMPRESS_COLUMN_BLOCK_H diff --git a/include/core/morphing/morph_saving_offsets.h b/include/core/morphing/morph_saving_offsets.h index ed303827..e89495f6 100644 --- a/include/core/morphing/morph_saving_offsets.h +++ b/include/core/morphing/morph_saving_offsets.h @@ -17,7 +17,7 @@ /** * @file morph_saving_offset.h - * @brief based on morph.h, just calling morph_batch_t for every block (if blocksize > 1) + * @brief based on morph.h, just calling morph_batch_t for every block and saving its offset (if blocksize > 1) */ #ifndef MORPHSTORE_CORE_MORPHING_MORPH_SAVING_OFFSETS_H @@ -133,8 +133,9 @@ namespace morphstore { struct morph_saving_offsets_t { using src_f = uncompr_f; + // saving the offsets for every value would have an unacceptable overhead static_assert(t_dst_f::m_BlockSize != 1, - "Blocksize of 1 is only expected for uncompr_f .. block-wise morph is useless in that case"); + "Blocksize of 1 is only expected for uncompr_f .. block-wise morph is useless in this case"); static column_with_blockoffsets *apply(column_with_blockoffsets *inCol_with_offsets) { @@ -147,8 +148,7 @@ namespace morphstore { const size_t countLog = inCol->get_count_values(); const size_t outCountLogCompr = round_down_to_multiple(countLog, t_BlockSize); const size_t outSizeRestByte = uncompr_f::get_size_max_byte(countLog - outCountLogCompr); - block_offsets->reserve(outCountLogCompr + 1); - + const uint8_t *in8 = inCol->get_data(); auto outCol = new column(get_size_max_byte_any_len(countLog)); @@ -156,7 +156,8 @@ namespace morphstore { const uint8_t *const initOut8 = out8; const size_t countBlocks = countLog / t_BlockSize; - + block_offsets->reserve(countBlocks); + // morphing each block and save the offset for (size_t blockIdx = 0; blockIdx < countBlocks; blockIdx++) { // saving the start address of the block From 0c46f8fe3db4e2cd510926f83796a859582b09ae Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 16 Jun 2020 21:43:07 +0200 Subject: [PATCH 214/216] Allow different compression formats for the different csr columns --- .../storage/graph/formats/adjacencylist.h | 4 ++ include/core/storage/graph/formats/csr.h | 62 ++++++++++++------- include/core/storage/graph/graph.h | 5 +- 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/include/core/storage/graph/formats/adjacencylist.h b/include/core/storage/graph/formats/adjacencylist.h index f7bbdfe7..a2ddbbf2 100644 --- a/include/core/storage/graph/formats/adjacencylist.h +++ b/include/core/storage/graph/formats/adjacencylist.h @@ -62,6 +62,8 @@ namespace morphstore { uint64_t operator()(const adjacency_vector v) const { return v->size(); } }; + GraphCompressionFormat current_compression = GraphCompressionFormat::UNCOMPRESSED; + // maps the a list of outgoing edges (ids) to a vertex-id (representing the graph topology) // TODO: try using a vector instead an unordered_map (? faster access, but needs more memory for empty adj.-lists ?) std::unordered_map *adjacencylistPerVertex = @@ -150,6 +152,8 @@ namespace morphstore { std::string get_storage_format() const override { return "Adjacency_List"; } + std::string get_compression_format() const override { return graph_compr_f_to_string(current_compression); } + // function: to set graph allocations void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { Graph::allocate_graph_structure(numberVertices, numberEdges); diff --git a/include/core/storage/graph/formats/csr.h b/include/core/storage/graph/formats/csr.h index f0f25cc4..0feb05c3 100644 --- a/include/core/storage/graph/formats/csr.h +++ b/include/core/storage/graph/formats/csr.h @@ -71,12 +71,21 @@ namespace morphstore { column_with_blockoffsets_base *offset_column; column_with_blockoffsets_base *edgeId_column; + // current compression formats (to be removed when class accepts template parameters for the formats) + GraphCompressionFormat offsets_compression = GraphCompressionFormat::UNCOMPRESSED; + GraphCompressionFormat edgeIds_compression = GraphCompressionFormat::UNCOMPRESSED; + // for faster sequentiell access (not respected in memory usage yet) .. ideally encapsulated in an iterator // as already for getting edge-ids the same block is decompressed 3x otherwise std::unique_ptr offset_block_cache = nullptr; // assuming most degrees are << block-size std::unique_ptr edgeIds_block_cache = nullptr; + bool is_uncompressed() { + return offsets_compression == GraphCompressionFormat::UNCOMPRESSED && + edgeIds_compression == GraphCompressionFormat::UNCOMPRESSED; + } + protected: // this function fills the graph-topology-arrays sequentially in the order of vertex-ids ASC void add_to_vertex_edges_mapping(uint64_t sourceID, const std::vector edge_ids) override { @@ -91,9 +100,9 @@ namespace morphstore { // currently only read-only if compressed // for write it would be necessary to decompress the last block; write and compress again - if (current_compression != GraphCompressionFormat::UNCOMPRESSED) { + if (!is_uncompressed()) { throw std::runtime_error("Edge insertion only allowed in uncompressed format. Current format: " + - graph_compr_f_to_string(current_compression)); + get_compression_format()); } uint64_t *offset_data = offset_column->get_column()->get_data(); @@ -117,7 +126,7 @@ namespace morphstore { uint64_t get_offset(uint64_t id) { auto block_size = offset_column->get_block_size(); - if (current_compression == GraphCompressionFormat::UNCOMPRESSED) { + if (offsets_compression == GraphCompressionFormat::UNCOMPRESSED) { uint64_t *col_data = offset_column->get_column()->get_data(); return col_data[id]; } else { @@ -134,7 +143,7 @@ namespace morphstore { } else { //std::cout << "cache miss" << std::endl; - uncompr_block = decompress_column_block(offset_column, current_compression, block_number); + uncompr_block = decompress_column_block(offset_column, offsets_compression, block_number); // update cache offset_block_cache = std::make_unique(block_number, uncompr_block); @@ -179,6 +188,11 @@ namespace morphstore { std::string get_storage_format() const override { return "CSR"; } + std::string get_compression_format() const override { + return "offsets: " + graph_compr_f_to_string(offsets_compression) + + ", edgeIds: " + graph_compr_f_to_string(edgeIds_compression); + } + // this function gets the number of vertices/edges and allocates memory for the graph-topology columns // TODO: test that no data exists before (as this gets overwritten) void allocate_graph_structure(uint64_t numberVertices, uint64_t numberEdges) override { @@ -248,7 +262,7 @@ namespace morphstore { auto block_size = edgeId_column->get_block_size(); - if (current_compression == GraphCompressionFormat::UNCOMPRESSED) { + if (edgeIds_compression == GraphCompressionFormat::UNCOMPRESSED) { uint64_t *col_data = edgeId_column->get_column()->get_data(); result.insert(result.end(), col_data + start, col_data + end); } else { @@ -284,7 +298,7 @@ namespace morphstore { cache_hit = true; } else { // std::cout << "edgeId_col cache miss" << std::endl; - uncompr_block = decompress_column_block(edgeId_column, current_compression, block_number); + uncompr_block = decompress_column_block(edgeId_column, edgeIds_compression, block_number); } uint64_t *block_data = uncompr_block->get_data(); @@ -330,30 +344,32 @@ namespace morphstore { } void morph(GraphCompressionFormat target_format) override { + morph(target_format, target_format); + } + + // allowing different compressions for offset column and edgeId column + void morph(GraphCompressionFormat target_offset_format, GraphCompressionFormat target_edgeId_format) { #if DEBUG std::cout << "Morphing graph format specific data structures from " - << graph_compr_f_to_string(current_compression) << " to " - << graph_compr_f_to_string(target_format) << std::endl; -#endif - if (current_compression == target_format) { -#if DEBUG - std::cout << "Already in " << graph_compr_f_to_string(target_format); + << graph_compr_f_to_string(get_compression_format()) << " to " + << "offsets: " graph_compr_f_to_string(target_offset_format) + << " edgeIds: " << graph_compr_f_to_string(target_edgeId_format) << std::endl; #endif - return; - } - offset_column = morph_saving_offsets_graph_col(offset_column, current_compression, target_format, true); - edgeId_column = morph_saving_offsets_graph_col(edgeId_column, current_compression, target_format, true); + offset_column = morph_saving_offsets_graph_col(offset_column, offsets_compression, target_offset_format, true); + edgeId_column = morph_saving_offsets_graph_col(edgeId_column, edgeIds_compression, target_edgeId_format, true); // invalidating caches (as block-size may differ) if (offset_block_cache) { offset_block_cache.reset(); } + if (edgeIds_block_cache) { edgeIds_block_cache.reset(); } - this->current_compression = target_format; + this->offsets_compression = target_offset_format; + this->edgeIds_compression = target_edgeId_format; } // get size of storage format: @@ -369,24 +385,24 @@ namespace morphstore { return {index_size, data_size}; } - double offset_column_compr_ratio() { return compression_ratio(offset_column, current_compression); } + double offset_column_compr_ratio() { return compression_ratio(offset_column, offsets_compression); } - double edgeId_column_compr_ratio() { return compression_ratio(edgeId_column, current_compression); } + double edgeId_column_compr_ratio() { return compression_ratio(edgeId_column, edgeIds_compression); } - std::string get_column_info(column_with_blockoffsets_base *col_with_offsets) { + std::string get_column_info(column_with_blockoffsets_base *col_with_offsets, GraphCompressionFormat format) { auto col = col_with_offsets->get_column(); return " values: " + std::to_string(col->get_count_values()) + " size in bytes: " + std::to_string(col->get_size_used_byte()) + - " compression ratio: " + std::to_string(compression_ratio(col_with_offsets, current_compression)) + + " compression ratio: " + std::to_string(compression_ratio(col_with_offsets, format)) + " number of blocks (if blocksize > 1): " + std::to_string(col_with_offsets->get_block_offsets()->size()); } void statistics() override { Graph::statistics(); - std::cout << "offset column: " << get_column_info(offset_column) << std::endl; - std::cout << "edgeId column: " << get_column_info(edgeId_column) << std::endl; + std::cout << "offset column: " << get_column_info(offset_column, offsets_compression) << std::endl; + std::cout << "edgeId column: " << get_column_info(edgeId_column, edgeIds_compression) << std::endl; std::cout << "--------------------------------------------" << std::endl; std::cout << std::endl << std::endl; } diff --git a/include/core/storage/graph/graph.h b/include/core/storage/graph/graph.h index d2cdfd53..edfa830f 100644 --- a/include/core/storage/graph/graph.h +++ b/include/core/storage/graph/graph.h @@ -49,8 +49,6 @@ namespace morphstore { class Graph { protected: - GraphCompressionFormat current_compression = GraphCompressionFormat::UNCOMPRESSED; - // TODO: actually just needed for CSR format (could be moved) uint64_t expectedVertexCount; uint64_t expectedEdgeCount; @@ -142,6 +140,7 @@ namespace morphstore { // human-readable form of the graph storage format virtual std::string get_storage_format() const = 0; + virtual std::string get_compression_format() const = 0; virtual uint64_t add_edge(uint64_t from, uint64_t to, unsigned short int type) = 0; // changing the compression format virtual void morph(GraphCompressionFormat target_format) = 0; @@ -255,7 +254,7 @@ namespace morphstore { << std::endl; std::cout << "Number of edges: " << getEdgeCount() << std::endl; std::cout << "Number of edges with properties:" << edges->edges_with_properties_count() << std::endl; - std::cout << "Compression Format:" << graph_compr_f_to_string(current_compression) << std::endl; + std::cout << "Compression Format:" << get_compression_format() << std::endl; } void print_vertex_by_id(uint64_t id) { From f031f63afcfd8b2c68c9cc9ec90b254ac9418ea3 Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Tue, 16 Jun 2020 21:43:27 +0200 Subject: [PATCH 215/216] Benchmark partial csr compressions --- src/microbenchmarks/graph/CMakeLists.txt | 6 + .../bfs_csr_partial_compression_benchmark.cpp | 110 ++++++++++++++++++ .../graph/page_rank_benchmark.cpp | 9 +- ...rank_csr_partial_compression_benchmark.cpp | 102 ++++++++++++++++ 4 files changed, 222 insertions(+), 5 deletions(-) create mode 100644 src/microbenchmarks/graph/bfs_csr_partial_compression_benchmark.cpp create mode 100644 src/microbenchmarks/graph/page_rank_csr_partial_compression_benchmark.cpp diff --git a/src/microbenchmarks/graph/CMakeLists.txt b/src/microbenchmarks/graph/CMakeLists.txt index a32633a2..f0ba91b7 100644 --- a/src/microbenchmarks/graph/CMakeLists.txt +++ b/src/microbenchmarks/graph/CMakeLists.txt @@ -5,6 +5,8 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/compress_adjList_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/bfs_benchmark_app ) FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/page_rank_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/bfs_csr_partial_compression_benchmark_app ) + FILE( REMOVE ${CMAKE_BINARY_DIR}/src/microbenchmarks/graph/page_rank_csr_partial_compression_benchmark_app ) add_executable( vertex_storage_benchmark_app vertex_storage_benchmark.cpp) add_executable( edge_storage_benchmark_app edge_storage_benchmark.cpp) @@ -12,9 +14,13 @@ if ( BUILD_ALL OR BUILD_MICROBMS ) add_executable( compress_adjList_benchmark_app adjList_graph_compression_benchmark.cpp) add_executable( bfs_benchmark_app bfs_benchmark.cpp) add_executable( page_rank_benchmark_app page_rank_benchmark.cpp) + add_executable( bfs_csr_partial_compression_benchmark_app bfs_csr_partial_compression_benchmark.cpp) + add_executable( page_rank_csr_partial_compression_benchmark_app page_rank_csr_partial_compression_benchmark.cpp) target_link_libraries(compress_csr_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(compress_adjList_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(bfs_benchmark_app PRIVATE "-ldl" stdc++fs) target_link_libraries(page_rank_benchmark_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(bfs_csr_partial_compression_benchmark_app PRIVATE "-ldl" stdc++fs) + target_link_libraries(page_rank_csr_partial_compression_benchmark_app PRIVATE "-ldl" stdc++fs) endif() \ No newline at end of file diff --git a/src/microbenchmarks/graph/bfs_csr_partial_compression_benchmark.cpp b/src/microbenchmarks/graph/bfs_csr_partial_compression_benchmark.cpp new file mode 100644 index 00000000..0f77bd57 --- /dev/null +++ b/src/microbenchmarks/graph/bfs_csr_partial_compression_benchmark.cpp @@ -0,0 +1,110 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file bfs_csr_partial_compression_benchmark.cpp + * @brief A benchmark evaluating the impact of graph compression on breadth first search (using the ldbc graph) and + * only compressing on csr column + * @todo cleanup benchmark (this was only created last minute) + */ + +#include "benchmark_helper.h" +#include +#include +#include + +#include + +using namespace morphstore; + +struct CompressionBenchmarkEntry { + std::string graph_format; + std::string compr_format; + int64_t bfs_time; + int64_t visited_vertices; + + std::string to_string() { + return graph_format + "|" + compr_format + "|" + std::to_string(bfs_time) + "|" + + std::to_string(visited_vertices); + } +}; + +int main(void) { +#ifdef LDBC_DIR + // could be also build parameters? + const int number_of_executions = 5; + const int number_of_start_vertices = 10; + + // combination of uncompress + other + std::vector> compr_formats = { + {GraphCompressionFormat::DELTA, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::FOR, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::DYNAMIC_VBP, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::DELTA}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::FOR}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::DYNAMIC_VBP}}; + + // Load ldbc graph + // blank lines for easier deletion of progress prints + std::cout << std::endl << std::endl; + std::shared_ptr graph = std::make_shared(); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); + ldbcImport->import(*graph); + std::cout << std::endl << std::endl; + + const int cycle_size = graph->getVertexCount() / number_of_start_vertices; + auto start_vertex_ids = BFS::get_list_of_every_ith_vertex(graph, cycle_size); + + // BFS + std::cout + << "Test impact of compression on BFS (10 start-nodes (evenly distributed regarding degree); 5x excutions)" + << std::endl; + std::cout << "Graph-Format | Compression-Format | bfs-time in micro seconds| visited vertices" << std::endl; + + for (auto [offset_format, edgeId_format] : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + + if(offset_format != GraphCompressionFormat::UNCOMPRESSED) { + current_try.compr_format = graph_compr_f_to_string(offset_format); + current_try.graph_format += "(only offsets compressed)"; + } + else if (edgeId_format != GraphCompressionFormat::UNCOMPRESSED) { + current_try.compr_format = graph_compr_f_to_string(edgeId_format); + current_try.graph_format += "(only edgeIds compressed)"; + } + + // restore start state (not needed as this will be not timed and morphing internally goes via uncompr) + // graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); + // morphing into desired format + graph->morph(offset_format, edgeId_format); + + for (auto id : start_vertex_ids) { + auto start = highResClock::now(); + current_try.visited_vertices = morphstore::BFS::compute(graph, id); + current_try.bfs_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + } + } + } +#else + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); +#endif +} diff --git a/src/microbenchmarks/graph/page_rank_benchmark.cpp b/src/microbenchmarks/graph/page_rank_benchmark.cpp index 8d9d40d1..36706951 100644 --- a/src/microbenchmarks/graph/page_rank_benchmark.cpp +++ b/src/microbenchmarks/graph/page_rank_benchmark.cpp @@ -36,7 +36,8 @@ struct CompressionBenchmarkEntry { uint64_t page_rank_time, ran_iterations; std::string to_string() { - return graph_format + "|" + compr_format + "|" + std::to_string(page_rank_time) + "|" + std::to_string(ran_iterations); + return graph_format + "|" + compr_format + "|" + std::to_string(page_rank_time) + "|" + + std::to_string(ran_iterations); } }; @@ -62,7 +63,6 @@ template void benchmark() { ldbcImport->import(*graph); std::cout << std::endl << std::endl; - std::cout << "Test impact of compression on PageRank (5x executions)" << std::endl; std::cout << "Graph-Format | Compression-Format | page_rank-time in micro seconds | iterations ran" << std::endl; @@ -91,7 +91,7 @@ template void benchmark() { current_try.compr_format = graph_compr_f_to_string(current_f); // restore start state (not needed as this will be not timed and morphing internally goes via uncompr) - //graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); + // graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); // morphing into desired format graph->morph(current_f); @@ -100,9 +100,8 @@ template void benchmark() { current_try.ran_iterations = morphstore::PageRank::compute(graph).ran_iterations; current_try.page_rank_time = get_duration(start); - // for saving into csv file, just use "> xyz.csv" at execution + // for saving into csv file, just use "> xyz.csv" at execution std::cout << current_try.to_string() << std::endl; - } } #else diff --git a/src/microbenchmarks/graph/page_rank_csr_partial_compression_benchmark.cpp b/src/microbenchmarks/graph/page_rank_csr_partial_compression_benchmark.cpp new file mode 100644 index 00000000..9895135e --- /dev/null +++ b/src/microbenchmarks/graph/page_rank_csr_partial_compression_benchmark.cpp @@ -0,0 +1,102 @@ +/********************************************************************************************** + * Copyright (C) 2019 by MorphStore-Team * + * * + * This file is part of MorphStore - a compression aware vectorized column store. * + * * + * This program is free software: you can redistribute it and/or modify it under the * + * terms of the GNU General Public License as published by the Free Software Foundation, * + * either version 3 of the License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with this program. * + * If not, see . * + **********************************************************************************************/ + +/** + * @file page_rank_csr_partial_compression_benchmark.cpp + * @brief A benchmark evaluating the impact of graph compression on PageRank (using the ldbc graph) and + * only compressing on csr column + * @todo cleanup benchmark (this was only created last minute) + */ + +#include "benchmark_helper.h" +#include +#include +#include + +#include + +using namespace morphstore; + +struct CompressionBenchmarkEntry { + std::string graph_format; + std::string compr_format; + uint64_t page_rank_time, ran_iterations; + + std::string to_string() { + return graph_format + "|" + compr_format + "|" + std::to_string(page_rank_time) + "|" + + std::to_string(ran_iterations); + } +}; + +int main(void) { +#ifdef LDBC_DIR + // could be also build parameters? + const int number_of_executions = 5; + + // combination of uncompress + other + std::vector> compr_formats = { + {GraphCompressionFormat::DELTA, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::FOR, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::DYNAMIC_VBP, GraphCompressionFormat::UNCOMPRESSED}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::DELTA}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::FOR}, + {GraphCompressionFormat::UNCOMPRESSED, GraphCompressionFormat::DYNAMIC_VBP}}; + + // Load ldbc graph + // blank lines for easier deletion of progress prints + std::cout << std::endl << std::endl; + std::shared_ptr graph = std::make_shared(); + std::unique_ptr ldbcImport = std::make_unique(LDBC_DIR); + ldbcImport->import(*graph); + std::cout << std::endl << std::endl; + + // PageRank + std::cout << "Test impact of compression on PageRank (5x executions)" << std::endl; + std::cout << "Graph-Format | Compression-Format | page_rank-time in micro seconds | iterations ran" << std::endl; + + for (auto [offset_format, edgeId_format] : compr_formats) { + for (int exec = 0; exec < number_of_executions; exec++) { + CompressionBenchmarkEntry current_try; + current_try.graph_format = graph->get_storage_format(); + + if(offset_format != GraphCompressionFormat::UNCOMPRESSED) { + current_try.compr_format = graph_compr_f_to_string(offset_format); + current_try.graph_format += "(only offsets compressed)"; + } + else if (edgeId_format != GraphCompressionFormat::UNCOMPRESSED) { + current_try.compr_format = graph_compr_f_to_string(edgeId_format); + current_try.graph_format += "(only edgeIds compressed)"; + } + + // restore start state (not needed as this will be not timed and morphing internally goes via uncompr) + // graph->morph(GraphCompressionFormat::UNCOMPRESSED, false); + // morphing into desired format + graph->morph(offset_format, edgeId_format); + + auto start = highResClock::now(); + // current default values for PageRank: max_iterations = 20, damping_factor = 0.85, tolerance = 0.0001 + current_try.ran_iterations = morphstore::PageRank::compute(graph).ran_iterations; + current_try.page_rank_time = get_duration(start); + + // for saving into csv file, just use "> xyz.csv" at execution + std::cout << current_try.to_string() << std::endl; + } + } +#else + throw std::invalid_argument("You forgot to define/uncomment the LDBC_DIR (at CMakeList.txt)"); +#endif +} From 6f731e5d859ffdb369c7f1dfcb93134c00223f7c Mon Sep 17 00:00:00 2001 From: Florentin Doerre Date: Mon, 22 Jun 2020 17:31:26 +0200 Subject: [PATCH 216/216] Allow fallback to string if data-type could not be found in schema looks like the ldbc graph schema is not final .. they added a forum type in the dev branch --- .../core/storage/graph/importer/ldbc_import.h | 22 ++++++++++++------- .../core/storage/graph/importer/ldbc_schema.h | 5 ++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/core/storage/graph/importer/ldbc_import.h b/include/core/storage/graph/importer/ldbc_import.h index 0adf9af1..0e922fb8 100644 --- a/include/core/storage/graph/importer/ldbc_import.h +++ b/include/core/storage/graph/importer/ldbc_import.h @@ -203,8 +203,10 @@ namespace morphstore { property_key = row.substr(last, next - last); data_type = get_data_type(vertexType, property_key); if (data_type == Ldbc_Data_Type::ERROR) { - throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + - " could not be found in schema"); + std::cout + << "Unexpected property: in " << file.string() + << ":" << vertexType << ":" << property_key << " could not be found in schema"; + data_type = Ldbc_Data_Type::STRING; } attributes.push_back(std::make_pair(property_key, data_type)); last = next + 1; @@ -213,8 +215,10 @@ namespace morphstore { property_key = row.substr(last); data_type = get_data_type(vertexType, property_key); if (data_type == Ldbc_Data_Type::ERROR) { - throw std::invalid_argument(file.string() + ":" + vertexType + ":" + property_key + - " could not be found in schema"); + std::cout + << "Unexpected property: in " << file.string() + << ":" << vertexType << ":" << property_key << " could not be found in schema"; + data_type = Ldbc_Data_Type::STRING; } attributes.push_back(std::make_pair(property_key, data_type)); } else { @@ -518,10 +522,12 @@ namespace morphstore { if (start == 0) { propertyKey = row.substr(row.find(delimiter) + 1); data_type = get_data_type(sourceVertexType, propertyKey); - if (data_type == Ldbc_Data_Type::ERROR) - throw std::invalid_argument(file.string() + ":" + edgeType + ":" + - propertyKey + " could not be found in schema"); - + if (data_type == Ldbc_Data_Type::ERROR) { + std::cout + << "Unexpected property: in " << file.string() + << ":" << edgeType << ":" << propertyKey << " could not be found in schema"; + data_type = Ldbc_Data_Type::STRING; + } } else { // (1) write data to vector: if key is already present, over write value // (simplicity: we take the newest one) diff --git a/include/core/storage/graph/importer/ldbc_schema.h b/include/core/storage/graph/importer/ldbc_schema.h index aa093e32..7470b954 100644 --- a/include/core/storage/graph/importer/ldbc_schema.h +++ b/include/core/storage/graph/importer/ldbc_schema.h @@ -54,7 +54,10 @@ namespace morphstore { {"language", Ldbc_Data_Type::STRING}, {"browserUsed", Ldbc_Data_Type::STRING}, {"locationIP", Ldbc_Data_Type::STRING}}}, - {"forum", {{"creationDate", Ldbc_Data_Type::DATE_TIME}, {"title", Ldbc_Data_Type::LONG_STRING}}}, + {"forum", + {{"creationDate", Ldbc_Data_Type::DATE_TIME}, + {"title", Ldbc_Data_Type::LONG_STRING}, + {"type", Ldbc_Data_Type::STRING}}}, {"post", {{"creationDate", Ldbc_Data_Type::DATE_TIME}, {"browserUsed", Ldbc_Data_Type::STRING},