From 7ed7b8921d569c58e8227fe9e93b16566d847e19 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Sat, 19 Feb 2022 20:16:17 -0500 Subject: [PATCH 01/23] Add dataset parse/load library and use it in tilt --- dataset_parser/CMakeLists.txt | 30 +++++++++ dataset_parser/parser.cpp | 30 +++++++++ dataset_util/include/data_gen.h | 31 +++++++++ dataset_util/include/data_loader.h | 37 ++++++++++ dataset_util/include/data_parser.h | 62 +++++++++++++++++ dataset_util/include/taxi/taxi_data_gen.h | 75 +++++++++++++++++++++ dataset_util/include/taxi/taxi_data_print.h | 28 ++++++++ dataset_util/protos/taxi_trip.proto | 23 +++++++ tilt_bench/CMakeLists.txt | 18 ++++- tilt_bench/main.cpp | 14 ++++ 10 files changed, 345 insertions(+), 3 deletions(-) create mode 100644 dataset_parser/CMakeLists.txt create mode 100644 dataset_parser/parser.cpp create mode 100644 dataset_util/include/data_gen.h create mode 100644 dataset_util/include/data_loader.h create mode 100644 dataset_util/include/data_parser.h create mode 100644 dataset_util/include/taxi/taxi_data_gen.h create mode 100644 dataset_util/include/taxi/taxi_data_print.h create mode 100644 dataset_util/protos/taxi_trip.proto diff --git a/dataset_parser/CMakeLists.txt b/dataset_parser/CMakeLists.txt new file mode 100644 index 0000000..af7b941 --- /dev/null +++ b/dataset_parser/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 3.16.3) +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + +project(dataset_parser) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +get_filename_component(PROJECT_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +set(Protobuf_IMPORT_DIRS ${PROJECT_ROOT_DIR}/dataset_util/protos) + +find_package(Boost 1.78.0 REQUIRED COMPONENTS date_time) +find_package(protobuf CONFIG REQUIRED) + +if(protobuf_VERBOSE) + message(STATUS "Using Protocol Buffers ${protobuf_VERSION}") +endif() + +include_directories(${PROJECT_ROOT_DIR}/dataset_util/include ${PROTOBUF_INCLUDE_DIRS} ${Boost_INCLUDE_DIR}) +set(CMAKE_INCLUDE_CURRENT_DIR TRUE) + +add_executable(parser parser.cpp ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) + +target_link_libraries(parser protobuf::libprotobuf ${Boost_LIBRARIES}) + +set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) +protobuf_generate(TARGET parser LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) \ No newline at end of file diff --git a/dataset_parser/parser.cpp b/dataset_parser/parser.cpp new file mode 100644 index 0000000..96830e3 --- /dev/null +++ b/dataset_parser/parser.cpp @@ -0,0 +1,30 @@ +#include + +#include "taxi_trip.pb.h" + +#include +#include + +using namespace std; + +int main(int argc, char* argv[]) { + if (argc != 2) { + cerr << "Usage: " << endl; + } + + GOOGLE_PROTOBUF_VERIFY_VERSION; + + fstream file(argv[1]); + if (!file.is_open()) { + cerr << "Cannot open file " << argv[1] << endl; + return 1; + } + + taxi_trip_data_gen data_generator; + data_parser parser(file, &data_generator); + parser.parse(); + + file.close(); + google::protobuf::ShutdownProtobufLibrary(); + return 0; +} diff --git a/dataset_util/include/data_gen.h b/dataset_util/include/data_gen.h new file mode 100644 index 0000000..46c58cc --- /dev/null +++ b/dataset_util/include/data_gen.h @@ -0,0 +1,31 @@ +#ifndef DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ +#define DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ + +#include + +using namespace std; +using namespace boost; + +template +class data_gen +{ +public: + data_gen(){} + ~data_gen(){} + virtual void gen_data(vector&, T*) = 0; + + float stof_err_handle(string &str) + { + try { return stof(str); } + catch (std::exception& e) { return 0.0f; } + } + + int64_t parse_datetime_to_seconds(string &datetime, posix_time::ptime &start_time) + { + auto time = posix_time::time_from_string(datetime); + auto diff = time - start_time; + return diff.total_seconds(); + } +}; + +#endif // DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ \ No newline at end of file diff --git a/dataset_util/include/data_loader.h b/dataset_util/include/data_loader.h new file mode 100644 index 0000000..892d894 --- /dev/null +++ b/dataset_util/include/data_loader.h @@ -0,0 +1,37 @@ +#ifndef DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ +#define DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ + +#include + +#include +#include +#include + +using namespace std; + +template +class data_loader +{ +private: + google::protobuf::io::IstreamInputStream raw_in; + google::protobuf::io::CodedInputStream coded_in; +public: + data_loader() : + raw_in(&cin), + coded_in(&raw_in) + {} + ~data_loader(){} + + bool load_data(T& t) { + bool clean_eof; + if (!google::protobuf::util::ParseDelimitedFromCodedStream(&t, &coded_in, &clean_eof)) { + if (!clean_eof) { + cerr << "Fail to parse data from coded input stream." << endl; + } + return false; + } + return true; + } +}; + +#endif // DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ \ No newline at end of file diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h new file mode 100644 index 0000000..73867e4 --- /dev/null +++ b/dataset_util/include/data_parser.h @@ -0,0 +1,62 @@ +#ifndef DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ +#define DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ + +#include + +#include + +#include + +using namespace std; + +template +class data_parser +{ +private: + fstream &file; + data_gen *data_generator; +public: + data_parser(fstream &file, data_gen *data_generator) : + file(file), + data_generator(data_generator) + {} + + bool parse_csv_line(fstream &file, vector &row) { + string line; + + if (getline(file, line)) { + string word; + stringstream ss(line); + + while (getline(ss, word, ',')) { + row.push_back(word); + } + + return true; + } + return false; + } + + bool parse() { + string line; + getline(file, line); + + vector row; + while (true) { + if (!parse_csv_line(file, row)) { + break; + } + + T data; + data_generator->gen_data(row, &data); + if (!google::protobuf::util::SerializeDelimitedToOstream(data, &cout)) { + cerr << "Fail to serialize data into output stream" << endl; + return false; + } + row.clear(); + } + return true; + } +}; + +#endif // DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ diff --git a/dataset_util/include/taxi/taxi_data_gen.h b/dataset_util/include/taxi/taxi_data_gen.h new file mode 100644 index 0000000..b7f989c --- /dev/null +++ b/dataset_util/include/taxi/taxi_data_gen.h @@ -0,0 +1,75 @@ +#ifndef DATASET_LOADER_TAXI_DATA_LOADER_H_ +#define DATASET_LOADER_TAXI_DATA_LOADER_H_ + +#include + +#include + +#include + +#include + +using namespace std; +using namespace boost; + +class taxi_trip_data_gen : public data_gen +{ +private: + enum TAXI_DATA_INDEX { + MEDALLION, + HACK_LICENSE, + VENDOR_ID, + RATE_CODE, + STORE_AND_FWD_FLAG, + PICKUP_DATETIME, + DROPOFF_DATETIME, + PASSENGER_COUNT, + TRIP_TIME_IN_SECS, + TRIP_DISTANCE, + PICKUP_LONGITUDE, + PICKUP_LATITUDE, + DROPOFF_LONGITUDE, + DROPOFF_LATITUDE + }; + posix_time::ptime start_time; + +public: + taxi_trip_data_gen() : + start_time(boost::gregorian::date(1970, 1, 1)) + {} + ~taxi_trip_data_gen(){} + + void gen_data(vector &row, stream::taxi_trip *trip) override { + int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); + int64_t et = this->parse_datetime_to_seconds(row[DROPOFF_DATETIME], start_time); + int32_t medallion = stoi(row[MEDALLION]); + int32_t hack_license = stoi(row[HACK_LICENSE]); + string vendor_id = row[VENDOR_ID]; + int32_t rate_code = stoi(row[RATE_CODE]); + bool store_and_fwd_flag = false; + int32_t passenger_count = stoi(row[PASSENGER_COUNT]); + int32_t trip_time_in_secs = stoi(row[TRIP_TIME_IN_SECS]); + float trip_distance = this->stof_err_handle(row[TRIP_DISTANCE]); + float pickup_longitude = this->stof_err_handle(row[PICKUP_LONGITUDE]); + float pickup_latitude = this->stof_err_handle(row[PICKUP_LATITUDE]); + float dropoff_longitude = this->stof_err_handle(row[DROPOFF_LONGITUDE]); + float dropoff_latitude = this->stof_err_handle(row[DROPOFF_LATITUDE]); + + trip->set_st(st); + trip->set_et(et); + trip->set_medallion(medallion); + trip->set_hack_license(hack_license); + trip->set_vendor_id(vendor_id); + trip->set_rate_code(rate_code); + trip->set_store_and_fwd_flag(store_and_fwd_flag); + trip->set_passenger_count(passenger_count); + trip->set_trip_time_in_secs(trip_time_in_secs); + trip->set_trip_distance(trip_distance); + trip->set_dropoff_longitude(dropoff_longitude); + trip->set_pickup_latitude(pickup_latitude); + trip->set_pickup_longitude(pickup_longitude); + trip->set_dropoff_latitude(dropoff_latitude); + } +}; + +#endif // DATASET_LOADER_TAXI_DATA_LOADER_H_ \ No newline at end of file diff --git a/dataset_util/include/taxi/taxi_data_print.h b/dataset_util/include/taxi/taxi_data_print.h new file mode 100644 index 0000000..0bdfc87 --- /dev/null +++ b/dataset_util/include/taxi/taxi_data_print.h @@ -0,0 +1,28 @@ +#ifndef DATASET_LOADER_TAXI_DATA_PRINT_H_ +#define DATASET_LOADER_TAXI_DATA_PRINT_H_ + +#include + +#include + +using namespace std; + +ostream& operator<< (ostream& out, stream::taxi_trip const& trip) +{ + out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; + out << "medallion: " << trip.medallion() << ", "; + out << "hack_license: " << trip.hack_license() << ", "; + out << "vendor_id: " << trip.vendor_id() << ", "; + out << "rate_code: " << trip.rate_code() << ", "; + out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; + out << "passenger_count: " << trip.passenger_count() << ", "; + out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; + out << "trip_distance: " << trip.trip_distance() << ", "; + out << "pickup_longitude: " << trip.pickup_longitude() << ", "; + out << "pickup_latitude: " << trip.pickup_latitude() << ", "; + out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; + out << "dropoff_latitude: " << trip.dropoff_latitude(); + return out; +} + +#endif // DATASET_LOADER_TAXI_DATA_PRINT_H_ \ No newline at end of file diff --git a/dataset_util/protos/taxi_trip.proto b/dataset_util/protos/taxi_trip.proto new file mode 100644 index 0000000..6a6068c --- /dev/null +++ b/dataset_util/protos/taxi_trip.proto @@ -0,0 +1,23 @@ +syntax = "proto2"; +package stream; + +option java_multiple_files = true; +option java_package = "com.stream.taxi.protos"; +option java_outer_classname = "TaxiProtos"; + +message taxi_trip { + required int64 st = 1; + required int64 et = 2; + required int32 medallion = 3; + required int32 hack_license = 4; + required string vendor_id = 5; + required int32 rate_code = 6; + required bool store_and_fwd_flag = 7; + required int32 passenger_count = 8; + required int32 trip_time_in_secs = 9; + required float trip_distance = 10; + required float pickup_longitude = 11; + required float pickup_latitude = 12; + required float dropoff_longitude = 13; + required float dropoff_latitude = 14; +} \ No newline at end of file diff --git a/tilt_bench/CMakeLists.txt b/tilt_bench/CMakeLists.txt index 644ec43..8bea447 100644 --- a/tilt_bench/CMakeLists.txt +++ b/tilt_bench/CMakeLists.txt @@ -9,18 +9,30 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) find_package(LLVM 11.0.0 REQUIRED CONFIG) +find_package(protobuf CONFIG REQUIRED) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") +if(protobuf_VERBOSE) + message(STATUS "Using Protocol Buffers ${protobuf_VERSION}") +endif() + +get_filename_component(PROJECT_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +set(Protobuf_IMPORT_DIRS ${PROJECT_ROOT_DIR}/dataset_util/protos) add_subdirectory(tilt/third_party/easy_jit) -include_directories(${LLVM_INCLUDE_DIRS} include tilt/tilt/include) +include_directories(${LLVM_INCLUDE_DIRS} ${PROTOBUF_INCLUDE_DIRS} include tilt/tilt/include ${PROJECT_ROOT_DIR}/dataset_util/include) +set(CMAKE_INCLUDE_CURRENT_DIR TRUE) add_definitions(${LLVM_DEFINITIONS}) llvm_map_components_to_libnames(llvm_libs native orcjit mcjit objcarcopts) set(easyjit_lib "${CMAKE_BINARY_DIR}/tilt/third_party/easy_jit/bin/EasyJitPass.so") add_subdirectory(tilt/tilt) -add_executable(main main.cpp) -target_link_libraries(main tilt) +add_executable(main main.cpp ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) + +target_link_libraries(main protobuf::libprotobuf tilt) + +set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) +protobuf_generate(TARGET main LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) \ No newline at end of file diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index fabfafa..1dfd721 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -2,6 +2,10 @@ #include #include +#include +#include +#include + #include "tilt_select.h" #include "tilt_where.h" #include "tilt_aggregate.h" @@ -22,6 +26,8 @@ using namespace std; int main(int argc, char** argv) { + GOOGLE_PROTOBUF_VERIFY_VERSION; + const rlim_t kStackSize = 2 * 1024 * 1024 * 1024; // min stack size = 2 GB struct rlimit rl; int result; @@ -37,6 +43,13 @@ int main(int argc, char** argv) } } + data_loader loader; + while (true) { + stream::taxi_trip trip; + loader.load_data(trip); + cout << trip << endl; + } + string testcase = (argc > 1) ? argv[1] : "select"; int64_t size = (argc > 2) ? atoi(argv[2]) : 100000000; int64_t period = 1; @@ -113,5 +126,6 @@ int main(int argc, char** argv) cout << "Testcase: " << testcase <<", Size: " << size << ", Time: " << setprecision(3) << time / 1000000 << endl; + google::protobuf::ShutdownProtobufLibrary(); return 0; } From 8b803efa3e77b1257fa6420866eba456c1a9bbdc Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Mon, 21 Feb 2022 05:06:45 -0500 Subject: [PATCH 02/23] combine parser and data-gen --- dataset_parser/parser.cpp | 6 ++-- dataset_util/include/data_gen.h | 31 ------------------ dataset_util/include/data_parser.h | 32 +++++++++++++------ dataset_util/include/taxi/taxi_data_print.h | 28 ---------------- .../{taxi/taxi_data_gen.h => taxi_data.h} | 30 +++++++++++++---- tilt_bench/main.cpp | 2 +- 6 files changed, 50 insertions(+), 79 deletions(-) delete mode 100644 dataset_util/include/data_gen.h delete mode 100644 dataset_util/include/taxi/taxi_data_print.h rename dataset_util/include/{taxi/taxi_data_gen.h => taxi_data.h} (67%) diff --git a/dataset_parser/parser.cpp b/dataset_parser/parser.cpp index 96830e3..1b9d347 100644 --- a/dataset_parser/parser.cpp +++ b/dataset_parser/parser.cpp @@ -2,8 +2,7 @@ #include "taxi_trip.pb.h" -#include -#include +#include using namespace std; @@ -20,8 +19,7 @@ int main(int argc, char* argv[]) { return 1; } - taxi_trip_data_gen data_generator; - data_parser parser(file, &data_generator); + taxi_trip_data_parser parser(file); parser.parse(); file.close(); diff --git a/dataset_util/include/data_gen.h b/dataset_util/include/data_gen.h deleted file mode 100644 index 46c58cc..0000000 --- a/dataset_util/include/data_gen.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ -#define DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ - -#include - -using namespace std; -using namespace boost; - -template -class data_gen -{ -public: - data_gen(){} - ~data_gen(){} - virtual void gen_data(vector&, T*) = 0; - - float stof_err_handle(string &str) - { - try { return stof(str); } - catch (std::exception& e) { return 0.0f; } - } - - int64_t parse_datetime_to_seconds(string &datetime, posix_time::ptime &start_time) - { - auto time = posix_time::time_from_string(datetime); - auto diff = time - start_time; - return diff.total_seconds(); - } -}; - -#endif // DATASET_STREAMER_DATA_GEN_DATA_GEN_H_ \ No newline at end of file diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index 73867e4..57b324a 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -3,9 +3,9 @@ #include -#include +#include -#include +#include using namespace std; @@ -14,12 +14,21 @@ class data_parser { private: fstream &file; - data_gen *data_generator; -public: - data_parser(fstream &file, data_gen *data_generator) : - file(file), - data_generator(data_generator) - {} + virtual void gen_data(vector&, T*) = 0; + +protected: + float stof_err_handle(string &str) + { + try { return stof(str); } + catch (std::exception& e) { return 0.0f; } + } + + int64_t parse_datetime_to_seconds(string &datetime, boost::posix_time::ptime &start_time) + { + auto time = boost::posix_time::time_from_string(datetime); + auto diff = time - start_time; + return diff.total_seconds(); + } bool parse_csv_line(fstream &file, vector &row) { string line; @@ -37,6 +46,11 @@ class data_parser return false; } +public: + data_parser(fstream &file) : + file(file) + {} + bool parse() { string line; getline(file, line); @@ -48,7 +62,7 @@ class data_parser } T data; - data_generator->gen_data(row, &data); + gen_data(row, &data); if (!google::protobuf::util::SerializeDelimitedToOstream(data, &cout)) { cerr << "Fail to serialize data into output stream" << endl; return false; diff --git a/dataset_util/include/taxi/taxi_data_print.h b/dataset_util/include/taxi/taxi_data_print.h deleted file mode 100644 index 0bdfc87..0000000 --- a/dataset_util/include/taxi/taxi_data_print.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef DATASET_LOADER_TAXI_DATA_PRINT_H_ -#define DATASET_LOADER_TAXI_DATA_PRINT_H_ - -#include - -#include - -using namespace std; - -ostream& operator<< (ostream& out, stream::taxi_trip const& trip) -{ - out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; - out << "medallion: " << trip.medallion() << ", "; - out << "hack_license: " << trip.hack_license() << ", "; - out << "vendor_id: " << trip.vendor_id() << ", "; - out << "rate_code: " << trip.rate_code() << ", "; - out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; - out << "passenger_count: " << trip.passenger_count() << ", "; - out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; - out << "trip_distance: " << trip.trip_distance() << ", "; - out << "pickup_longitude: " << trip.pickup_longitude() << ", "; - out << "pickup_latitude: " << trip.pickup_latitude() << ", "; - out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; - out << "dropoff_latitude: " << trip.dropoff_latitude(); - return out; -} - -#endif // DATASET_LOADER_TAXI_DATA_PRINT_H_ \ No newline at end of file diff --git a/dataset_util/include/taxi/taxi_data_gen.h b/dataset_util/include/taxi_data.h similarity index 67% rename from dataset_util/include/taxi/taxi_data_gen.h rename to dataset_util/include/taxi_data.h index b7f989c..df5815a 100644 --- a/dataset_util/include/taxi/taxi_data_gen.h +++ b/dataset_util/include/taxi_data.h @@ -7,12 +7,11 @@ #include -#include +#include using namespace std; -using namespace boost; -class taxi_trip_data_gen : public data_gen +class taxi_trip_data_parser : public data_parser { private: enum TAXI_DATA_INDEX { @@ -31,13 +30,14 @@ class taxi_trip_data_gen : public data_gen DROPOFF_LONGITUDE, DROPOFF_LATITUDE }; - posix_time::ptime start_time; + boost::posix_time::ptime start_time; public: - taxi_trip_data_gen() : + taxi_trip_data_parser(fstream &file) : + data_parser(file), start_time(boost::gregorian::date(1970, 1, 1)) {} - ~taxi_trip_data_gen(){} + ~taxi_trip_data_parser(){} void gen_data(vector &row, stream::taxi_trip *trip) override { int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); @@ -72,4 +72,22 @@ class taxi_trip_data_gen : public data_gen } }; +ostream& operator<< (ostream& out, stream::taxi_trip const& trip) +{ + out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; + out << "medallion: " << trip.medallion() << ", "; + out << "hack_license: " << trip.hack_license() << ", "; + out << "vendor_id: " << trip.vendor_id() << ", "; + out << "rate_code: " << trip.rate_code() << ", "; + out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; + out << "passenger_count: " << trip.passenger_count() << ", "; + out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; + out << "trip_distance: " << trip.trip_distance() << ", "; + out << "pickup_longitude: " << trip.pickup_longitude() << ", "; + out << "pickup_latitude: " << trip.pickup_latitude() << ", "; + out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; + out << "dropoff_latitude: " << trip.dropoff_latitude(); + return out; +} + #endif // DATASET_LOADER_TAXI_DATA_LOADER_H_ \ No newline at end of file diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index 1dfd721..fb030fa 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include "tilt_select.h" #include "tilt_where.h" From bb819ad820d166ac59df923e628cfea04eda98c0 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Mon, 21 Feb 2022 17:54:17 -0500 Subject: [PATCH 03/23] Parse taxi trip dataset as a folder --- dataset_parser/CMakeLists.txt | 2 +- dataset_parser/parser.cpp | 18 +++---- dataset_util/include/data_parser.h | 52 ++++++++++--------- .../{taxi_data.h => taxi_data_parser.h} | 51 +++++++++++++++--- tilt_bench/main.cpp | 3 +- 5 files changed, 80 insertions(+), 46 deletions(-) rename dataset_util/include/{taxi_data.h => taxi_data_parser.h} (72%) diff --git a/dataset_parser/CMakeLists.txt b/dataset_parser/CMakeLists.txt index af7b941..c9024f6 100644 --- a/dataset_parser/CMakeLists.txt +++ b/dataset_parser/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) get_filename_component(PROJECT_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) set(Protobuf_IMPORT_DIRS ${PROJECT_ROOT_DIR}/dataset_util/protos) -find_package(Boost 1.78.0 REQUIRED COMPONENTS date_time) +find_package(Boost 1.78.0 REQUIRED COMPONENTS date_time filesystem) find_package(protobuf CONFIG REQUIRED) if(protobuf_VERBOSE) diff --git a/dataset_parser/parser.cpp b/dataset_parser/parser.cpp index 1b9d347..960bebd 100644 --- a/dataset_parser/parser.cpp +++ b/dataset_parser/parser.cpp @@ -2,27 +2,21 @@ #include "taxi_trip.pb.h" -#include +#include using namespace std; int main(int argc, char* argv[]) { - if (argc != 2) { - cerr << "Usage: " << endl; - } - GOOGLE_PROTOBUF_VERIFY_VERSION; - fstream file(argv[1]); - if (!file.is_open()) { - cerr << "Cannot open file " << argv[1] << endl; - return 1; + if (argc != 2) { + cerr << "Usage: " << endl; } + string dataset_dir = argv[1]; - taxi_trip_data_parser parser(file); + taxi_trip_data_parser parser(dataset_dir); parser.parse(); - - file.close(); + google::protobuf::ShutdownProtobufLibrary(); return 0; } diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index 57b324a..5fcbf6f 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -12,23 +12,9 @@ using namespace std; template class data_parser { -private: - fstream &file; - virtual void gen_data(vector&, T*) = 0; - protected: - float stof_err_handle(string &str) - { - try { return stof(str); } - catch (std::exception& e) { return 0.0f; } - } - - int64_t parse_datetime_to_seconds(string &datetime, boost::posix_time::ptime &start_time) - { - auto time = boost::posix_time::time_from_string(datetime); - auto diff = time - start_time; - return diff.total_seconds(); - } + virtual bool parse() = 0; + virtual void gen_data(vector&, T*) = 0; bool parse_csv_line(fstream &file, vector &row) { string line; @@ -46,12 +32,7 @@ class data_parser return false; } -public: - data_parser(fstream &file) : - file(file) - {} - - bool parse() { + bool parse_csv_file(fstream &file) { string line; getline(file, line); @@ -63,14 +44,37 @@ class data_parser T data; gen_data(row, &data); - if (!google::protobuf::util::SerializeDelimitedToOstream(data, &cout)) { - cerr << "Fail to serialize data into output stream" << endl; + if (!write_serialized_to_ostream(data)) { return false; } row.clear(); } return true; } + + bool write_serialized_to_ostream(T &t) { + if (!google::protobuf::util::SerializeDelimitedToOstream(t, &cout)) { + cerr << "Fail to serialize data into output stream" << endl; + return false; + } + return true; + } + + float stof_err_handle(string &str) + { + try { return stof(str); } + catch (std::exception& e) { return 0.0f; } + } + + int64_t parse_datetime_to_seconds(string &datetime, boost::posix_time::ptime &start_time) + { + auto time = boost::posix_time::time_from_string(datetime); + auto diff = time - start_time; + return diff.total_seconds(); + } + +public: + data_parser(){} }; #endif // DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ diff --git a/dataset_util/include/taxi_data.h b/dataset_util/include/taxi_data_parser.h similarity index 72% rename from dataset_util/include/taxi_data.h rename to dataset_util/include/taxi_data_parser.h index df5815a..6f72cfd 100644 --- a/dataset_util/include/taxi_data.h +++ b/dataset_util/include/taxi_data_parser.h @@ -2,14 +2,17 @@ #define DATASET_LOADER_TAXI_DATA_LOADER_H_ #include +#include #include +#include #include #include using namespace std; +using namespace boost::filesystem; class taxi_trip_data_parser : public data_parser { @@ -30,14 +33,9 @@ class taxi_trip_data_parser : public data_parser DROPOFF_LONGITUDE, DROPOFF_LATITUDE }; + string &dataset_dir; boost::posix_time::ptime start_time; - -public: - taxi_trip_data_parser(fstream &file) : - data_parser(file), - start_time(boost::gregorian::date(1970, 1, 1)) - {} - ~taxi_trip_data_parser(){} + const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; void gen_data(vector &row, stream::taxi_trip *trip) override { int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); @@ -70,6 +68,45 @@ class taxi_trip_data_parser : public data_parser trip->set_pickup_longitude(pickup_longitude); trip->set_dropoff_latitude(dropoff_latitude); } + +public: + taxi_trip_data_parser(string &dataset_dir) : + dataset_dir(dataset_dir), + start_time(boost::gregorian::date(1970, 1, 1)) + {} + ~taxi_trip_data_parser(){} + + bool parse() override { + const path data_dir(dataset_dir); + if (!is_directory(data_dir)) { + cerr << "Directory " << dataset_dir << " does not exist." << endl; + return false; + } + + for (auto &foil_folder : foil_folders) { + path foil_dir = data_dir / foil_folder; + if (!is_directory(foil_dir)) { + cerr << "Directory " << foil_dir << " is skipped because it does not exist" << endl; + continue; + } + + size_t i = 1; + while (true) { + path trip_data_file = foil_dir / ("trip_data_" + std::to_string(i) + ".csv"); + if (!exists(trip_data_file)) { + break; + } + cerr << "Parsing " << trip_data_file << endl; + std::fstream trip_csv_file(trip_data_file.string()); + parse_csv_file(trip_csv_file); + + trip_csv_file.close(); + i += 1; + } + } + + return true; + } }; ostream& operator<< (ostream& out, stream::taxi_trip const& trip) diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index fb030fa..6d6727a 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -2,9 +2,8 @@ #include #include -#include #include -#include +#include #include "tilt_select.h" #include "tilt_where.h" From e41008dd762abe21000344747bc75989665ee5e8 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Mon, 21 Feb 2022 23:03:18 -0500 Subject: [PATCH 04/23] Add taxi fare dataset --- dataset_parser/CMakeLists.txt | 4 +- dataset_parser/parser.cpp | 18 +- dataset_util/include/taxi_data_parser.h | 192 +++++++++++++----- .../protos/{taxi_trip.proto => taxi.proto} | 15 ++ tilt_bench/CMakeLists.txt | 4 +- tilt_bench/main.cpp | 8 +- 6 files changed, 175 insertions(+), 66 deletions(-) rename dataset_util/protos/{taxi_trip.proto => taxi.proto} (62%) diff --git a/dataset_parser/CMakeLists.txt b/dataset_parser/CMakeLists.txt index c9024f6..f86b630 100644 --- a/dataset_parser/CMakeLists.txt +++ b/dataset_parser/CMakeLists.txt @@ -22,9 +22,9 @@ endif() include_directories(${PROJECT_ROOT_DIR}/dataset_util/include ${PROTOBUF_INCLUDE_DIRS} ${Boost_INCLUDE_DIR}) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) -add_executable(parser parser.cpp ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) +add_executable(parser parser.cpp ${Protobuf_IMPORT_DIRS}/taxi.proto) target_link_libraries(parser protobuf::libprotobuf ${Boost_LIBRARIES}) set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET parser LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) \ No newline at end of file +protobuf_generate(TARGET parser LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file diff --git a/dataset_parser/parser.cpp b/dataset_parser/parser.cpp index 960bebd..4cafb8d 100644 --- a/dataset_parser/parser.cpp +++ b/dataset_parser/parser.cpp @@ -1,6 +1,6 @@ #include -#include "taxi_trip.pb.h" +#include "taxi.pb.h" #include @@ -9,13 +9,21 @@ using namespace std; int main(int argc, char* argv[]) { GOOGLE_PROTOBUF_VERIFY_VERSION; - if (argc != 2) { - cerr << "Usage: " << endl; + if (argc != 3) { + cerr << "Usage: " << endl; } string dataset_dir = argv[1]; + string dataset_name = argv[2]; - taxi_trip_data_parser parser(dataset_dir); - parser.parse(); + if (dataset_name == "taxi_trip") { + taxi_trip_data_parser parser(dataset_dir); + parser.parse(); + } else if (dataset_name == "taxi_fare") { + taxi_fare_data_parser parser(dataset_dir); + parser.parse(); + } else { + throw std::runtime_error("Unknown dataset name."); + } google::protobuf::ShutdownProtobufLibrary(); return 0; diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 6f72cfd..3882cb8 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -7,17 +7,101 @@ #include #include -#include +#include #include using namespace std; using namespace boost::filesystem; -class taxi_trip_data_parser : public data_parser +ostream& operator<< (ostream& out, stream::taxi_trip const& trip) +{ + out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; + out << "medallion: " << trip.medallion() << ", "; + out << "hack_license: " << trip.hack_license() << ", "; + out << "vendor_id: " << trip.vendor_id() << ", "; + out << "rate_code: " << trip.rate_code() << ", "; + out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; + out << "passenger_count: " << trip.passenger_count() << ", "; + out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; + out << "trip_distance: " << trip.trip_distance() << ", "; + out << "pickup_longitude: " << trip.pickup_longitude() << ", "; + out << "pickup_latitude: " << trip.pickup_latitude() << ", "; + out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; + out << "dropoff_latitude: " << trip.dropoff_latitude(); + return out; +} + +ostream& operator<< (ostream& out, stream::taxi_fare const& fare) +{ + out << "taxi_trip[" << fare.st() << ", " << fare.et() << "]: "; + out << "medallion: " << fare.medallion() << ", "; + out << "hack_license: " << fare.hack_license() << ", "; + out << "vendor_id: " << fare.vendor_id() << ", "; + out << "payment_type: " << fare.payment_type() << ", "; + out << "fare_amount: " << fare.fare_amount() << ", "; + out << "surcharge: " << fare.surcharge() << ", "; + out << "mta_tax: " << fare.mta_tax() << ", "; + out << "tip_amount: " << fare.tip_amount() << ", "; + out << "tolls_amount: " << fare.tolls_amount() << ", "; + out << "total_amount: " << fare.total_amount() << ", "; + + return out; +} + +template +class taxi_data_parser : public data_parser +{ +protected: + string &dataset_dir; + string file_name_prefix; + const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; + +public: + taxi_data_parser(string &dataset_dir, string file_name_prefix) : + dataset_dir(dataset_dir), + file_name_prefix(file_name_prefix) + {} + ~taxi_data_parser(){} + + bool parse() override { + const path data_dir(dataset_dir); + if (!is_directory(data_dir)) { + cerr << "Directory " << dataset_dir << " does not exist." << endl; + return false; + } + + for (auto &foil_folder : foil_folders) { + path foil_dir = data_dir / foil_folder; + if (!is_directory(foil_dir)) { + cerr << "Directory " << foil_dir << " is skipped because it does not exist" << endl; + continue; + } + + size_t i = 1; + while (true) { + path trip_data_file = foil_dir / (file_name_prefix + std::to_string(i) + ".csv"); + if (!exists(trip_data_file)) { + break; + } + cerr << "Parsing " << trip_data_file << endl; + std::fstream trip_csv_file(trip_data_file.string()); + this->parse_csv_file(trip_csv_file); + + trip_csv_file.close(); + i += 1; + } + } + + return true; + } +}; + + +class taxi_trip_data_parser : public taxi_data_parser { private: - enum TAXI_DATA_INDEX { + enum TAXI_TRIP_DATA_INDEX { MEDALLION, HACK_LICENSE, VENDOR_ID, @@ -33,9 +117,7 @@ class taxi_trip_data_parser : public data_parser DROPOFF_LONGITUDE, DROPOFF_LATITUDE }; - string &dataset_dir; boost::posix_time::ptime start_time; - const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; void gen_data(vector &row, stream::taxi_trip *trip) override { int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); @@ -71,60 +153,64 @@ class taxi_trip_data_parser : public data_parser public: taxi_trip_data_parser(string &dataset_dir) : - dataset_dir(dataset_dir), + taxi_data_parser(dataset_dir, "trip_data_"), start_time(boost::gregorian::date(1970, 1, 1)) {} ~taxi_trip_data_parser(){} +}; - bool parse() override { - const path data_dir(dataset_dir); - if (!is_directory(data_dir)) { - cerr << "Directory " << dataset_dir << " does not exist." << endl; - return false; - } - - for (auto &foil_folder : foil_folders) { - path foil_dir = data_dir / foil_folder; - if (!is_directory(foil_dir)) { - cerr << "Directory " << foil_dir << " is skipped because it does not exist" << endl; - continue; - } - - size_t i = 1; - while (true) { - path trip_data_file = foil_dir / ("trip_data_" + std::to_string(i) + ".csv"); - if (!exists(trip_data_file)) { - break; - } - cerr << "Parsing " << trip_data_file << endl; - std::fstream trip_csv_file(trip_data_file.string()); - parse_csv_file(trip_csv_file); - - trip_csv_file.close(); - i += 1; - } - } +class taxi_fare_data_parser : public taxi_data_parser +{ +private: + enum TAXI_FARE_DATA_INDEX { + MEDALLION, + HACK_LICENSE, + VENDOR_ID, + PICKUP_DATETIME, + PAYMENT_TYPE, + FARE_AMOUNT, + SURCHARGE, + MTA_TAX, + TIP_AMOUNT, + TOLLS_AMOUNT, + TOTAL_AMOUNT + }; + boost::posix_time::ptime start_time; - return true; + void gen_data(vector &row, stream::taxi_fare *fare) override { + int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); + int64_t et = st + 1; + int32_t medallion = stoi(row[MEDALLION]); + int32_t hack_license = stoi(row[HACK_LICENSE]); + string vendor_id = row[VENDOR_ID]; + string payment_type = row[PAYMENT_TYPE]; + float fare_amount = this->stof_err_handle(row[FARE_AMOUNT]); + float surcharge = this->stof_err_handle(row[SURCHARGE]); + float mta_tax = this->stof_err_handle(row[MTA_TAX]); + float tip_amount = this->stof_err_handle(row[TIP_AMOUNT]); + float tolls_amount = this->stof_err_handle(row[TOLLS_AMOUNT]); + float total_amount = this->stof_err_handle(row[TOTAL_AMOUNT]); + + fare->set_st(st); + fare->set_et(et); + fare->set_medallion(medallion); + fare->set_hack_license(hack_license); + fare->set_vendor_id(vendor_id); + fare->set_payment_type(payment_type); + fare->set_fare_amount(fare_amount); + fare->set_surcharge(surcharge); + fare->set_mta_tax(mta_tax); + fare->set_tip_amount(tip_amount); + fare->set_tolls_amount(tolls_amount); + fare->set_total_amount(total_amount); } -}; -ostream& operator<< (ostream& out, stream::taxi_trip const& trip) -{ - out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; - out << "medallion: " << trip.medallion() << ", "; - out << "hack_license: " << trip.hack_license() << ", "; - out << "vendor_id: " << trip.vendor_id() << ", "; - out << "rate_code: " << trip.rate_code() << ", "; - out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; - out << "passenger_count: " << trip.passenger_count() << ", "; - out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; - out << "trip_distance: " << trip.trip_distance() << ", "; - out << "pickup_longitude: " << trip.pickup_longitude() << ", "; - out << "pickup_latitude: " << trip.pickup_latitude() << ", "; - out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; - out << "dropoff_latitude: " << trip.dropoff_latitude(); - return out; -} +public: + taxi_fare_data_parser(string &dataset_dir) : + taxi_data_parser(dataset_dir, "trip_fare_"), + start_time(boost::gregorian::date(1970, 1, 1)) + {} + ~taxi_fare_data_parser(){} +}; #endif // DATASET_LOADER_TAXI_DATA_LOADER_H_ \ No newline at end of file diff --git a/dataset_util/protos/taxi_trip.proto b/dataset_util/protos/taxi.proto similarity index 62% rename from dataset_util/protos/taxi_trip.proto rename to dataset_util/protos/taxi.proto index 6a6068c..ffd2ad7 100644 --- a/dataset_util/protos/taxi_trip.proto +++ b/dataset_util/protos/taxi.proto @@ -20,4 +20,19 @@ message taxi_trip { required float pickup_latitude = 12; required float dropoff_longitude = 13; required float dropoff_latitude = 14; +} + +message taxi_fare { + required int64 st = 1; + required int64 et = 2; + required int32 medallion = 3; + required int32 hack_license = 4; + required string vendor_id = 5; + required string payment_type = 6; + required float fare_amount = 7; + required float surcharge = 8; + required float mta_tax = 9; + required float tip_amount = 10; + required float tolls_amount = 11; + required float total_amount = 12; } \ No newline at end of file diff --git a/tilt_bench/CMakeLists.txt b/tilt_bench/CMakeLists.txt index 8bea447..67f30db 100644 --- a/tilt_bench/CMakeLists.txt +++ b/tilt_bench/CMakeLists.txt @@ -30,9 +30,9 @@ set(easyjit_lib "${CMAKE_BINARY_DIR}/tilt/third_party/easy_jit/bin/EasyJitPass.s add_subdirectory(tilt/tilt) -add_executable(main main.cpp ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) +add_executable(main main.cpp ${Protobuf_IMPORT_DIRS}/taxi.proto) target_link_libraries(main protobuf::libprotobuf tilt) set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET main LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi_trip.proto) \ No newline at end of file +protobuf_generate(TARGET main LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index 6d6727a..b4bc030 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -42,11 +42,11 @@ int main(int argc, char** argv) } } - data_loader loader; + data_loader loader; while (true) { - stream::taxi_trip trip; - loader.load_data(trip); - cout << trip << endl; + stream::taxi_fare fare; + loader.load_data(fare); + cout << fare << endl; } string testcase = (argc > 1) ? argv[1] : "select"; From 975ec9c4856319fa8cc3342b5fce89ed05b25cb3 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 22 Feb 2022 18:23:26 -0500 Subject: [PATCH 05/23] Add java decoder for taxi dataset --- dataset_util/include/taxi_data_parser.h | 5 ++--- dataset_util/java/TaxiFareLoader.java | 27 +++++++++++++++++++++++ dataset_util/java/TaxiTripLoader.java | 29 +++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 dataset_util/java/TaxiFareLoader.java create mode 100644 dataset_util/java/TaxiTripLoader.java diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 3882cb8..3478e1d 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -34,7 +34,7 @@ ostream& operator<< (ostream& out, stream::taxi_trip const& trip) ostream& operator<< (ostream& out, stream::taxi_fare const& fare) { - out << "taxi_trip[" << fare.st() << ", " << fare.et() << "]: "; + out << "taxi_fare[" << fare.st() << ", " << fare.et() << "]: "; out << "medallion: " << fare.medallion() << ", "; out << "hack_license: " << fare.hack_license() << ", "; out << "vendor_id: " << fare.vendor_id() << ", "; @@ -44,8 +44,7 @@ ostream& operator<< (ostream& out, stream::taxi_fare const& fare) out << "mta_tax: " << fare.mta_tax() << ", "; out << "tip_amount: " << fare.tip_amount() << ", "; out << "tolls_amount: " << fare.tolls_amount() << ", "; - out << "total_amount: " << fare.total_amount() << ", "; - + out << "total_amount: " << fare.total_amount(); return out; } diff --git a/dataset_util/java/TaxiFareLoader.java b/dataset_util/java/TaxiFareLoader.java new file mode 100644 index 0000000..5611f89 --- /dev/null +++ b/dataset_util/java/TaxiFareLoader.java @@ -0,0 +1,27 @@ +import com.stream.taxi.protos.taxi_fare; +import java.io.IOException; + +public class TaxiFareLoader { + + public static void printTaxiFare(taxi_fare fare) { + System.out.format( + "taxi_fare[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, payment_type: %s, fare_amount: %f, " + + "surcharge: %f, mta_tax: %f, tip_amount: %f, tolls_amount: %f, total_amount: %f%n", + fare.getSt(), fare.getEt(), fare.getMedallion(), fare.getHackLicense(), fare.getVendorId(), + fare.getPaymentType(), fare.getFareAmount(), fare.getSurcharge(), fare.getMtaTax(), + fare.getTipAmount(), fare.getTollsAmount(), fare.getTotalAmount() + ); + } + + public static void main(String[] args) { + try { + while (true) { + taxi_fare fare = taxi_fare.parseDelimitedFrom(System.in); + TaxiFareLoader.printTaxiFare(fare); + } + } + catch (IOException e) { + return; + } + } +} \ No newline at end of file diff --git a/dataset_util/java/TaxiTripLoader.java b/dataset_util/java/TaxiTripLoader.java new file mode 100644 index 0000000..6fc0d42 --- /dev/null +++ b/dataset_util/java/TaxiTripLoader.java @@ -0,0 +1,29 @@ +import com.stream.taxi.protos.taxi_trip; +import java.io.IOException; + +public class TaxiTripLoader { + + public static void printTaxiTrip(taxi_trip trip) { + System.out.format( + "taxi_trip[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, rate_code: %d, " + + "store_and_fwd_flag: %b, passenger_count: %d, trip_time_in_secs: %d, trip_distance: %f, " + + "pickup_longitude: %f, pickup_latitude: %f, dropoff_longitude: %f, dropoff_latitude: %f%n", + trip.getSt(), trip.getEt(), trip.getMedallion(), trip.getHackLicense(), trip.getVendorId(), + trip.getRateCode(), trip.getStoreAndFwdFlag(), trip.getPassengerCount(), trip.getTripTimeInSecs(), + trip.getTripDistance(), trip.getPickupLongitude(), trip.getPickupLatitude(), + trip.getDropoffLongitude(), trip.getDropoffLatitude() + ); + } + + public static void main(String[] args) { + try { + while (true) { + taxi_trip trip = taxi_trip.parseDelimitedFrom(System.in); + TaxiTripLoader.printTaxiTrip(trip); + } + } + catch (IOException e) { + return; + } + } +} \ No newline at end of file From 112a53e0581aeca97a535f845371aae46eb78045 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 22 Feb 2022 19:52:18 -0500 Subject: [PATCH 06/23] Add documentation for dataset parser/loader --- dataset_parser/CMakeLists.txt | 2 +- dataset_util/README.md | 45 +++++++++++++++++++++++++++++++++++ tilt_bench/CMakeLists.txt | 2 +- 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 dataset_util/README.md diff --git a/dataset_parser/CMakeLists.txt b/dataset_parser/CMakeLists.txt index f86b630..432f0b7 100644 --- a/dataset_parser/CMakeLists.txt +++ b/dataset_parser/CMakeLists.txt @@ -22,7 +22,7 @@ endif() include_directories(${PROJECT_ROOT_DIR}/dataset_util/include ${PROTOBUF_INCLUDE_DIRS} ${Boost_INCLUDE_DIR}) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) -add_executable(parser parser.cpp ${Protobuf_IMPORT_DIRS}/taxi.proto) +add_executable(parser parser.cpp) target_link_libraries(parser protobuf::libprotobuf ${Boost_LIBRARIES}) diff --git a/dataset_util/README.md b/dataset_util/README.md new file mode 100644 index 0000000..0e7bfbe --- /dev/null +++ b/dataset_util/README.md @@ -0,0 +1,45 @@ +#### Dependencies and usage: + +Install Boost Library: + +``` +git clone https://github.com/boostorg/boost.git && \ +cd boost && \ +git checkout boost-1.78.0 && \ +git submodule init && \ +git submodule update && \ +./bootstrap.sh --prefix=/usr/local && \ +./b2 install --prefix=/usr/local +``` + +Install Protobuf Library: + +``` +git clone https://github.com/protocolbuffers/protobuf.git && \ +cd protobuf && \ +git checkout v3.19.4 && \ +git submodule init && \ +git submodule update && \ +cd cmake && \ +mkdir build && \ +cd build && \ +cmake -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ +make -j$(nproc) && \ +make install +``` + +Install Protobuf Java Runtime (protobuf-java-3.19.4.jar): + +``` +cd protobuf/java && \ +mvn test && \ +mvn package +``` + +Load Dataset with Java: + +``` +protoc -I /path/to/protos --java_out=. data.proto +javac -cp /path/to/protobuf-java-3.19.4.jar:. DataLoader.java +./dataset-parser | java -cp /path/to/protobuf-java-3.19.4.jar:. DataLoader +``` diff --git a/tilt_bench/CMakeLists.txt b/tilt_bench/CMakeLists.txt index 67f30db..2562cca 100644 --- a/tilt_bench/CMakeLists.txt +++ b/tilt_bench/CMakeLists.txt @@ -30,7 +30,7 @@ set(easyjit_lib "${CMAKE_BINARY_DIR}/tilt/third_party/easy_jit/bin/EasyJitPass.s add_subdirectory(tilt/tilt) -add_executable(main main.cpp ${Protobuf_IMPORT_DIRS}/taxi.proto) +add_executable(main main.cpp) target_link_libraries(main protobuf::libprotobuf tilt) From c3341b130b70ab55bc364408c190fc479738940d Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 22 Feb 2022 20:11:13 -0500 Subject: [PATCH 07/23] Refactor dataset_util --- dataset_util/loader/cpp/CMakeLists.txt | 28 +++++++++++++++++++ dataset_util/loader/cpp/loader.cpp | 19 +++++++++++++ .../{ => loader}/java/TaxiFareLoader.java | 0 .../{ => loader}/java/TaxiTripLoader.java | 0 .../parser}/CMakeLists.txt | 7 ++--- .../parser}/parser.cpp | 0 6 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 dataset_util/loader/cpp/CMakeLists.txt create mode 100644 dataset_util/loader/cpp/loader.cpp rename dataset_util/{ => loader}/java/TaxiFareLoader.java (100%) rename dataset_util/{ => loader}/java/TaxiTripLoader.java (100%) rename {dataset_parser => dataset_util/parser}/CMakeLists.txt (76%) rename {dataset_parser => dataset_util/parser}/parser.cpp (100%) diff --git a/dataset_util/loader/cpp/CMakeLists.txt b/dataset_util/loader/cpp/CMakeLists.txt new file mode 100644 index 0000000..7fef69a --- /dev/null +++ b/dataset_util/loader/cpp/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.13.4) +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) + +project(dataset_loader) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(protobuf CONFIG REQUIRED) + +if(protobuf_VERBOSE) + message(STATUS "Using Protocol Buffers ${protobuf_VERSION}") +endif() + +get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +get_filename_component(DATASET_UTIL_DIR ${PARENT_DIR} DIRECTORY) +set(Protobuf_IMPORT_DIRS ${DATASET_UTIL_DIR}/protos) + +include_directories(${PROTOBUF_INCLUDE_DIRS} ${DATASET_UTIL_DIR}/include) +set(CMAKE_INCLUDE_CURRENT_DIR TRUE) + +add_executable(loader loader.cpp) +target_link_libraries(loader protobuf::libprotobuf) + +set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) +protobuf_generate(TARGET loader LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file diff --git a/dataset_util/loader/cpp/loader.cpp b/dataset_util/loader/cpp/loader.cpp new file mode 100644 index 0000000..63e00c6 --- /dev/null +++ b/dataset_util/loader/cpp/loader.cpp @@ -0,0 +1,19 @@ +#include + +#include +#include + +using namespace std; + +int main(int argc, char** argv) +{ + data_loader loader; + while (true) { + stream::taxi_fare fare; + loader.load_data(fare); + cout << fare << endl; + } + + google::protobuf::ShutdownProtobufLibrary(); + return 0; +} \ No newline at end of file diff --git a/dataset_util/java/TaxiFareLoader.java b/dataset_util/loader/java/TaxiFareLoader.java similarity index 100% rename from dataset_util/java/TaxiFareLoader.java rename to dataset_util/loader/java/TaxiFareLoader.java diff --git a/dataset_util/java/TaxiTripLoader.java b/dataset_util/loader/java/TaxiTripLoader.java similarity index 100% rename from dataset_util/java/TaxiTripLoader.java rename to dataset_util/loader/java/TaxiTripLoader.java diff --git a/dataset_parser/CMakeLists.txt b/dataset_util/parser/CMakeLists.txt similarity index 76% rename from dataset_parser/CMakeLists.txt rename to dataset_util/parser/CMakeLists.txt index 432f0b7..dc41ba4 100644 --- a/dataset_parser/CMakeLists.txt +++ b/dataset_util/parser/CMakeLists.txt @@ -9,8 +9,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -get_filename_component(PROJECT_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -set(Protobuf_IMPORT_DIRS ${PROJECT_ROOT_DIR}/dataset_util/protos) +get_filename_component(DATASET_UTIL_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +set(Protobuf_IMPORT_DIRS ${DATASET_UTIL_DIR}/protos) find_package(Boost 1.78.0 REQUIRED COMPONENTS date_time filesystem) find_package(protobuf CONFIG REQUIRED) @@ -19,11 +19,10 @@ if(protobuf_VERBOSE) message(STATUS "Using Protocol Buffers ${protobuf_VERSION}") endif() -include_directories(${PROJECT_ROOT_DIR}/dataset_util/include ${PROTOBUF_INCLUDE_DIRS} ${Boost_INCLUDE_DIR}) +include_directories(${DATASET_UTIL_DIR}/include ${PROTOBUF_INCLUDE_DIRS} ${Boost_INCLUDE_DIR}) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) add_executable(parser parser.cpp) - target_link_libraries(parser protobuf::libprotobuf ${Boost_LIBRARIES}) set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) diff --git a/dataset_parser/parser.cpp b/dataset_util/parser/parser.cpp similarity index 100% rename from dataset_parser/parser.cpp rename to dataset_util/parser/parser.cpp From 72efa6078d381a18fc8f32c7234e6b6336f78078 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 22 Feb 2022 20:28:11 -0500 Subject: [PATCH 08/23] Add protobuf submodule --- .gitmodules | 3 +++ dataset_util/protobuf | 1 + 2 files changed, 4 insertions(+) create mode 160000 dataset_util/protobuf diff --git a/.gitmodules b/.gitmodules index 78e8b19..f2e962a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "lightsaber_bench/LightSaber"] path = lightsaber_bench/LightSaber url = https://github.com/wzhao18/LightSaber.git +[submodule "dataset_util/protobuf"] + path = dataset_util/protobuf + url = https://github.com/protocolbuffers/protobuf diff --git a/dataset_util/protobuf b/dataset_util/protobuf new file mode 160000 index 0000000..22d0e26 --- /dev/null +++ b/dataset_util/protobuf @@ -0,0 +1 @@ +Subproject commit 22d0e265de7d2b3d2e9a00d071313502e7d4cccf From 1c19a5ede38d614b4c01a76717d26695e7f99cf6 Mon Sep 17 00:00:00 2001 From: wzhao18 <740286700@qq.com> Date: Wed, 23 Feb 2022 00:24:44 -0500 Subject: [PATCH 09/23] Add dataset loader in trill --- trill_bench/bench/Program.cs | 6 ++++++ trill_bench/bench/bench.csproj | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index d864efd..100cb9a 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -2,6 +2,8 @@ using System.Diagnostics; using System.Reactive.Linq; using Microsoft.StreamProcessing; +using Google.Protobuf; +using Stream; namespace bench { @@ -44,6 +46,10 @@ static double RunTest( static void Main(string[] args) { + + MessageParser parser = new MessageParser(() => new taxi_trip()); + taxi_trip trip = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + string testcase = (args.Length > 0) ? args[0] : "normalize"; long size = (args.Length > 1) ? long.Parse(args[1]) : 100000000; long period = 1; diff --git a/trill_bench/bench/bench.csproj b/trill_bench/bench/bench.csproj index f0fe78f..6b922e1 100644 --- a/trill_bench/bench/bench.csproj +++ b/trill_bench/bench/bench.csproj @@ -11,4 +11,8 @@ + + + + From 9ea6c9f4739d0921896ba9e4affcb305526efdaa Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Wed, 23 Feb 2022 02:20:25 -0500 Subject: [PATCH 10/23] Add loader example for csharp --- dataset_util/loader/csharp/loader.sln | 48 +++++++++++++++++++ dataset_util/loader/csharp/loader/Program.cs | 18 +++++++ .../loader/csharp/loader/loader.csproj | 12 +++++ trill_bench/bench/Program.cs | 6 +-- trill_bench/bench/bench.csproj | 5 +- trill_bench/trill_bench.sln | 14 ++++++ 6 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 dataset_util/loader/csharp/loader.sln create mode 100644 dataset_util/loader/csharp/loader/Program.cs create mode 100644 dataset_util/loader/csharp/loader/loader.csproj diff --git a/dataset_util/loader/csharp/loader.sln b/dataset_util/loader/csharp/loader.sln new file mode 100644 index 0000000..1bf206c --- /dev/null +++ b/dataset_util/loader/csharp/loader.sln @@ -0,0 +1,48 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "loader", "loader\loader.csproj", "{7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Google.Protobuf", "..\..\protobuf\csharp\src\Google.Protobuf\Google.Protobuf.csproj", "{E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x64.ActiveCfg = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x64.Build.0 = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x86.ActiveCfg = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x86.Build.0 = Debug|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|Any CPU.Build.0 = Release|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x64.ActiveCfg = Release|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x64.Build.0 = Release|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x86.ActiveCfg = Release|Any CPU + {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x86.Build.0 = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x64.ActiveCfg = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x64.Build.0 = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x86.ActiveCfg = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x86.Build.0 = Debug|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|Any CPU.Build.0 = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x64.ActiveCfg = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x64.Build.0 = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x86.ActiveCfg = Release|Any CPU + {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x86.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/dataset_util/loader/csharp/loader/Program.cs b/dataset_util/loader/csharp/loader/Program.cs new file mode 100644 index 0000000..5514ec2 --- /dev/null +++ b/dataset_util/loader/csharp/loader/Program.cs @@ -0,0 +1,18 @@ +using System; +using Stream; +using Google.Protobuf; + +namespace loader +{ + class Program + { + static void Main(string[] args) + { + MessageParser parser = new MessageParser(() => new taxi_fare()); + while (true) { + taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + Console.WriteLine(fare); + } + } + } +} diff --git a/dataset_util/loader/csharp/loader/loader.csproj b/dataset_util/loader/csharp/loader/loader.csproj new file mode 100644 index 0000000..9fcbaf6 --- /dev/null +++ b/dataset_util/loader/csharp/loader/loader.csproj @@ -0,0 +1,12 @@ + + + + + + + + Exe + netcoreapp3.1 + + + diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index 100cb9a..ed0148d 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -46,9 +46,9 @@ static double RunTest( static void Main(string[] args) { - - MessageParser parser = new MessageParser(() => new taxi_trip()); - taxi_trip trip = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + MessageParser parser = new MessageParser(() => new taxi_fare()); + taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + Console.WriteLine(fare.Surcharge); string testcase = (args.Length > 0) ? args[0] : "normalize"; long size = (args.Length > 1) ? long.Parse(args[1]) : 100000000; diff --git a/trill_bench/bench/bench.csproj b/trill_bench/bench/bench.csproj index 6b922e1..86ae4cf 100644 --- a/trill_bench/bench/bench.csproj +++ b/trill_bench/bench/bench.csproj @@ -9,10 +9,7 @@ - - - - + diff --git a/trill_bench/trill_bench.sln b/trill_bench/trill_bench.sln index e6e0e88..dd7c627 100644 --- a/trill_bench/trill_bench.sln +++ b/trill_bench/trill_bench.sln @@ -11,6 +11,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "bench", "bench\bench.csproj EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "System.Reactive", "reactive\Rx.NET\Source\src\System.Reactive\System.Reactive.csproj", "{BA73F0BE-62D0-4763-9A00-60BBCEA9A1DD}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Google.Protobuf", "..\dataset_util\protobuf\csharp\src\Google.Protobuf\Google.Protobuf.csproj", "{A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -72,5 +74,17 @@ Global {BA73F0BE-62D0-4763-9A00-60BBCEA9A1DD}.Release|x64.Build.0 = Release|Any CPU {BA73F0BE-62D0-4763-9A00-60BBCEA9A1DD}.Release|x86.ActiveCfg = Release|Any CPU {BA73F0BE-62D0-4763-9A00-60BBCEA9A1DD}.Release|x86.Build.0 = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|x64.ActiveCfg = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|x64.Build.0 = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|x86.ActiveCfg = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Debug|x86.Build.0 = Debug|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|Any CPU.Build.0 = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|x64.ActiveCfg = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|x64.Build.0 = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|x86.ActiveCfg = Release|Any CPU + {A14A7166-9BB0-4290-B21A-8C0FE54EAFAF}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection EndGlobal From 2fc8253982bb2fa81aeefbfd2bfba458a5dade48 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Wed, 23 Feb 2022 02:27:44 -0500 Subject: [PATCH 11/23] Add documentation for loading dataset with c++ and c# --- dataset_util/README.md | 11 +++++++++++ trill_bench/bench/Program.cs | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dataset_util/README.md b/dataset_util/README.md index 0e7bfbe..d306489 100644 --- a/dataset_util/README.md +++ b/dataset_util/README.md @@ -36,6 +36,10 @@ mvn test && \ mvn package ``` +Load Dataset with C++: + +Build with Cmake directly. Protobuf files will be generated by cmake function `protobuf_generate` + Load Dataset with Java: ``` @@ -43,3 +47,10 @@ protoc -I /path/to/protos --java_out=. data.proto javac -cp /path/to/protobuf-java-3.19.4.jar:. DataLoader.java ./dataset-parser | java -cp /path/to/protobuf-java-3.19.4.jar:. DataLoader ``` + +Load Dataset with C#: + +``` +protoc -I /path/to/protos --csharp_out=. data.proto +./dataset-parser | dotnet run +``` diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index ed0148d..acf1ec4 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -47,8 +47,10 @@ static double RunTest( static void Main(string[] args) { MessageParser parser = new MessageParser(() => new taxi_fare()); - taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - Console.WriteLine(fare.Surcharge); + while (true) { + taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + Console.WriteLine(fare); + } string testcase = (args.Length > 0) ? args[0] : "normalize"; long size = (args.Length > 1) ? long.Parse(args[1]) : 100000000; From 6f230d7add5bb79d8e927c093fcc9844f1d52197 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Wed, 23 Feb 2022 19:10:46 -0500 Subject: [PATCH 12/23] load dataset by command line arguments --- dataset_util/loader/cpp/loader.cpp | 31 +++++++++++++--- dataset_util/loader/csharp/loader/Program.cs | 39 ++++++++++++++++++-- 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/dataset_util/loader/cpp/loader.cpp b/dataset_util/loader/cpp/loader.cpp index 63e00c6..e82faa0 100644 --- a/dataset_util/loader/cpp/loader.cpp +++ b/dataset_util/loader/cpp/loader.cpp @@ -1,17 +1,38 @@ #include +#include #include #include using namespace std; -int main(int argc, char** argv) +template +void print_data(data_loader &loader) { - data_loader loader; while (true) { - stream::taxi_fare fare; - loader.load_data(fare); - cout << fare << endl; + T t; + if (!loader.load_data(t)) { + break; + } + cout << t << endl; + } +} + +int main(int argc, char** argv) +{ + GOOGLE_PROTOBUF_VERIFY_VERSION; + + string dataset_name = "taxi_fare"; + if (argc > 1) { + dataset_name = argv[1]; + } + + if (dataset_name == "taxi_fare") { + data_loader loader; + print_data(loader); + } else if (dataset_name == "taxi_trip") { + data_loader loader; + print_data(loader); } google::protobuf::ShutdownProtobufLibrary(); diff --git a/dataset_util/loader/csharp/loader/Program.cs b/dataset_util/loader/csharp/loader/Program.cs index 5514ec2..a423a5c 100644 --- a/dataset_util/loader/csharp/loader/Program.cs +++ b/dataset_util/loader/csharp/loader/Program.cs @@ -4,14 +4,45 @@ namespace loader { + public class DataLoader where T : IMessage, new() + { + private MessageParser MsgParser = new MessageParser(() => new T()); + + public T LoadData() + { + return MsgParser.ParseDelimitedFrom(Console.OpenStandardInput()); + } + } + class Program { + static void PrintData(DataLoader data_loader) where T : IMessage, new() + { + try { + while (true) { + T t = data_loader.LoadData(); + Console.WriteLine(t); + } + } catch (Exception e) { + return; + } + } + static void Main(string[] args) { - MessageParser parser = new MessageParser(() => new taxi_fare()); - while (true) { - taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - Console.WriteLine(fare); + string dataset = "taxi_fare"; + if (args.Length > 0) { + dataset = args[0]; + } + + if (dataset == "taxi_fare") { + DataLoader data_loader = new DataLoader(); + PrintData(data_loader); + } else if (dataset == "taxi_trip") { + DataLoader data_loader = new DataLoader(); + PrintData(data_loader); + } else { + throw new Exception("Unknown dataset"); } } } From 90f1e6a96a4b88b9aa38ccdf6539ddd19c235540 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Mon, 28 Feb 2022 01:10:57 -0500 Subject: [PATCH 13/23] Add parsing for vibration data --- dataset_util/include/vibration_data_parser.h | 105 +++++++++++++++++++ dataset_util/loader/cpp/CMakeLists.txt | 6 +- dataset_util/loader/cpp/loader.cpp | 6 ++ dataset_util/parser/CMakeLists.txt | 6 +- dataset_util/parser/parser.cpp | 8 +- dataset_util/protos/vibration.proto | 13 +++ 6 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 dataset_util/include/vibration_data_parser.h create mode 100644 dataset_util/protos/vibration.proto diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h new file mode 100644 index 0000000..c251706 --- /dev/null +++ b/dataset_util/include/vibration_data_parser.h @@ -0,0 +1,105 @@ +#ifndef DATASET_LOADER_VIBRATION_DATA_LOADER_H_ +#define DATASET_LOADER_VIBRATION_DATA_LOADER_H_ + +#include +#include +#include + +#include +#include + +#include + +#include + +using namespace std; +using namespace boost::filesystem; + +ostream& operator<< (ostream& out, stream::vibration const& vibration) +{ + out << "vibration[" << vibration.st() << ", " << vibration.et() << "]: "; + out << "channel_1: " << vibration.channel_1() << ", "; + out << "channel_2: " << vibration.channel_2(); + return out; +} + +class vibration_data_parser : public data_parser +{ +private: + enum VIBRATION_DATA_INDEX { + TIMESTAMP, + CHANNEL_1, + CHANNEL_2 + }; +protected: + string &dataset_dir; + const map folder_prefix_map = { + {"1 Data collected from a healthy bearing", 'H'}, + {"2 Data collected from a bearing with inner race fault", 'I'}, + {"3 Data collected from a bearing with outer race fault", 'O'}, + {"4 Data collected from a bearing with ball fault", 'B'}, + {"5 Data collected from a bearing with a combination of faults", 'C'} + }; + const vector file_suffices = { + "-A-1.csv", "-A-2.csv", "-A-3.csv", + "-B-1.csv", "-B-2.csv", "-B-3.csv", + "-C-1.csv", "-C-2.csv", "-C-3.csv", + "-D-1.csv", "-D-2.csv", "-D-3.csv" + }; + +public: + vibration_data_parser(string &dataset_dir) : + dataset_dir(dataset_dir) + {} + ~vibration_data_parser(){} + + void gen_data(vector &row, stream::vibration *vibration) override { + int64_t st = stoi(row[TIMESTAMP]); + int64_t et = st + 1; + float channel_1 = this->stof_err_handle(row[CHANNEL_1]); + float channel_2 = this->stof_err_handle(row[CHANNEL_2]); + + vibration->set_st(st); + vibration->set_et(et); + vibration->set_channel_1(channel_1); + vibration->set_channel_2(channel_2); + } + + bool parse() override { + const path data_dir(dataset_dir); + if (!is_directory(data_dir)) { + cerr << "Directory " << dataset_dir << " does not exist." << endl; + return false; + } + + for (auto &pair : folder_prefix_map) { + auto folder_name = pair.first; + auto file_prefix = pair.second; + + path folder_dir = data_dir / folder_name; + if (!is_directory(folder_dir)) { + cerr << "Directory " << folder_dir << " is skipped because it does not exist" << endl; + continue; + } + + for (auto &file_suffix : file_suffices) { + path data_file = folder_dir / (file_prefix + file_suffix); + if (!exists(data_file)) { + cerr << "File " << data_file << " is skipped because it does not exist" << endl; + continue; + } + + cerr << "Parsing " << data_file << endl; + std::fstream data_csv_file(data_file.string()); + this->parse_csv_file(data_csv_file); + + data_csv_file.close(); + } + + } + + return true; + } +}; + +#endif // DATASET_LOADER_VIBRATION_DATA_LOADER_H_ \ No newline at end of file diff --git a/dataset_util/loader/cpp/CMakeLists.txt b/dataset_util/loader/cpp/CMakeLists.txt index 7fef69a..72a6193 100644 --- a/dataset_util/loader/cpp/CMakeLists.txt +++ b/dataset_util/loader/cpp/CMakeLists.txt @@ -24,5 +24,9 @@ set(CMAKE_INCLUDE_CURRENT_DIR TRUE) add_executable(loader loader.cpp) target_link_libraries(loader protobuf::libprotobuf) +file(GLOB PROTO_FILES + "${Protobuf_IMPORT_DIRS}/*.proto" +) + set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET loader LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file +protobuf_generate(TARGET loader LANGUAGE cpp PROTOS ${PROTO_FILES}) \ No newline at end of file diff --git a/dataset_util/loader/cpp/loader.cpp b/dataset_util/loader/cpp/loader.cpp index e82faa0..75a37ab 100644 --- a/dataset_util/loader/cpp/loader.cpp +++ b/dataset_util/loader/cpp/loader.cpp @@ -3,6 +3,7 @@ #include #include +#include using namespace std; @@ -33,6 +34,11 @@ int main(int argc, char** argv) } else if (dataset_name == "taxi_trip") { data_loader loader; print_data(loader); + } else if (dataset_name == "vibration") { + data_loader loader; + print_data(loader); + } else { + throw runtime_error("Unknown dataset name."); } google::protobuf::ShutdownProtobufLibrary(); diff --git a/dataset_util/parser/CMakeLists.txt b/dataset_util/parser/CMakeLists.txt index dc41ba4..d1aa922 100644 --- a/dataset_util/parser/CMakeLists.txt +++ b/dataset_util/parser/CMakeLists.txt @@ -25,5 +25,9 @@ set(CMAKE_INCLUDE_CURRENT_DIR TRUE) add_executable(parser parser.cpp) target_link_libraries(parser protobuf::libprotobuf ${Boost_LIBRARIES}) +file(GLOB PROTO_FILES + "${Protobuf_IMPORT_DIRS}/*.proto" +) + set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET parser LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file +protobuf_generate(TARGET parser LANGUAGE cpp PROTOS ${PROTO_FILES}) \ No newline at end of file diff --git a/dataset_util/parser/parser.cpp b/dataset_util/parser/parser.cpp index 4cafb8d..29bf181 100644 --- a/dataset_util/parser/parser.cpp +++ b/dataset_util/parser/parser.cpp @@ -1,8 +1,7 @@ #include -#include "taxi.pb.h" - #include +#include using namespace std; @@ -21,8 +20,11 @@ int main(int argc, char* argv[]) { } else if (dataset_name == "taxi_fare") { taxi_fare_data_parser parser(dataset_dir); parser.parse(); + } else if (dataset_name == "vibration") { + vibration_data_parser parser(dataset_dir); + parser.parse(); } else { - throw std::runtime_error("Unknown dataset name."); + throw runtime_error("Unknown dataset name."); } google::protobuf::ShutdownProtobufLibrary(); diff --git a/dataset_util/protos/vibration.proto b/dataset_util/protos/vibration.proto new file mode 100644 index 0000000..abd83c5 --- /dev/null +++ b/dataset_util/protos/vibration.proto @@ -0,0 +1,13 @@ +syntax = "proto2"; +package stream; + +option java_multiple_files = true; +option java_package = "com.stream.vibration.protos"; +option java_outer_classname = "VibrationProtos"; + +message vibration { + required int64 st = 1; + required int64 et = 2; + required float channel_1 = 3; + required float channel_2 = 4; +} \ No newline at end of file From 1b4e45efea1a23bc59f2d239631240bc439624e3 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Mon, 28 Feb 2022 13:32:39 -0500 Subject: [PATCH 14/23] Fix header file definitions --- dataset_util/include/data_loader.h | 6 +++--- dataset_util/include/data_parser.h | 6 +++--- dataset_util/include/taxi_data_parser.h | 6 +++--- dataset_util/include/vibration_data_parser.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dataset_util/include/data_loader.h b/dataset_util/include/data_loader.h index 892d894..00e568d 100644 --- a/dataset_util/include/data_loader.h +++ b/dataset_util/include/data_loader.h @@ -1,5 +1,5 @@ -#ifndef DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ -#define DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ +#ifndef DATASET_UTIL_DATA_LOADER_H_ +#define DATASET_UTIL_DATA_LOADER_H_ #include @@ -34,4 +34,4 @@ class data_loader } }; -#endif // DATASET_STREAMER_DATA_RECV_ISTREAM_DATA_RECV_H_ \ No newline at end of file +#endif // DATASET_UTIL_DATA_LOADER_H_ \ No newline at end of file diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index 5fcbf6f..99babc7 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -1,5 +1,5 @@ -#ifndef DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ -#define DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ +#ifndef DATASET_UTIL_DATA_PARSER_H_ +#define DATASET_UTIL_DATA_PARSER_H_ #include @@ -77,4 +77,4 @@ class data_parser data_parser(){} }; -#endif // DATASET_STREAMER_DATA_GEN_DATA_PARSER_H_ +#endif // DATASET_UTIL_DATA_PARSER_H_ diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 3478e1d..c9c1f1d 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -1,5 +1,5 @@ -#ifndef DATASET_LOADER_TAXI_DATA_LOADER_H_ -#define DATASET_LOADER_TAXI_DATA_LOADER_H_ +#ifndef DATASET_UTIL_TAXI_DATA_PARSER_H_ +#define DATASET_UTIL_TAXI_DATA_PARSER_H_ #include #include @@ -212,4 +212,4 @@ class taxi_fare_data_parser : public taxi_data_parser ~taxi_fare_data_parser(){} }; -#endif // DATASET_LOADER_TAXI_DATA_LOADER_H_ \ No newline at end of file +#endif // DATASET_UTIL_TAXI_DATA_PARSER_H_ \ No newline at end of file diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h index c251706..74ba582 100644 --- a/dataset_util/include/vibration_data_parser.h +++ b/dataset_util/include/vibration_data_parser.h @@ -1,5 +1,5 @@ -#ifndef DATASET_LOADER_VIBRATION_DATA_LOADER_H_ -#define DATASET_LOADER_VIBRATION_DATA_LOADER_H_ +#ifndef DATASET_UTIL_VIBRATION_DATA_PARSER_H_ +#define DATASET_UTIL_VIBRATION_DATA_PARSER_H_ #include #include @@ -102,4 +102,4 @@ class vibration_data_parser : public data_parser } }; -#endif // DATASET_LOADER_VIBRATION_DATA_LOADER_H_ \ No newline at end of file +#endif // DATASET_UTIL_VIBRATION_DATA_PARSER_H_ \ No newline at end of file From c1161abda5be4bea9d0e28a6430df9de677ee140 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 15 Mar 2022 00:13:00 -0400 Subject: [PATCH 15/23] Separate payload from message --- dataset_util/include/taxi_data_parser.h | 88 ++++++++++---------- dataset_util/include/vibration_data_parser.h | 8 +- dataset_util/loader/java/TaxiFareLoader.java | 8 +- dataset_util/loader/java/TaxiTripLoader.java | 10 ++- dataset_util/protos/taxi.proto | 62 ++++++++------ dataset_util/protos/vibration.proto | 14 ++-- 6 files changed, 103 insertions(+), 87 deletions(-) diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index c9c1f1d..5e0bcb7 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -17,34 +17,34 @@ using namespace boost::filesystem; ostream& operator<< (ostream& out, stream::taxi_trip const& trip) { out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; - out << "medallion: " << trip.medallion() << ", "; - out << "hack_license: " << trip.hack_license() << ", "; - out << "vendor_id: " << trip.vendor_id() << ", "; - out << "rate_code: " << trip.rate_code() << ", "; - out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; - out << "passenger_count: " << trip.passenger_count() << ", "; - out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; - out << "trip_distance: " << trip.trip_distance() << ", "; - out << "pickup_longitude: " << trip.pickup_longitude() << ", "; - out << "pickup_latitude: " << trip.pickup_latitude() << ", "; - out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; - out << "dropoff_latitude: " << trip.dropoff_latitude(); + out << "medallion: " << trip.payload().medallion() << ", "; + out << "hack_license: " << trip.payload().hack_license() << ", "; + out << "vendor_id: " << trip.payload().vendor_id() << ", "; + out << "rate_code: " << trip.payload().rate_code() << ", "; + out << "store_and_fwd_flag: " << trip.payload().store_and_fwd_flag() << ", "; + out << "passenger_count: " << trip.payload().passenger_count() << ", "; + out << "trip_time_in_secs: " << trip.payload().trip_time_in_secs() << ", "; + out << "trip_distance: " << trip.payload().trip_distance() << ", "; + out << "pickup_longitude: " << trip.payload().pickup_longitude() << ", "; + out << "pickup_latitude: " << trip.payload().pickup_latitude() << ", "; + out << "dropoff_longitude: " << trip.payload().dropoff_longitude() << ", "; + out << "dropoff_latitude: " << trip.payload().dropoff_latitude(); return out; } ostream& operator<< (ostream& out, stream::taxi_fare const& fare) { out << "taxi_fare[" << fare.st() << ", " << fare.et() << "]: "; - out << "medallion: " << fare.medallion() << ", "; - out << "hack_license: " << fare.hack_license() << ", "; - out << "vendor_id: " << fare.vendor_id() << ", "; - out << "payment_type: " << fare.payment_type() << ", "; - out << "fare_amount: " << fare.fare_amount() << ", "; - out << "surcharge: " << fare.surcharge() << ", "; - out << "mta_tax: " << fare.mta_tax() << ", "; - out << "tip_amount: " << fare.tip_amount() << ", "; - out << "tolls_amount: " << fare.tolls_amount() << ", "; - out << "total_amount: " << fare.total_amount(); + out << "medallion: " << fare.payload().medallion() << ", "; + out << "hack_license: " << fare.payload().hack_license() << ", "; + out << "vendor_id: " << fare.payload().vendor_id() << ", "; + out << "payment_type: " << fare.payload().payment_type() << ", "; + out << "fare_amount: " << fare.payload().fare_amount() << ", "; + out << "surcharge: " << fare.payload().surcharge() << ", "; + out << "mta_tax: " << fare.payload().mta_tax() << ", "; + out << "tip_amount: " << fare.payload().tip_amount() << ", "; + out << "tolls_amount: " << fare.payload().tolls_amount() << ", "; + out << "total_amount: " << fare.payload().total_amount(); return out; } @@ -136,18 +136,18 @@ class taxi_trip_data_parser : public taxi_data_parser trip->set_st(st); trip->set_et(et); - trip->set_medallion(medallion); - trip->set_hack_license(hack_license); - trip->set_vendor_id(vendor_id); - trip->set_rate_code(rate_code); - trip->set_store_and_fwd_flag(store_and_fwd_flag); - trip->set_passenger_count(passenger_count); - trip->set_trip_time_in_secs(trip_time_in_secs); - trip->set_trip_distance(trip_distance); - trip->set_dropoff_longitude(dropoff_longitude); - trip->set_pickup_latitude(pickup_latitude); - trip->set_pickup_longitude(pickup_longitude); - trip->set_dropoff_latitude(dropoff_latitude); + trip->mutable_payload()->set_medallion(medallion); + trip->mutable_payload()->set_hack_license(hack_license); + trip->mutable_payload()->set_vendor_id(vendor_id); + trip->mutable_payload()->set_rate_code(rate_code); + trip->mutable_payload()->set_store_and_fwd_flag(store_and_fwd_flag); + trip->mutable_payload()->set_passenger_count(passenger_count); + trip->mutable_payload()->set_trip_time_in_secs(trip_time_in_secs); + trip->mutable_payload()->set_trip_distance(trip_distance); + trip->mutable_payload()->set_dropoff_longitude(dropoff_longitude); + trip->mutable_payload()->set_pickup_latitude(pickup_latitude); + trip->mutable_payload()->set_pickup_longitude(pickup_longitude); + trip->mutable_payload()->set_dropoff_latitude(dropoff_latitude); } public: @@ -192,16 +192,16 @@ class taxi_fare_data_parser : public taxi_data_parser fare->set_st(st); fare->set_et(et); - fare->set_medallion(medallion); - fare->set_hack_license(hack_license); - fare->set_vendor_id(vendor_id); - fare->set_payment_type(payment_type); - fare->set_fare_amount(fare_amount); - fare->set_surcharge(surcharge); - fare->set_mta_tax(mta_tax); - fare->set_tip_amount(tip_amount); - fare->set_tolls_amount(tolls_amount); - fare->set_total_amount(total_amount); + fare->mutable_payload()->set_medallion(medallion); + fare->mutable_payload()->set_hack_license(hack_license); + fare->mutable_payload()->set_vendor_id(vendor_id); + fare->mutable_payload()->set_payment_type(payment_type); + fare->mutable_payload()->set_fare_amount(fare_amount); + fare->mutable_payload()->set_surcharge(surcharge); + fare->mutable_payload()->set_mta_tax(mta_tax); + fare->mutable_payload()->set_tip_amount(tip_amount); + fare->mutable_payload()->set_tolls_amount(tolls_amount); + fare->mutable_payload()->set_total_amount(total_amount); } public: diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h index 74ba582..0e28e88 100644 --- a/dataset_util/include/vibration_data_parser.h +++ b/dataset_util/include/vibration_data_parser.h @@ -18,8 +18,8 @@ using namespace boost::filesystem; ostream& operator<< (ostream& out, stream::vibration const& vibration) { out << "vibration[" << vibration.st() << ", " << vibration.et() << "]: "; - out << "channel_1: " << vibration.channel_1() << ", "; - out << "channel_2: " << vibration.channel_2(); + out << "channel_1: " << vibration.payload().channel_1() << ", "; + out << "channel_2: " << vibration.payload().channel_2(); return out; } @@ -61,8 +61,8 @@ class vibration_data_parser : public data_parser vibration->set_st(st); vibration->set_et(et); - vibration->set_channel_1(channel_1); - vibration->set_channel_2(channel_2); + vibration->mutable_payload()->set_channel_1(channel_1); + vibration->mutable_payload()->set_channel_2(channel_2); } bool parse() override { diff --git a/dataset_util/loader/java/TaxiFareLoader.java b/dataset_util/loader/java/TaxiFareLoader.java index 5611f89..fa4cb33 100644 --- a/dataset_util/loader/java/TaxiFareLoader.java +++ b/dataset_util/loader/java/TaxiFareLoader.java @@ -1,15 +1,17 @@ import com.stream.taxi.protos.taxi_fare; +import com.stream.taxi.protos.taxi_fare_payload; import java.io.IOException; public class TaxiFareLoader { public static void printTaxiFare(taxi_fare fare) { + taxi_fare_payload payload = fare.getPayload(); System.out.format( "taxi_fare[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, payment_type: %s, fare_amount: %f, " + "surcharge: %f, mta_tax: %f, tip_amount: %f, tolls_amount: %f, total_amount: %f%n", - fare.getSt(), fare.getEt(), fare.getMedallion(), fare.getHackLicense(), fare.getVendorId(), - fare.getPaymentType(), fare.getFareAmount(), fare.getSurcharge(), fare.getMtaTax(), - fare.getTipAmount(), fare.getTollsAmount(), fare.getTotalAmount() + fare.getSt(), fare.getEt(), payload.getMedallion(), payload.getHackLicense(), payload.getVendorId(), + payload.getPaymentType(), payload.getFareAmount(), payload.getSurcharge(), payload.getMtaTax(), + payload.getTipAmount(), payload.getTollsAmount(), payload.getTotalAmount() ); } diff --git a/dataset_util/loader/java/TaxiTripLoader.java b/dataset_util/loader/java/TaxiTripLoader.java index 6fc0d42..55a7bc4 100644 --- a/dataset_util/loader/java/TaxiTripLoader.java +++ b/dataset_util/loader/java/TaxiTripLoader.java @@ -1,17 +1,19 @@ import com.stream.taxi.protos.taxi_trip; +import com.stream.taxi.protos.taxi_trip_payload; import java.io.IOException; public class TaxiTripLoader { public static void printTaxiTrip(taxi_trip trip) { + taxi_trip_payload payload = trip.getPayload(); System.out.format( "taxi_trip[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, rate_code: %d, " + "store_and_fwd_flag: %b, passenger_count: %d, trip_time_in_secs: %d, trip_distance: %f, " + "pickup_longitude: %f, pickup_latitude: %f, dropoff_longitude: %f, dropoff_latitude: %f%n", - trip.getSt(), trip.getEt(), trip.getMedallion(), trip.getHackLicense(), trip.getVendorId(), - trip.getRateCode(), trip.getStoreAndFwdFlag(), trip.getPassengerCount(), trip.getTripTimeInSecs(), - trip.getTripDistance(), trip.getPickupLongitude(), trip.getPickupLatitude(), - trip.getDropoffLongitude(), trip.getDropoffLatitude() + trip.getSt(), trip.getEt(), payload.getMedallion(), payload.getHackLicense(), payload.getVendorId(), + payload.getRateCode(), payload.getStoreAndFwdFlag(), payload.getPassengerCount(), + payload.getTripTimeInSecs(), payload.getTripDistance(), payload.getPickupLongitude(), + payload.getPickupLatitude(), payload.getDropoffLongitude(), payload.getDropoffLatitude() ); } diff --git a/dataset_util/protos/taxi.proto b/dataset_util/protos/taxi.proto index ffd2ad7..69a9501 100644 --- a/dataset_util/protos/taxi.proto +++ b/dataset_util/protos/taxi.proto @@ -1,4 +1,4 @@ -syntax = "proto2"; +syntax = "proto3"; package stream; option java_multiple_files = true; @@ -6,33 +6,41 @@ option java_package = "com.stream.taxi.protos"; option java_outer_classname = "TaxiProtos"; message taxi_trip { - required int64 st = 1; - required int64 et = 2; - required int32 medallion = 3; - required int32 hack_license = 4; - required string vendor_id = 5; - required int32 rate_code = 6; - required bool store_and_fwd_flag = 7; - required int32 passenger_count = 8; - required int32 trip_time_in_secs = 9; - required float trip_distance = 10; - required float pickup_longitude = 11; - required float pickup_latitude = 12; - required float dropoff_longitude = 13; - required float dropoff_latitude = 14; + int64 st = 1; + int64 et = 2; + taxi_trip_payload payload = 3; +} + +message taxi_trip_payload { + int32 medallion = 1; + int32 hack_license = 2; + string vendor_id = 3; + int32 rate_code = 4; + bool store_and_fwd_flag = 5; + int32 passenger_count = 6; + int32 trip_time_in_secs = 7; + float trip_distance = 8; + float pickup_longitude = 9; + float pickup_latitude = 10; + float dropoff_longitude = 11; + float dropoff_latitude = 12; } message taxi_fare { - required int64 st = 1; - required int64 et = 2; - required int32 medallion = 3; - required int32 hack_license = 4; - required string vendor_id = 5; - required string payment_type = 6; - required float fare_amount = 7; - required float surcharge = 8; - required float mta_tax = 9; - required float tip_amount = 10; - required float tolls_amount = 11; - required float total_amount = 12; + int64 st = 1; + int64 et = 2; + taxi_fare_payload payload = 3; +} + +message taxi_fare_payload { + int32 medallion = 1; + int32 hack_license = 2; + string vendor_id = 3; + string payment_type = 4; + float fare_amount = 5; + float surcharge = 6; + float mta_tax = 7; + float tip_amount = 8; + float tolls_amount = 9; + float total_amount = 10; } \ No newline at end of file diff --git a/dataset_util/protos/vibration.proto b/dataset_util/protos/vibration.proto index abd83c5..d137773 100644 --- a/dataset_util/protos/vibration.proto +++ b/dataset_util/protos/vibration.proto @@ -1,4 +1,4 @@ -syntax = "proto2"; +syntax = "proto3"; package stream; option java_multiple_files = true; @@ -6,8 +6,12 @@ option java_package = "com.stream.vibration.protos"; option java_outer_classname = "VibrationProtos"; message vibration { - required int64 st = 1; - required int64 et = 2; - required float channel_1 = 3; - required float channel_2 = 4; + int64 st = 1; + int64 et = 2; + vibration_payload payload = 3; +} + +message vibration_payload { + float channel_1 = 1; + float channel_2 = 2; } \ No newline at end of file From aff95c2409604b0e21e390c60d54f0923b5b3035 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 15 Mar 2022 00:58:55 -0400 Subject: [PATCH 16/23] Clean up Main for Trill benchmark --- trill_bench/bench/BenchUtil.cs | 68 +++++++++++++++++++++ trill_bench/bench/Program.cs | 105 ++++++--------------------------- 2 files changed, 86 insertions(+), 87 deletions(-) create mode 100644 trill_bench/bench/BenchUtil.cs diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs new file mode 100644 index 0000000..b921f2b --- /dev/null +++ b/trill_bench/bench/BenchUtil.cs @@ -0,0 +1,68 @@ +using System; +using System.Diagnostics; +using System.Reactive.Linq; +using Microsoft.StreamProcessing; +using Google.Protobuf; +using Stream; + +namespace bench +{ + public class BenchUtil + { + public static double RunTest(Func> data, + Func, IStreamable> transform) + { + var stream = data(); + + var sw = new Stopwatch(); + sw.Start(); + var s_obs = transform(stream); + + s_obs + .ToStreamEventObservable() + .Wait(); + sw.Stop(); + return sw.Elapsed.TotalSeconds; + } + + public static double RunTest( + Func> data1, + Func> data2, + Func, IStreamable, IStreamable> transform) + { + var stream = data1(); + var stream2 = data2(); + + var sw = new Stopwatch(); + sw.Start(); + var s_obs = transform(stream,stream2); + + s_obs + .ToStreamEventObservable() + .Wait(); + sw.Stop(); + return sw.Elapsed.TotalSeconds; + } + + public static Func> DataFn(long p, long s) + { + return () => new TestObs(p, s) + .ToStreamable() + .Cache(); + } + + public static Func> TaxiFareDataFn(long p, long s) + { + return () => new TaxiFareData(p, s) + .ToStreamable() + .Cache(); + } + + public static Func> TaxiRideDataFn(long p, long s) + { + return () => new TaxiRideData(p, s) + .ToStreamable() + .Cache(); + } + } +} diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index acf1ec4..b5b8fad 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -9,117 +9,48 @@ namespace bench { class Program { - static double RunTest(Func> data, - Func, IStreamable> transform) - { - var stream = data(); - - var sw = new Stopwatch(); - sw.Start(); - var s_obs = transform(stream); - - s_obs - .ToStreamEventObservable() - .Wait(); - sw.Stop(); - return sw.Elapsed.TotalSeconds; - } - - static double RunTest( - Func> data1, - Func> data2, - Func, IStreamable, IStreamable> transform) - { - var stream = data1(); - var stream2 = data2(); - - var sw = new Stopwatch(); - sw.Start(); - var s_obs = transform(stream,stream2); - - s_obs - .ToStreamEventObservable() - .Wait(); - sw.Stop(); - return sw.Elapsed.TotalSeconds; - } - static void Main(string[] args) { - MessageParser parser = new MessageParser(() => new taxi_fare()); - while (true) { - taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - Console.WriteLine(fare); - } - string testcase = (args.Length > 0) ? args[0] : "normalize"; long size = (args.Length > 1) ? long.Parse(args[1]) : 100000000; long period = 1; double time = 0; - Func> data = () => - { - return new TestObs(period, size) - .ToStreamable() - .Cache(); - }; - - Func> DataFn(long p, long s) - { - return () => new TestObs(p, s) - .ToStreamable() - .Cache(); - } - - Func> TaxiFareDataFn(long p, long s) - { - return () => new TaxiFareData(p, s) - .ToStreamable() - .Cache(); - } - - Func> TaxiRideDataFn(long p, long s) - { - return () => new TaxiRideData(p, s) - .ToStreamable() - .Cache(); - } - switch (testcase) { case "select": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Select(e => e + 3) ); break; case "where": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Where(e => e > 0) ); break; case "aggregate": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .TumblingWindowLifetime(1000 * period) .Sum(e => e) ); break; case "alterdur": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .AlterEventDuration(10 * period) ); break; case "innerjoin": - time = RunTest(DataFn(period, size), DataFn(period, size), (stream,stream2) => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), BenchUtil.DataFn(period, size), (stream,stream2) => stream .Join(stream2, (left, right) => left + right) ); break; case "outerjoin": - time = RunTest(DataFn(period, size), DataFn(period, size), (stream, stream2) => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), BenchUtil.DataFn(period, size), (stream, stream2) => stream .FullOuterJoin(stream2, e => true, e => true, left => left, right => right, @@ -127,13 +58,13 @@ Func> TaxiRideDataFn(long p, long s) ); break; case "normalize": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Normalize(10000) ); break; case "fillmean": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .FillMean(10000, period) ); @@ -147,57 +78,57 @@ Func> TaxiRideDataFn(long p, long s) .ToStreamable() .Cache(); }; - time = RunTest(sig4, stream => + time = BenchUtil.RunTest(sig4, stream => stream .Resample(iperiod, operiod) ); break; case "algotrading": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .AlgoTrading(50, 20, period) ); break; case "largeqty": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .LargeQty(10, period) ); break; case "rsi": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .RSI(14, period) ); break; case "pantom": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .PanTom(period) ); break; case "kurtosis": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Kurtosis(100) ); break; case "taxi": - time = RunTest(TaxiRideDataFn(period, size), - TaxiFareDataFn(period, size), + time = BenchUtil.RunTest(BenchUtil.TaxiRideDataFn(period, size), + BenchUtil.TaxiFareDataFn(period, size), (stream, stream2) => stream .Taxi(stream2, 300) ); break; case "eg1": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Eg1(10, 20) ); break; case "eg2": - time = RunTest(DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => stream .Eg2(10, 20) ); From e3d905c1db12597c341fdf9dc65936375687dc0d Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 15 Mar 2022 03:07:02 -0400 Subject: [PATCH 17/23] Parser can parse different datasets at each run --- dataset_util/include/data_parser.h | 16 +- dataset_util/include/taxi_data_parser.h | 16 +- dataset_util/include/vibration_data_parser.h | 8 +- dataset_util/parser/parser.cpp | 34 +++-- trill_bench/bench/BenchUtil.cs | 20 ++- trill_bench/bench/Program.cs | 4 +- trill_bench/bench/TaxiData.cs | 149 +++++++++---------- 7 files changed, 131 insertions(+), 116 deletions(-) diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index 99babc7..d883d6c 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -12,6 +12,9 @@ using namespace std; template class data_parser { +private: + int64_t count = 0; + int64_t size; protected: virtual bool parse() = 0; virtual void gen_data(vector&, T*) = 0; @@ -37,7 +40,7 @@ class data_parser getline(file, line); vector row; - while (true) { + while (count < size) { if (!parse_csv_line(file, row)) { break; } @@ -48,8 +51,13 @@ class data_parser return false; } row.clear(); + count++; + } + if (count < size) { + return true; + } else { + return false; } - return true; } bool write_serialized_to_ostream(T &t) { @@ -74,7 +82,9 @@ class data_parser } public: - data_parser(){} + data_parser(int64_t size) : + size(size) + {} }; #endif // DATASET_UTIL_DATA_PARSER_H_ diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 5e0bcb7..32b590c 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -57,13 +57,15 @@ class taxi_data_parser : public data_parser const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; public: - taxi_data_parser(string &dataset_dir, string file_name_prefix) : + taxi_data_parser(string &dataset_dir, string file_name_prefix, int64_t size) : + data_parser(size), dataset_dir(dataset_dir), file_name_prefix(file_name_prefix) {} ~taxi_data_parser(){} bool parse() override { + int64_t count = 0; const path data_dir(dataset_dir); if (!is_directory(data_dir)) { cerr << "Directory " << dataset_dir << " does not exist." << endl; @@ -85,7 +87,9 @@ class taxi_data_parser : public data_parser } cerr << "Parsing " << trip_data_file << endl; std::fstream trip_csv_file(trip_data_file.string()); - this->parse_csv_file(trip_csv_file); + if (!this->parse_csv_file(trip_csv_file)) { + break; + } trip_csv_file.close(); i += 1; @@ -151,8 +155,8 @@ class taxi_trip_data_parser : public taxi_data_parser } public: - taxi_trip_data_parser(string &dataset_dir) : - taxi_data_parser(dataset_dir, "trip_data_"), + taxi_trip_data_parser(string &dataset_dir, int64_t size) : + taxi_data_parser(dataset_dir, "trip_data_", size), start_time(boost::gregorian::date(1970, 1, 1)) {} ~taxi_trip_data_parser(){} @@ -205,8 +209,8 @@ class taxi_fare_data_parser : public taxi_data_parser } public: - taxi_fare_data_parser(string &dataset_dir) : - taxi_data_parser(dataset_dir, "trip_fare_"), + taxi_fare_data_parser(string &dataset_dir, int64_t size) : + taxi_data_parser(dataset_dir, "trip_fare_", size), start_time(boost::gregorian::date(1970, 1, 1)) {} ~taxi_fare_data_parser(){} diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h index 0e28e88..863372e 100644 --- a/dataset_util/include/vibration_data_parser.h +++ b/dataset_util/include/vibration_data_parser.h @@ -48,7 +48,8 @@ class vibration_data_parser : public data_parser }; public: - vibration_data_parser(string &dataset_dir) : + vibration_data_parser(string &dataset_dir, int64_t size) : + data_parser(size), dataset_dir(dataset_dir) {} ~vibration_data_parser(){} @@ -91,11 +92,12 @@ class vibration_data_parser : public data_parser cerr << "Parsing " << data_file << endl; std::fstream data_csv_file(data_file.string()); - this->parse_csv_file(data_csv_file); + if(!this->parse_csv_file(data_csv_file)) { + break; + } data_csv_file.close(); } - } return true; diff --git a/dataset_util/parser/parser.cpp b/dataset_util/parser/parser.cpp index 29bf181..a80e032 100644 --- a/dataset_util/parser/parser.cpp +++ b/dataset_util/parser/parser.cpp @@ -8,23 +8,27 @@ using namespace std; int main(int argc, char* argv[]) { GOOGLE_PROTOBUF_VERIFY_VERSION; - if (argc != 3) { - cerr << "Usage: " << endl; + if (argc < 4 || argc % 3 != 1) { + cerr << "Usage: [ ]" << endl; } - string dataset_dir = argv[1]; - string dataset_name = argv[2]; - if (dataset_name == "taxi_trip") { - taxi_trip_data_parser parser(dataset_dir); - parser.parse(); - } else if (dataset_name == "taxi_fare") { - taxi_fare_data_parser parser(dataset_dir); - parser.parse(); - } else if (dataset_name == "vibration") { - vibration_data_parser parser(dataset_dir); - parser.parse(); - } else { - throw runtime_error("Unknown dataset name."); + for (int i = 0; i < argc / 3; i++) { + string dataset_dir = argv[1 + i * 3]; + string dataset_name = argv[2 + i * 3]; + int64_t size = stol(argv[3 + i * 3]); + + if (dataset_name == "taxi_trip") { + taxi_trip_data_parser parser(dataset_dir, size); + parser.parse(); + } else if (dataset_name == "taxi_fare") { + taxi_fare_data_parser parser(dataset_dir, size); + parser.parse(); + } else if (dataset_name == "vibration") { + vibration_data_parser parser(dataset_dir, size); + parser.parse(); + } else { + throw runtime_error("Unknown dataset name."); + } } google::protobuf::ShutdownProtobufLibrary(); diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs index b921f2b..6f6bd59 100644 --- a/trill_bench/bench/BenchUtil.cs +++ b/trill_bench/bench/BenchUtil.cs @@ -2,8 +2,6 @@ using System.Diagnostics; using System.Reactive.Linq; using Microsoft.StreamProcessing; -using Google.Protobuf; -using Stream; namespace bench { @@ -33,6 +31,16 @@ public static double RunTest( var stream = data1(); var stream2 = data2(); + // stream + // .ToStreamEventObservable() + // .Where(e => e.IsData) + // .ForEach(e => Console.WriteLine(e)); + + // stream2 + // .ToStreamEventObservable() + // .Where(e => e.IsData) + // .ForEach(e => Console.WriteLine(e)); + var sw = new Stopwatch(); sw.Start(); var s_obs = transform(stream,stream2); @@ -51,16 +59,16 @@ public static Func> DataFn(long p, long s) .Cache(); } - public static Func> TaxiFareDataFn(long p, long s) + public static Func> TaxiFareDataFn(long s) { - return () => new TaxiFareData(p, s) + return () => new TaxiFareData(s) .ToStreamable() .Cache(); } - public static Func> TaxiRideDataFn(long p, long s) + public static Func> TaxiRideDataFn(long s) { - return () => new TaxiRideData(p, s) + return () => new TaxiRideData(s) .ToStreamable() .Cache(); } diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index b5b8fad..c34108f 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -114,8 +114,8 @@ static void Main(string[] args) ); break; case "taxi": - time = BenchUtil.RunTest(BenchUtil.TaxiRideDataFn(period, size), - BenchUtil.TaxiFareDataFn(period, size), + time = BenchUtil.RunTest(BenchUtil.TaxiRideDataFn(size), + BenchUtil.TaxiFareDataFn(size), (stream, stream2) => stream .Taxi(stream2, 300) diff --git a/trill_bench/bench/TaxiData.cs b/trill_bench/bench/TaxiData.cs index 1800004..50fe7db 100644 --- a/trill_bench/bench/TaxiData.cs +++ b/trill_bench/bench/TaxiData.cs @@ -2,6 +2,8 @@ using System.Collections.Generic; using System.Threading; using Microsoft.StreamProcessing; +using Google.Protobuf; +using Stream; namespace bench { @@ -103,6 +105,21 @@ public TaxiRide(int medallion, int hack_license, string vendor_id, int rate_code this.dropoff_longitude = dropoff_longitude; this.dropoff_latitude = dropoff_latitude; } + + public override string ToString() { + return String.Format( + "{{medallion: {0}, hack_license: {1}, vendor_id: {2}, rate_code: {3}, " + + "store_and_fwd_flag: {4}, pickup_datetime: {5}, dropoff_datetime: {6}, " + + "passenger_count: {7}, trip_time_in_secs: {8}, trip_distance: {9}, " + + "pickup_longitude: {10}, pickup_latitude: {11}, dropoff_longitude: {12}, " + + "dropoff_latitude: {13}}}", + this.medallion, this.hack_license, this.vendor_id, this.rate_code, + this.store_and_fwd_flag, this.pickup_datetime, this.dropoff_datetime, + this.passenger_count, this.trip_time_in_secs, this.trip_distance, + this.pickup_longitude, this.pickup_latitude, this.dropoff_longitude, + this.dropoff_latitude + ); + } } public class TaxiFare @@ -135,45 +152,34 @@ public TaxiFare(int medallion, int hack_license, string vendor_id, DateTime pick this.tolls_amount = tolls_amount; this.total_amount = total_amount; } - } - - public class TaxiDrivers - { - public static List drivers; - - static TaxiDrivers() - { - SampleDrivers(); - } - public static void SampleDrivers() - { - TaxiDrivers.drivers = new List(); - for (int i = 0; i < 1000; i++) - { - var driver = new TaxiDriver(i, i, "Vendor-" + i.ToString()); - TaxiDrivers.drivers.Add(driver); - } + public override string ToString() { + return String.Format( + "{{medallion: {0}, hack_license: {1}, vendor_id: {2}, pickup_datetime: {3}, " + + "payment_type: {4}, fare_amount: {5}, surcharge: {6}, mta_tax: {7}, tip_amount: {8}, " + + "tolls_amount: {9}, total_amount: {10}}}", + this.medallion, this.hack_license, this.vendor_id, this.pickup_datetime, + this.payment_type, this.fare_amount, this.surcharge, this.mta_tax, this.tip_amount, + this.tolls_amount, this.total_amount + ); } } public abstract class TaxiDataObs : IObservable { public long size; - public long period; public List data; public DateTime datetime_base; - public TaxiDataObs(long period, long size) + public TaxiDataObs(long size) { - this.period = period; this.size = size; this.data = new List(); - this.datetime_base = new DateTime(2021, 10, 1, 0, 0, 0); - Sample(); + this.datetime_base = new DateTime(1970, 1, 1, 0, 0, 0); + LoadData(); } - public abstract void Sample(); + public abstract void LoadData(); public IDisposable Subscribe(IObserver observer) { @@ -213,81 +219,62 @@ public void Dispose() public class TaxiFareData : TaxiDataObs> { - public TaxiFareData(long period, long size) : base(period, size) + public TaxiFareData(long size) : base(size) {} - public override void Sample() + public override void LoadData() { - var rand = new Random(); + MessageParser parser = new MessageParser(() => new taxi_fare()); for (int i = 0; i < size; i++) { - var driver = TaxiDrivers.drivers[i % TaxiDrivers.drivers.Count]; - DateTime pickup_datetime = this.datetime_base.AddMinutes(i * 10); - string[] payment_types = {"VISA", "CASH"}; - string payment_type = payment_types[rand.Next(2)]; - float fare_amount = (float) (rand.NextDouble() * 100); - float surcharge = fare_amount * 0.1f; - float mta_tax = fare_amount * 0.05f; - float tip_amount = (float) (fare_amount * rand.NextDouble()); - float tolls_amount = (float) (rand.NextDouble() * 100); - float total_amount = fare_amount + surcharge + mta_tax + tip_amount + tolls_amount; - + taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + long st = fare.St; var payload = new TaxiFare( - driver.medallion, - driver.hack_license, - driver.vendor_id, - pickup_datetime, - payment_type, - fare_amount, - surcharge, - mta_tax, - tip_amount, - tolls_amount, - total_amount + fare.Payload.Medallion, + fare.Payload.HackLicense, + fare.Payload.VendorId, + this.datetime_base.AddSeconds(st), + fare.Payload.PaymentType, + fare.Payload.FareAmount, + fare.Payload.Surcharge, + fare.Payload.MtaTax, + fare.Payload.TipAmount, + fare.Payload.TollsAmount, + fare.Payload.TotalAmount ); - data.Add(StreamEvent.CreateInterval(i * period, (i + 1) * period, payload)); + data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); } } } public class TaxiRideData : TaxiDataObs> { - public TaxiRideData(long period, long size) : base(period, size) + public TaxiRideData(long size) : base(size) {} - public override void Sample() + public override void LoadData() { - var rand = new Random(); + MessageParser parser = new MessageParser(() => new taxi_trip()); for (int i = 0; i < size; i++) { - var driver = TaxiDrivers.drivers[i % TaxiDrivers.drivers.Count]; - int rate_code = rand.Next(10); - bool store_and_fwd_flag = rand.Next() > (Int32.MaxValue / 2); - DateTime pickup_datetime = this.datetime_base.AddMinutes(i * 10); - DateTime dropoff_datetime = pickup_datetime.AddMinutes(rand.Next(1, 100)); - int passenger_count = rand.Next(1, 4); - float trip_time_in_secs = (float) (dropoff_datetime - pickup_datetime).TotalSeconds; - float trip_distance = (float) (rand.NextDouble() * 100); - float pickup_longitude = (float) (rand.NextDouble() * 100); - float pickup_latitude = (float) (rand.NextDouble() * 100); - float dropoff_longitude = (float) (rand.NextDouble() * 100); - float dropoff_latitude = (float) (rand.NextDouble() * 100); - + taxi_trip trip = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + long st = trip.St; + long et = trip.Et; var payload = new TaxiRide( - driver.medallion, - driver.hack_license, - driver.vendor_id, - rate_code, - store_and_fwd_flag, - pickup_datetime, - dropoff_datetime, - passenger_count, - trip_time_in_secs, - trip_distance, - pickup_longitude, - pickup_latitude, - dropoff_longitude, - dropoff_latitude + trip.Payload.Medallion, + trip.Payload.HackLicense, + trip.Payload.VendorId, + trip.Payload.RateCode, + trip.Payload.StoreAndFwdFlag, + this.datetime_base.AddSeconds(st), + this.datetime_base.AddSeconds(et), + trip.Payload.PassengerCount, + trip.Payload.TripTimeInSecs, + trip.Payload.TripDistance, + trip.Payload.PickupLongitude, + trip.Payload.PickupLatitude, + trip.Payload.DropoffLongitude, + trip.Payload.DropoffLatitude ); - data.Add(StreamEvent.CreateInterval(i * period, (i + 1) * period, payload)); + data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); } } } From 0b505c58b5bb923c1141e36d2e44d253e0f445d2 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 15 Mar 2022 22:15:14 -0400 Subject: [PATCH 18/23] Remove loader folder --- dataset_util/loader/cpp/CMakeLists.txt | 32 ------------ dataset_util/loader/cpp/loader.cpp | 46 ----------------- dataset_util/loader/csharp/loader.sln | 48 ------------------ dataset_util/loader/csharp/loader/Program.cs | 49 ------------------- .../loader/csharp/loader/loader.csproj | 12 ----- dataset_util/loader/java/TaxiFareLoader.java | 29 ----------- dataset_util/loader/java/TaxiTripLoader.java | 31 ------------ 7 files changed, 247 deletions(-) delete mode 100644 dataset_util/loader/cpp/CMakeLists.txt delete mode 100644 dataset_util/loader/cpp/loader.cpp delete mode 100644 dataset_util/loader/csharp/loader.sln delete mode 100644 dataset_util/loader/csharp/loader/Program.cs delete mode 100644 dataset_util/loader/csharp/loader/loader.csproj delete mode 100644 dataset_util/loader/java/TaxiFareLoader.java delete mode 100644 dataset_util/loader/java/TaxiTripLoader.java diff --git a/dataset_util/loader/cpp/CMakeLists.txt b/dataset_util/loader/cpp/CMakeLists.txt deleted file mode 100644 index 72a6193..0000000 --- a/dataset_util/loader/cpp/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -cmake_minimum_required(VERSION 3.13.4) -set(CMAKE_C_COMPILER clang) -set(CMAKE_CXX_COMPILER clang++) - -project(dataset_loader) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -find_package(protobuf CONFIG REQUIRED) - -if(protobuf_VERBOSE) - message(STATUS "Using Protocol Buffers ${protobuf_VERSION}") -endif() - -get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) -get_filename_component(DATASET_UTIL_DIR ${PARENT_DIR} DIRECTORY) -set(Protobuf_IMPORT_DIRS ${DATASET_UTIL_DIR}/protos) - -include_directories(${PROTOBUF_INCLUDE_DIRS} ${DATASET_UTIL_DIR}/include) -set(CMAKE_INCLUDE_CURRENT_DIR TRUE) - -add_executable(loader loader.cpp) -target_link_libraries(loader protobuf::libprotobuf) - -file(GLOB PROTO_FILES - "${Protobuf_IMPORT_DIRS}/*.proto" -) - -set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET loader LANGUAGE cpp PROTOS ${PROTO_FILES}) \ No newline at end of file diff --git a/dataset_util/loader/cpp/loader.cpp b/dataset_util/loader/cpp/loader.cpp deleted file mode 100644 index 75a37ab..0000000 --- a/dataset_util/loader/cpp/loader.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include - -#include -#include -#include - -using namespace std; - -template -void print_data(data_loader &loader) -{ - while (true) { - T t; - if (!loader.load_data(t)) { - break; - } - cout << t << endl; - } -} - -int main(int argc, char** argv) -{ - GOOGLE_PROTOBUF_VERIFY_VERSION; - - string dataset_name = "taxi_fare"; - if (argc > 1) { - dataset_name = argv[1]; - } - - if (dataset_name == "taxi_fare") { - data_loader loader; - print_data(loader); - } else if (dataset_name == "taxi_trip") { - data_loader loader; - print_data(loader); - } else if (dataset_name == "vibration") { - data_loader loader; - print_data(loader); - } else { - throw runtime_error("Unknown dataset name."); - } - - google::protobuf::ShutdownProtobufLibrary(); - return 0; -} \ No newline at end of file diff --git a/dataset_util/loader/csharp/loader.sln b/dataset_util/loader/csharp/loader.sln deleted file mode 100644 index 1bf206c..0000000 --- a/dataset_util/loader/csharp/loader.sln +++ /dev/null @@ -1,48 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26124.0 -MinimumVisualStudioVersion = 15.0.26124.0 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "loader", "loader\loader.csproj", "{7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}" -EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Google.Protobuf", "..\..\protobuf\csharp\src\Google.Protobuf\Google.Protobuf.csproj", "{E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|Any CPU = Release|Any CPU - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|Any CPU.Build.0 = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x64.ActiveCfg = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x64.Build.0 = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x86.ActiveCfg = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Debug|x86.Build.0 = Debug|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|Any CPU.ActiveCfg = Release|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|Any CPU.Build.0 = Release|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x64.ActiveCfg = Release|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x64.Build.0 = Release|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x86.ActiveCfg = Release|Any CPU - {7CB8C04D-8F65-468D-934B-56AA3B7CE2CD}.Release|x86.Build.0 = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|Any CPU.Build.0 = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x64.ActiveCfg = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x64.Build.0 = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x86.ActiveCfg = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Debug|x86.Build.0 = Debug|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|Any CPU.ActiveCfg = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|Any CPU.Build.0 = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x64.ActiveCfg = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x64.Build.0 = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x86.ActiveCfg = Release|Any CPU - {E39118E2-9A0A-4870-BFD0-71E13EE7EA7A}.Release|x86.Build.0 = Release|Any CPU - EndGlobalSection -EndGlobal diff --git a/dataset_util/loader/csharp/loader/Program.cs b/dataset_util/loader/csharp/loader/Program.cs deleted file mode 100644 index a423a5c..0000000 --- a/dataset_util/loader/csharp/loader/Program.cs +++ /dev/null @@ -1,49 +0,0 @@ -using System; -using Stream; -using Google.Protobuf; - -namespace loader -{ - public class DataLoader where T : IMessage, new() - { - private MessageParser MsgParser = new MessageParser(() => new T()); - - public T LoadData() - { - return MsgParser.ParseDelimitedFrom(Console.OpenStandardInput()); - } - } - - class Program - { - static void PrintData(DataLoader data_loader) where T : IMessage, new() - { - try { - while (true) { - T t = data_loader.LoadData(); - Console.WriteLine(t); - } - } catch (Exception e) { - return; - } - } - - static void Main(string[] args) - { - string dataset = "taxi_fare"; - if (args.Length > 0) { - dataset = args[0]; - } - - if (dataset == "taxi_fare") { - DataLoader data_loader = new DataLoader(); - PrintData(data_loader); - } else if (dataset == "taxi_trip") { - DataLoader data_loader = new DataLoader(); - PrintData(data_loader); - } else { - throw new Exception("Unknown dataset"); - } - } - } -} diff --git a/dataset_util/loader/csharp/loader/loader.csproj b/dataset_util/loader/csharp/loader/loader.csproj deleted file mode 100644 index 9fcbaf6..0000000 --- a/dataset_util/loader/csharp/loader/loader.csproj +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - Exe - netcoreapp3.1 - - - diff --git a/dataset_util/loader/java/TaxiFareLoader.java b/dataset_util/loader/java/TaxiFareLoader.java deleted file mode 100644 index fa4cb33..0000000 --- a/dataset_util/loader/java/TaxiFareLoader.java +++ /dev/null @@ -1,29 +0,0 @@ -import com.stream.taxi.protos.taxi_fare; -import com.stream.taxi.protos.taxi_fare_payload; -import java.io.IOException; - -public class TaxiFareLoader { - - public static void printTaxiFare(taxi_fare fare) { - taxi_fare_payload payload = fare.getPayload(); - System.out.format( - "taxi_fare[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, payment_type: %s, fare_amount: %f, " + - "surcharge: %f, mta_tax: %f, tip_amount: %f, tolls_amount: %f, total_amount: %f%n", - fare.getSt(), fare.getEt(), payload.getMedallion(), payload.getHackLicense(), payload.getVendorId(), - payload.getPaymentType(), payload.getFareAmount(), payload.getSurcharge(), payload.getMtaTax(), - payload.getTipAmount(), payload.getTollsAmount(), payload.getTotalAmount() - ); - } - - public static void main(String[] args) { - try { - while (true) { - taxi_fare fare = taxi_fare.parseDelimitedFrom(System.in); - TaxiFareLoader.printTaxiFare(fare); - } - } - catch (IOException e) { - return; - } - } -} \ No newline at end of file diff --git a/dataset_util/loader/java/TaxiTripLoader.java b/dataset_util/loader/java/TaxiTripLoader.java deleted file mode 100644 index 55a7bc4..0000000 --- a/dataset_util/loader/java/TaxiTripLoader.java +++ /dev/null @@ -1,31 +0,0 @@ -import com.stream.taxi.protos.taxi_trip; -import com.stream.taxi.protos.taxi_trip_payload; -import java.io.IOException; - -public class TaxiTripLoader { - - public static void printTaxiTrip(taxi_trip trip) { - taxi_trip_payload payload = trip.getPayload(); - System.out.format( - "taxi_trip[%d, %d]: medallion: %d, hack_license: %d, vendor_id: %s, rate_code: %d, " + - "store_and_fwd_flag: %b, passenger_count: %d, trip_time_in_secs: %d, trip_distance: %f, " + - "pickup_longitude: %f, pickup_latitude: %f, dropoff_longitude: %f, dropoff_latitude: %f%n", - trip.getSt(), trip.getEt(), payload.getMedallion(), payload.getHackLicense(), payload.getVendorId(), - payload.getRateCode(), payload.getStoreAndFwdFlag(), payload.getPassengerCount(), - payload.getTripTimeInSecs(), payload.getTripDistance(), payload.getPickupLongitude(), - payload.getPickupLatitude(), payload.getDropoffLongitude(), payload.getDropoffLatitude() - ); - } - - public static void main(String[] args) { - try { - while (true) { - taxi_trip trip = taxi_trip.parseDelimitedFrom(System.in); - TaxiTripLoader.printTaxiTrip(trip); - } - } - catch (IOException e) { - return; - } - } -} \ No newline at end of file From a7395f22afe65c87215d77fbfa7b0650f5b1639c Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 15 Mar 2022 22:37:27 -0400 Subject: [PATCH 19/23] Fix CMAKE file to accept folder of protos --- tilt_bench/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tilt_bench/CMakeLists.txt b/tilt_bench/CMakeLists.txt index 2562cca..10dd022 100644 --- a/tilt_bench/CMakeLists.txt +++ b/tilt_bench/CMakeLists.txt @@ -34,5 +34,9 @@ add_executable(main main.cpp) target_link_libraries(main protobuf::libprotobuf tilt) +file(GLOB PROTO_FILES + "${Protobuf_IMPORT_DIRS}/*.proto" +) + set(_protobuf_include_path -I ${Protobuf_IMPORT_DIRS}) -protobuf_generate(TARGET main LANGUAGE cpp PROTOS ${Protobuf_IMPORT_DIRS}/taxi.proto) \ No newline at end of file +protobuf_generate(TARGET main LANGUAGE cpp PROTOS ${PROTO_FILES}) \ No newline at end of file From bfc72322f35c39ecd9a195450043b00d159d3996 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Wed, 16 Mar 2022 17:00:57 -0400 Subject: [PATCH 20/23] Add Vibration dataset parser for trill --- trill_bench/bench/BenchUtil.cs | 12 +++++ trill_bench/bench/Dataset.cs | 88 ++++++++++++++++++++++++++++++++++ trill_bench/bench/Program.cs | 2 +- 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 trill_bench/bench/Dataset.cs diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs index 6f6bd59..8c6d7b6 100644 --- a/trill_bench/bench/BenchUtil.cs +++ b/trill_bench/bench/BenchUtil.cs @@ -12,6 +12,11 @@ public static double RunTest(Func e.IsData) + .ForEach(e => Console.WriteLine(e)); + var sw = new Stopwatch(); sw.Start(); var s_obs = transform(stream); @@ -72,5 +77,12 @@ public static Func> TaxiRideDataFn(long s) .ToStreamable() .Cache(); } + + public static Func> VibrationDataFn(long s) + { + return () => new VibrationObs(s) + .ToStreamable() + .Cache(); + } } } diff --git a/trill_bench/bench/Dataset.cs b/trill_bench/bench/Dataset.cs new file mode 100644 index 0000000..49147cc --- /dev/null +++ b/trill_bench/bench/Dataset.cs @@ -0,0 +1,88 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using Microsoft.StreamProcessing; +using Google.Protobuf; +using Stream; + +namespace bench +{ + using test_t = StreamEvent; + + public abstract class DatasetObs : IObservable + { + public long size; + public List data; + + public DatasetObs(long size) + { + this.size = size; + this.data = new List(); + } + + public abstract void LoadDataPoint(); + + public void LoadData() + { + for (int i = 0; i < size; i++) + { + this.LoadDataPoint(); + } + } + + public IDisposable Subscribe(IObserver observer) + { + return new Subscription(this, observer); + } + + private sealed class Subscription : IDisposable + { + private readonly DatasetObs observable; + private readonly IObserver observer; + + public Subscription(DatasetObs observable, IObserver observer) + { + this.observer = observer; + this.observable = observable; + ThreadPool.QueueUserWorkItem( + arg => + { + this.Sample(); + this.observer.OnCompleted(); + }); + } + + private void Sample() + { + for (int i = 0; i < observable.data.Count; i++) + { + this.observer.OnNext(observable.data[i]); + } + } + + public void Dispose() + { + } + } + } + + public class VibrationObs : DatasetObs + { + private MessageParser parser; + + public override void LoadDataPoint() + { + vibration vibration = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + long st = vibration.St; + long et = vibration.Et; + float payload = vibration.Payload.Channel1; + data.Add(StreamEvent.CreateInterval(st, et, payload)); + } + + public VibrationObs(long size) : base(size) + { + this.parser = new MessageParser(() => new vibration()); + LoadData(); + } + } +} \ No newline at end of file diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index c34108f..8a964b7 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -108,7 +108,7 @@ static void Main(string[] args) ); break; case "kurtosis": - time = BenchUtil.RunTest(BenchUtil.DataFn(period, size), stream => + time = BenchUtil.RunTest(BenchUtil.VibrationDataFn(size), stream => stream .Kurtosis(100) ); From 22c4b883b927d0cf4063d2b971955a99d7e20e1a Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Thu, 17 Mar 2022 03:49:48 -0400 Subject: [PATCH 21/23] Wrapper for stream event, allows to check the type of protobuf message --- dataset_util/include/data_loader.h | 7 +- dataset_util/include/data_parser.h | 30 +- dataset_util/include/data_printer.h | 67 ++++ dataset_util/include/taxi_data_parser.h | 293 ++++++++---------- dataset_util/include/vibration_data_parser.h | 31 +- dataset_util/parser/parser.cpp | 31 +- dataset_util/protos/stream_event.proto | 20 ++ dataset_util/protos/taxi.proto | 12 - dataset_util/protos/vibration.proto | 6 - tilt_bench/main.cpp | 18 +- trill_bench/bench/BenchUtil.cs | 71 +++-- .../bench/{Dataset.cs => DatasetData.cs} | 12 +- trill_bench/bench/Program.cs | 3 +- trill_bench/bench/TaxiData.cs | 95 +++--- 14 files changed, 360 insertions(+), 336 deletions(-) create mode 100644 dataset_util/include/data_printer.h create mode 100644 dataset_util/protos/stream_event.proto rename trill_bench/bench/{Dataset.cs => DatasetData.cs} (84%) diff --git a/dataset_util/include/data_loader.h b/dataset_util/include/data_loader.h index 00e568d..5c062fe 100644 --- a/dataset_util/include/data_loader.h +++ b/dataset_util/include/data_loader.h @@ -7,9 +7,10 @@ #include #include +#include + using namespace std; -template class data_loader { private: @@ -22,9 +23,9 @@ class data_loader {} ~data_loader(){} - bool load_data(T& t) { + bool load_data(stream::stream_event& event) { bool clean_eof; - if (!google::protobuf::util::ParseDelimitedFromCodedStream(&t, &coded_in, &clean_eof)) { + if (!google::protobuf::util::ParseDelimitedFromCodedStream(&event, &coded_in, &clean_eof)) { if (!clean_eof) { cerr << "Fail to parse data from coded input stream." << endl; } diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index d883d6c..4980f02 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -7,17 +7,15 @@ #include +#include + using namespace std; -template class data_parser { -private: - int64_t count = 0; - int64_t size; protected: virtual bool parse() = 0; - virtual void gen_data(vector&, T*) = 0; + virtual void gen_data(vector&, stream::stream_event*, int flag = 0) = 0; bool parse_csv_line(fstream &file, vector &row) { string line; @@ -35,32 +33,26 @@ class data_parser return false; } - bool parse_csv_file(fstream &file) { + void parse_csv_file(fstream &file, int flag = 0) { string line; getline(file, line); vector row; - while (count < size) { + while (true) { if (!parse_csv_line(file, row)) { break; } - T data; - gen_data(row, &data); + stream::stream_event data; + gen_data(row, &data, flag); if (!write_serialized_to_ostream(data)) { - return false; + break; } row.clear(); - count++; - } - if (count < size) { - return true; - } else { - return false; } } - bool write_serialized_to_ostream(T &t) { + bool write_serialized_to_ostream(stream::stream_event &t) { if (!google::protobuf::util::SerializeDelimitedToOstream(t, &cout)) { cerr << "Fail to serialize data into output stream" << endl; return false; @@ -82,9 +74,7 @@ class data_parser } public: - data_parser(int64_t size) : - size(size) - {} + data_parser(){} }; #endif // DATASET_UTIL_DATA_PARSER_H_ diff --git a/dataset_util/include/data_printer.h b/dataset_util/include/data_printer.h new file mode 100644 index 0000000..955e9c0 --- /dev/null +++ b/dataset_util/include/data_printer.h @@ -0,0 +1,67 @@ +#ifndef DATASET_UTIL_DATA_PRINTER_H_ +#define DATASET_UTIL_DATA_PRINTER_H_ + +#include + +#include + +using namespace std; + +ostream& operator<< (ostream& out, stream::vibration const& vibration) +{ + out << "vibration: "; + out << "channel_1: " << vibration.channel_1() << ", "; + out << "channel_2: " << vibration.channel_2(); + return out; +} + +ostream& operator<< (ostream& out, stream::taxi_trip const& trip) +{ + out << "taxi_trip: "; + out << "medallion: " << trip.medallion() << ", "; + out << "hack_license: " << trip.hack_license() << ", "; + out << "vendor_id: " << trip.vendor_id() << ", "; + out << "rate_code: " << trip.rate_code() << ", "; + out << "store_and_fwd_flag: " << trip.store_and_fwd_flag() << ", "; + out << "passenger_count: " << trip.passenger_count() << ", "; + out << "trip_time_in_secs: " << trip.trip_time_in_secs() << ", "; + out << "trip_distance: " << trip.trip_distance() << ", "; + out << "pickup_longitude: " << trip.pickup_longitude() << ", "; + out << "pickup_latitude: " << trip.pickup_latitude() << ", "; + out << "dropoff_longitude: " << trip.dropoff_longitude() << ", "; + out << "dropoff_latitude: " << trip.dropoff_latitude(); + return out; +} + +ostream& operator<< (ostream& out, stream::taxi_fare const& fare) +{ + out << "taxi_fare: "; + out << "medallion: " << fare.medallion() << ", "; + out << "hack_license: " << fare.hack_license() << ", "; + out << "vendor_id: " << fare.vendor_id() << ", "; + out << "payment_type: " << fare.payment_type() << ", "; + out << "fare_amount: " << fare.fare_amount() << ", "; + out << "surcharge: " << fare.surcharge() << ", "; + out << "mta_tax: " << fare.mta_tax() << ", "; + out << "tip_amount: " << fare.tip_amount() << ", "; + out << "tolls_amount: " << fare.tolls_amount() << ", "; + out << "total_amount: " << fare.total_amount(); + return out; +} + +ostream& operator<< (ostream& out, stream::stream_event const& event) +{ + out << "Event: [" << event.st() << ", " << event.et() << "]: "; + if (event.has_taxi_trip()) { + out << event.taxi_trip(); + } else if (event.has_taxi_fare()) { + out << event.taxi_fare(); + } else if (event.has_vibration()) { + out << event.vibration(); + } else { + out << "Unknown payload"; + } + return out; +} + +#endif // DATASET_UTIL_DATA_PRINTER_H_ diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 32b590c..46d1951 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -7,65 +7,126 @@ #include #include -#include - #include using namespace std; using namespace boost::filesystem; -ostream& operator<< (ostream& out, stream::taxi_trip const& trip) -{ - out << "taxi_trip[" << trip.st() << ", " << trip.et() << "]: "; - out << "medallion: " << trip.payload().medallion() << ", "; - out << "hack_license: " << trip.payload().hack_license() << ", "; - out << "vendor_id: " << trip.payload().vendor_id() << ", "; - out << "rate_code: " << trip.payload().rate_code() << ", "; - out << "store_and_fwd_flag: " << trip.payload().store_and_fwd_flag() << ", "; - out << "passenger_count: " << trip.payload().passenger_count() << ", "; - out << "trip_time_in_secs: " << trip.payload().trip_time_in_secs() << ", "; - out << "trip_distance: " << trip.payload().trip_distance() << ", "; - out << "pickup_longitude: " << trip.payload().pickup_longitude() << ", "; - out << "pickup_latitude: " << trip.payload().pickup_latitude() << ", "; - out << "dropoff_longitude: " << trip.payload().dropoff_longitude() << ", "; - out << "dropoff_latitude: " << trip.payload().dropoff_latitude(); - return out; -} - -ostream& operator<< (ostream& out, stream::taxi_fare const& fare) -{ - out << "taxi_fare[" << fare.st() << ", " << fare.et() << "]: "; - out << "medallion: " << fare.payload().medallion() << ", "; - out << "hack_license: " << fare.payload().hack_license() << ", "; - out << "vendor_id: " << fare.payload().vendor_id() << ", "; - out << "payment_type: " << fare.payload().payment_type() << ", "; - out << "fare_amount: " << fare.payload().fare_amount() << ", "; - out << "surcharge: " << fare.payload().surcharge() << ", "; - out << "mta_tax: " << fare.payload().mta_tax() << ", "; - out << "tip_amount: " << fare.payload().tip_amount() << ", "; - out << "tolls_amount: " << fare.payload().tolls_amount() << ", "; - out << "total_amount: " << fare.payload().total_amount(); - return out; -} - -template -class taxi_data_parser : public data_parser +class taxi_data_parser : public data_parser { +private: + enum TAXI_FARE_DATA_INDEX { + TAXI_FARE_MEDALLION, + TAXI_FARE_HACK_LICENSE, + TAXI_FARE_VENDOR_ID, + TAXI_FARE_PICKUP_DATETIME, + TAXI_FARE_PAYMENT_TYPE, + TAXI_FARE_FARE_AMOUNT, + TAXI_FARE_SURCHARGE, + TAXI_FARE_MTA_TAX, + TAXI_FARE_TIP_AMOUNT, + TAXI_FARE_TOLLS_AMOUNT, + TAXI_FARE_TOTAL_AMOUNT + }; + enum TAXI_TRIP_DATA_INDEX { + TAXI_TRIP_MEDALLION, + TAXI_TRIP_HACK_LICENSE, + TAXI_TRIP_VENDOR_ID, + TAXI_TRIP_RATE_CODE, + TAXI_TRIP_STORE_AND_FWD_FLAG, + TAXI_TRIP_PICKUP_DATETIME, + TAXI_TRIP_DROPOFF_DATETIME, + TAXI_TRIP_PASSENGER_COUNT, + TAXI_TRIP_TRIP_TIME_IN_SECS, + TAXI_TRIP_TRIP_DISTANCE, + TAXI_TRIP_PICKUP_LONGITUDE, + TAXI_TRIP_PICKUP_LATITUDE, + TAXI_TRIP_DROPOFF_LONGITUDE, + TAXI_TRIP_DROPOFF_LATITUDE + }; protected: + vector file_name_prefixes; string &dataset_dir; - string file_name_prefix; + boost::posix_time::ptime start_time; const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; + void gen_taxi_trip(vector &row, stream::stream_event *event) { + int64_t st = this->parse_datetime_to_seconds(row[TAXI_TRIP_PICKUP_DATETIME], start_time); + int64_t et = this->parse_datetime_to_seconds(row[TAXI_TRIP_DROPOFF_DATETIME], start_time); + int32_t medallion = stoi(row[TAXI_TRIP_MEDALLION]); + int32_t hack_license = stoi(row[TAXI_TRIP_HACK_LICENSE]); + string vendor_id = row[TAXI_TRIP_VENDOR_ID]; + int32_t rate_code = stoi(row[TAXI_TRIP_RATE_CODE]); + bool store_and_fwd_flag = false; + int32_t passenger_count = stoi(row[TAXI_TRIP_PASSENGER_COUNT]); + int32_t trip_time_in_secs = stoi(row[TAXI_TRIP_TRIP_TIME_IN_SECS]); + float trip_distance = this->stof_err_handle(row[TAXI_TRIP_TRIP_DISTANCE]); + float pickup_longitude = this->stof_err_handle(row[TAXI_TRIP_PICKUP_LONGITUDE]); + float pickup_latitude = this->stof_err_handle(row[TAXI_TRIP_PICKUP_LATITUDE]); + float dropoff_longitude = this->stof_err_handle(row[TAXI_TRIP_DROPOFF_LONGITUDE]); + float dropoff_latitude = this->stof_err_handle(row[TAXI_TRIP_DROPOFF_LATITUDE]); + + event->set_st(st); + event->set_et(et); + event->mutable_taxi_trip()->set_medallion(medallion); + event->mutable_taxi_trip()->set_hack_license(hack_license); + event->mutable_taxi_trip()->set_vendor_id(vendor_id); + event->mutable_taxi_trip()->set_rate_code(rate_code); + event->mutable_taxi_trip()->set_store_and_fwd_flag(store_and_fwd_flag); + event->mutable_taxi_trip()->set_passenger_count(passenger_count); + event->mutable_taxi_trip()->set_trip_time_in_secs(trip_time_in_secs); + event->mutable_taxi_trip()->set_trip_distance(trip_distance); + event->mutable_taxi_trip()->set_dropoff_longitude(dropoff_longitude); + event->mutable_taxi_trip()->set_pickup_latitude(pickup_latitude); + event->mutable_taxi_trip()->set_pickup_longitude(pickup_longitude); + event->mutable_taxi_trip()->set_dropoff_latitude(dropoff_latitude); + } + + void gen_taxi_fare(vector &row, stream::stream_event *event) { + int64_t st = this->parse_datetime_to_seconds(row[TAXI_FARE_PICKUP_DATETIME], start_time); + int64_t et = st + 1; + int32_t medallion = stoi(row[TAXI_FARE_MEDALLION]); + int32_t hack_license = stoi(row[TAXI_FARE_HACK_LICENSE]); + string vendor_id = row[TAXI_FARE_VENDOR_ID]; + string payment_type = row[TAXI_FARE_PAYMENT_TYPE]; + float fare_amount = this->stof_err_handle(row[TAXI_FARE_FARE_AMOUNT]); + float surcharge = this->stof_err_handle(row[TAXI_FARE_SURCHARGE]); + float mta_tax = this->stof_err_handle(row[TAXI_FARE_MTA_TAX]); + float tip_amount = this->stof_err_handle(row[TAXI_FARE_TIP_AMOUNT]); + float tolls_amount = this->stof_err_handle(row[TAXI_FARE_TOLLS_AMOUNT]); + float total_amount = this->stof_err_handle(row[TAXI_FARE_TOTAL_AMOUNT]); + + event->set_st(st); + event->set_et(et); + event->mutable_taxi_fare()->set_medallion(medallion); + event->mutable_taxi_fare()->set_hack_license(hack_license); + event->mutable_taxi_fare()->set_vendor_id(vendor_id); + event->mutable_taxi_fare()->set_payment_type(payment_type); + event->mutable_taxi_fare()->set_fare_amount(fare_amount); + event->mutable_taxi_fare()->set_surcharge(surcharge); + event->mutable_taxi_fare()->set_mta_tax(mta_tax); + event->mutable_taxi_fare()->set_tip_amount(tip_amount); + event->mutable_taxi_fare()->set_tolls_amount(tolls_amount); + event->mutable_taxi_fare()->set_total_amount(total_amount); + } + public: - taxi_data_parser(string &dataset_dir, string file_name_prefix, int64_t size) : - data_parser(size), + taxi_data_parser(string &dataset_name, string &dataset_dir) : dataset_dir(dataset_dir), - file_name_prefix(file_name_prefix) - {} + start_time(boost::gregorian::date(1970, 1, 1)) + { + if (dataset_name == "taxi_trip") { + file_name_prefixes.push_back("trip_data_"); + } else if (dataset_name == "taxi_fare") { + file_name_prefixes.push_back("trip_fare_"); + } else { + file_name_prefixes.push_back("trip_data_"); + file_name_prefixes.push_back("trip_fare_"); + } + } ~taxi_data_parser(){} bool parse() override { - int64_t count = 0; const path data_dir(dataset_dir); if (!is_directory(data_dir)) { cerr << "Directory " << dataset_dir << " does not exist." << endl; @@ -81,139 +142,37 @@ class taxi_data_parser : public data_parser size_t i = 1; while (true) { - path trip_data_file = foil_dir / (file_name_prefix + std::to_string(i) + ".csv"); - if (!exists(trip_data_file)) { - break; + bool file_exists = false; + for (auto &file_name_prefix : file_name_prefixes) { + path trip_data_file = foil_dir / (file_name_prefix + std::to_string(i) + ".csv"); + if (exists(trip_data_file)) { + file_exists = true; + } else { + continue; + } + cerr << "Parsing " << trip_data_file << endl; + std::fstream trip_csv_file(trip_data_file.string()); + this->parse_csv_file(trip_csv_file, file_name_prefix == "trip_data_" ? 0 : 1); + + trip_csv_file.close(); } - cerr << "Parsing " << trip_data_file << endl; - std::fstream trip_csv_file(trip_data_file.string()); - if (!this->parse_csv_file(trip_csv_file)) { + i++; + if (!file_exists) { break; } - - trip_csv_file.close(); - i += 1; } } return true; } -}; - - -class taxi_trip_data_parser : public taxi_data_parser -{ -private: - enum TAXI_TRIP_DATA_INDEX { - MEDALLION, - HACK_LICENSE, - VENDOR_ID, - RATE_CODE, - STORE_AND_FWD_FLAG, - PICKUP_DATETIME, - DROPOFF_DATETIME, - PASSENGER_COUNT, - TRIP_TIME_IN_SECS, - TRIP_DISTANCE, - PICKUP_LONGITUDE, - PICKUP_LATITUDE, - DROPOFF_LONGITUDE, - DROPOFF_LATITUDE - }; - boost::posix_time::ptime start_time; - - void gen_data(vector &row, stream::taxi_trip *trip) override { - int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); - int64_t et = this->parse_datetime_to_seconds(row[DROPOFF_DATETIME], start_time); - int32_t medallion = stoi(row[MEDALLION]); - int32_t hack_license = stoi(row[HACK_LICENSE]); - string vendor_id = row[VENDOR_ID]; - int32_t rate_code = stoi(row[RATE_CODE]); - bool store_and_fwd_flag = false; - int32_t passenger_count = stoi(row[PASSENGER_COUNT]); - int32_t trip_time_in_secs = stoi(row[TRIP_TIME_IN_SECS]); - float trip_distance = this->stof_err_handle(row[TRIP_DISTANCE]); - float pickup_longitude = this->stof_err_handle(row[PICKUP_LONGITUDE]); - float pickup_latitude = this->stof_err_handle(row[PICKUP_LATITUDE]); - float dropoff_longitude = this->stof_err_handle(row[DROPOFF_LONGITUDE]); - float dropoff_latitude = this->stof_err_handle(row[DROPOFF_LATITUDE]); - - trip->set_st(st); - trip->set_et(et); - trip->mutable_payload()->set_medallion(medallion); - trip->mutable_payload()->set_hack_license(hack_license); - trip->mutable_payload()->set_vendor_id(vendor_id); - trip->mutable_payload()->set_rate_code(rate_code); - trip->mutable_payload()->set_store_and_fwd_flag(store_and_fwd_flag); - trip->mutable_payload()->set_passenger_count(passenger_count); - trip->mutable_payload()->set_trip_time_in_secs(trip_time_in_secs); - trip->mutable_payload()->set_trip_distance(trip_distance); - trip->mutable_payload()->set_dropoff_longitude(dropoff_longitude); - trip->mutable_payload()->set_pickup_latitude(pickup_latitude); - trip->mutable_payload()->set_pickup_longitude(pickup_longitude); - trip->mutable_payload()->set_dropoff_latitude(dropoff_latitude); - } - -public: - taxi_trip_data_parser(string &dataset_dir, int64_t size) : - taxi_data_parser(dataset_dir, "trip_data_", size), - start_time(boost::gregorian::date(1970, 1, 1)) - {} - ~taxi_trip_data_parser(){} -}; -class taxi_fare_data_parser : public taxi_data_parser -{ -private: - enum TAXI_FARE_DATA_INDEX { - MEDALLION, - HACK_LICENSE, - VENDOR_ID, - PICKUP_DATETIME, - PAYMENT_TYPE, - FARE_AMOUNT, - SURCHARGE, - MTA_TAX, - TIP_AMOUNT, - TOLLS_AMOUNT, - TOTAL_AMOUNT - }; - boost::posix_time::ptime start_time; - - void gen_data(vector &row, stream::taxi_fare *fare) override { - int64_t st = this->parse_datetime_to_seconds(row[PICKUP_DATETIME], start_time); - int64_t et = st + 1; - int32_t medallion = stoi(row[MEDALLION]); - int32_t hack_license = stoi(row[HACK_LICENSE]); - string vendor_id = row[VENDOR_ID]; - string payment_type = row[PAYMENT_TYPE]; - float fare_amount = this->stof_err_handle(row[FARE_AMOUNT]); - float surcharge = this->stof_err_handle(row[SURCHARGE]); - float mta_tax = this->stof_err_handle(row[MTA_TAX]); - float tip_amount = this->stof_err_handle(row[TIP_AMOUNT]); - float tolls_amount = this->stof_err_handle(row[TOLLS_AMOUNT]); - float total_amount = this->stof_err_handle(row[TOTAL_AMOUNT]); - - fare->set_st(st); - fare->set_et(et); - fare->mutable_payload()->set_medallion(medallion); - fare->mutable_payload()->set_hack_license(hack_license); - fare->mutable_payload()->set_vendor_id(vendor_id); - fare->mutable_payload()->set_payment_type(payment_type); - fare->mutable_payload()->set_fare_amount(fare_amount); - fare->mutable_payload()->set_surcharge(surcharge); - fare->mutable_payload()->set_mta_tax(mta_tax); - fare->mutable_payload()->set_tip_amount(tip_amount); - fare->mutable_payload()->set_tolls_amount(tolls_amount); - fare->mutable_payload()->set_total_amount(total_amount); + void gen_data(vector &row, stream::stream_event *event, int flag) override { + if (flag == 0) { + gen_taxi_trip(row, event); + } else { + gen_taxi_fare(row, event); + } } - -public: - taxi_fare_data_parser(string &dataset_dir, int64_t size) : - taxi_data_parser(dataset_dir, "trip_fare_", size), - start_time(boost::gregorian::date(1970, 1, 1)) - {} - ~taxi_fare_data_parser(){} }; #endif // DATASET_UTIL_TAXI_DATA_PARSER_H_ \ No newline at end of file diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h index 863372e..a4fb1f9 100644 --- a/dataset_util/include/vibration_data_parser.h +++ b/dataset_util/include/vibration_data_parser.h @@ -8,22 +8,12 @@ #include #include -#include - #include using namespace std; using namespace boost::filesystem; -ostream& operator<< (ostream& out, stream::vibration const& vibration) -{ - out << "vibration[" << vibration.st() << ", " << vibration.et() << "]: "; - out << "channel_1: " << vibration.payload().channel_1() << ", "; - out << "channel_2: " << vibration.payload().channel_2(); - return out; -} - -class vibration_data_parser : public data_parser +class vibration_data_parser : public data_parser { private: enum VIBRATION_DATA_INDEX { @@ -48,22 +38,21 @@ class vibration_data_parser : public data_parser }; public: - vibration_data_parser(string &dataset_dir, int64_t size) : - data_parser(size), + vibration_data_parser(string &dataset_dir) : dataset_dir(dataset_dir) {} ~vibration_data_parser(){} - void gen_data(vector &row, stream::vibration *vibration) override { + void gen_data(vector &row, stream::stream_event *event, int flag) override { int64_t st = stoi(row[TIMESTAMP]); int64_t et = st + 1; float channel_1 = this->stof_err_handle(row[CHANNEL_1]); float channel_2 = this->stof_err_handle(row[CHANNEL_2]); - vibration->set_st(st); - vibration->set_et(et); - vibration->mutable_payload()->set_channel_1(channel_1); - vibration->mutable_payload()->set_channel_2(channel_2); + event->set_st(st); + event->set_et(et); + event->mutable_vibration()->set_channel_1(channel_1); + event->mutable_vibration()->set_channel_2(channel_2); } bool parse() override { @@ -92,10 +81,8 @@ class vibration_data_parser : public data_parser cerr << "Parsing " << data_file << endl; std::fstream data_csv_file(data_file.string()); - if(!this->parse_csv_file(data_csv_file)) { - break; - } - + this->parse_csv_file(data_csv_file); + data_csv_file.close(); } } diff --git a/dataset_util/parser/parser.cpp b/dataset_util/parser/parser.cpp index a80e032..0c9cc1b 100644 --- a/dataset_util/parser/parser.cpp +++ b/dataset_util/parser/parser.cpp @@ -8,27 +8,22 @@ using namespace std; int main(int argc, char* argv[]) { GOOGLE_PROTOBUF_VERIFY_VERSION; - if (argc < 4 || argc % 3 != 1) { - cerr << "Usage: [ ]" << endl; + if (argc != 3) { + cerr << "Usage: " << endl; + return 1; } - for (int i = 0; i < argc / 3; i++) { - string dataset_dir = argv[1 + i * 3]; - string dataset_name = argv[2 + i * 3]; - int64_t size = stol(argv[3 + i * 3]); + string dataset_name = argv[1]; + string dataset_dir = argv[2]; - if (dataset_name == "taxi_trip") { - taxi_trip_data_parser parser(dataset_dir, size); - parser.parse(); - } else if (dataset_name == "taxi_fare") { - taxi_fare_data_parser parser(dataset_dir, size); - parser.parse(); - } else if (dataset_name == "vibration") { - vibration_data_parser parser(dataset_dir, size); - parser.parse(); - } else { - throw runtime_error("Unknown dataset name."); - } + if (dataset_name.find("taxi") != string::npos) { + taxi_data_parser parser(dataset_name, dataset_dir); + parser.parse(); + } else if (dataset_name == "vibration") { + vibration_data_parser parser(dataset_dir); + parser.parse(); + } else { + throw runtime_error("Unknown dataset name."); } google::protobuf::ShutdownProtobufLibrary(); diff --git a/dataset_util/protos/stream_event.proto b/dataset_util/protos/stream_event.proto new file mode 100644 index 0000000..e454d9d --- /dev/null +++ b/dataset_util/protos/stream_event.proto @@ -0,0 +1,20 @@ + +syntax = "proto3"; +package stream; + +import "taxi.proto"; +import "vibration.proto"; + +option java_multiple_files = true; +option java_package = "com.stream.generic.protos"; +option java_outer_classname = "StreamEventProtos"; + +message stream_event { + int64 st = 1; + int64 et = 2; + oneof payload { + taxi_trip taxi_trip = 3; + taxi_fare taxi_fare = 4; + vibration vibration = 5; + } +} \ No newline at end of file diff --git a/dataset_util/protos/taxi.proto b/dataset_util/protos/taxi.proto index 69a9501..356c8e7 100644 --- a/dataset_util/protos/taxi.proto +++ b/dataset_util/protos/taxi.proto @@ -6,12 +6,6 @@ option java_package = "com.stream.taxi.protos"; option java_outer_classname = "TaxiProtos"; message taxi_trip { - int64 st = 1; - int64 et = 2; - taxi_trip_payload payload = 3; -} - -message taxi_trip_payload { int32 medallion = 1; int32 hack_license = 2; string vendor_id = 3; @@ -27,12 +21,6 @@ message taxi_trip_payload { } message taxi_fare { - int64 st = 1; - int64 et = 2; - taxi_fare_payload payload = 3; -} - -message taxi_fare_payload { int32 medallion = 1; int32 hack_license = 2; string vendor_id = 3; diff --git a/dataset_util/protos/vibration.proto b/dataset_util/protos/vibration.proto index d137773..74bcfa4 100644 --- a/dataset_util/protos/vibration.proto +++ b/dataset_util/protos/vibration.proto @@ -6,12 +6,6 @@ option java_package = "com.stream.vibration.protos"; option java_outer_classname = "VibrationProtos"; message vibration { - int64 st = 1; - int64 et = 2; - vibration_payload payload = 3; -} - -message vibration_payload { float channel_1 = 1; float channel_2 = 2; } \ No newline at end of file diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index b4bc030..10fb0c0 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include "tilt_select.h" #include "tilt_where.h" @@ -42,19 +42,21 @@ int main(int argc, char** argv) } } - data_loader loader; - while (true) { - stream::taxi_fare fare; - loader.load_data(fare); - cout << fare << endl; - } - string testcase = (argc > 1) ? argv[1] : "select"; int64_t size = (argc > 2) ? atoi(argv[2]) : 100000000; int64_t period = 1; double time = 0; + data_loader loader; + for (int i = 0; i < size; i++) { + stream::stream_event event; + if (!loader.load_data(event)) { + break; + } + cout << event << endl; + } + if (testcase == "select") { SelectBench bench(period, size); time = bench.run(); diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs index 8c6d7b6..7d93a8d 100644 --- a/trill_bench/bench/BenchUtil.cs +++ b/trill_bench/bench/BenchUtil.cs @@ -2,6 +2,8 @@ using System.Diagnostics; using System.Reactive.Linq; using Microsoft.StreamProcessing; +using Google.Protobuf; +using Stream; namespace bench { @@ -36,15 +38,34 @@ public static double RunTest( var stream = data1(); var stream2 = data2(); - // stream - // .ToStreamEventObservable() - // .Where(e => e.IsData) - // .ForEach(e => Console.WriteLine(e)); + var sw = new Stopwatch(); + sw.Start(); + var s_obs = transform(stream,stream2); + + s_obs + .ToStreamEventObservable() + .Wait(); + sw.Stop(); + return sw.Elapsed.TotalSeconds; + } + + public static double RunTest( + Func, IStreamable>> data, + Func, IStreamable, IStreamable> transform) + { + var result = data(); + var stream = result.Item1; + var stream2 = result.Item2; + + stream + .ToStreamEventObservable() + .Where(e => e.IsData) + .ForEach(e => Console.WriteLine(e)); - // stream2 - // .ToStreamEventObservable() - // .Where(e => e.IsData) - // .ForEach(e => Console.WriteLine(e)); + stream2 + .ToStreamEventObservable() + .Where(e => e.IsData) + .ForEach(e => Console.WriteLine(e)); var sw = new Stopwatch(); sw.Start(); @@ -64,18 +85,30 @@ public static Func> DataFn(long p, long s) .Cache(); } - public static Func> TaxiFareDataFn(long s) + public static Func, IStreamable>> TaxiDataFn(long s) { - return () => new TaxiFareData(s) - .ToStreamable() - .Cache(); - } + return () => { + var taxi_ride_data = new TaxiRideData(); + var taxi_fare_data = new TaxiFareData(); - public static Func> TaxiRideDataFn(long s) - { - return () => new TaxiRideData(s) - .ToStreamable() - .Cache(); + MessageParser parser = new MessageParser(() => new stream_event()); + for (int i = 0; i < s; i++) + { + stream_event s_event = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + if (s_event.PayloadCase == stream_event.PayloadOneofCase.TaxiTrip) { + taxi_ride_data.LoadDataPoint(s_event); + } else if (s_event.PayloadCase == stream_event.PayloadOneofCase.TaxiFare) { + taxi_fare_data.LoadDataPoint(s_event); + } else { + Debug.Assert(false); + } + } + + return Tuple.Create( + (IStreamable) taxi_ride_data.ToStreamable().Cache(), + (IStreamable) taxi_fare_data.ToStreamable().Cache() + ); + }; } public static Func> VibrationDataFn(long s) @@ -85,4 +118,4 @@ public static Func> VibrationDataFn(long s) .Cache(); } } -} +} \ No newline at end of file diff --git a/trill_bench/bench/Dataset.cs b/trill_bench/bench/DatasetData.cs similarity index 84% rename from trill_bench/bench/Dataset.cs rename to trill_bench/bench/DatasetData.cs index 49147cc..10118c2 100644 --- a/trill_bench/bench/Dataset.cs +++ b/trill_bench/bench/DatasetData.cs @@ -68,20 +68,20 @@ public void Dispose() public class VibrationObs : DatasetObs { - private MessageParser parser; + private MessageParser parser; public override void LoadDataPoint() { - vibration vibration = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - long st = vibration.St; - long et = vibration.Et; - float payload = vibration.Payload.Channel1; + stream_event s_event = parser.ParseDelimitedFrom(Console.OpenStandardInput()); + long st = s_event.St; + long et = s_event.Et; + float payload = s_event.Vibration.Channel1; data.Add(StreamEvent.CreateInterval(st, et, payload)); } public VibrationObs(long size) : base(size) { - this.parser = new MessageParser(() => new vibration()); + this.parser = new MessageParser(() => new stream_event()); LoadData(); } } diff --git a/trill_bench/bench/Program.cs b/trill_bench/bench/Program.cs index 8a964b7..e3f8bdb 100644 --- a/trill_bench/bench/Program.cs +++ b/trill_bench/bench/Program.cs @@ -114,8 +114,7 @@ static void Main(string[] args) ); break; case "taxi": - time = BenchUtil.RunTest(BenchUtil.TaxiRideDataFn(size), - BenchUtil.TaxiFareDataFn(size), + time = BenchUtil.RunTest(BenchUtil.TaxiDataFn(size), (stream, stream2) => stream .Taxi(stream2, 300) diff --git a/trill_bench/bench/TaxiData.cs b/trill_bench/bench/TaxiData.cs index 50fe7db..c53316f 100644 --- a/trill_bench/bench/TaxiData.cs +++ b/trill_bench/bench/TaxiData.cs @@ -1,4 +1,5 @@ using System; +using System.Diagnostics; using System.Collections.Generic; using System.Threading; using Microsoft.StreamProcessing; @@ -167,19 +168,16 @@ public override string ToString() { public abstract class TaxiDataObs : IObservable { - public long size; public List data; public DateTime datetime_base; - public TaxiDataObs(long size) + public TaxiDataObs() { - this.size = size; this.data = new List(); this.datetime_base = new DateTime(1970, 1, 1, 0, 0, 0); - LoadData(); } - public abstract void LoadData(); + public abstract void LoadDataPoint(stream_event s_event); public IDisposable Subscribe(IObserver observer) { @@ -219,63 +217,54 @@ public void Dispose() public class TaxiFareData : TaxiDataObs> { - public TaxiFareData(long size) : base(size) + public TaxiFareData() : base() {} - public override void LoadData() + public override void LoadDataPoint(stream_event s_event) { - MessageParser parser = new MessageParser(() => new taxi_fare()); - for (int i = 0; i < size; i++) - { - taxi_fare fare = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - long st = fare.St; - var payload = new TaxiFare( - fare.Payload.Medallion, - fare.Payload.HackLicense, - fare.Payload.VendorId, - this.datetime_base.AddSeconds(st), - fare.Payload.PaymentType, - fare.Payload.FareAmount, - fare.Payload.Surcharge, - fare.Payload.MtaTax, - fare.Payload.TipAmount, - fare.Payload.TollsAmount, - fare.Payload.TotalAmount - ); - data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); - } + long st = s_event.St; + var payload = new TaxiFare( + s_event.TaxiFare.Medallion, + s_event.TaxiFare.HackLicense, + s_event.TaxiFare.VendorId, + this.datetime_base.AddSeconds(st), + s_event.TaxiFare.PaymentType, + s_event.TaxiFare.FareAmount, + s_event.TaxiFare.Surcharge, + s_event.TaxiFare.MtaTax, + s_event.TaxiFare.TipAmount, + s_event.TaxiFare.TollsAmount, + s_event.TaxiFare.TotalAmount + ); + data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); } } public class TaxiRideData : TaxiDataObs> { - public TaxiRideData(long size) : base(size) + public TaxiRideData() : base() {} - public override void LoadData() + public override void LoadDataPoint(stream_event s_event) { - MessageParser parser = new MessageParser(() => new taxi_trip()); - for (int i = 0; i < size; i++) - { - taxi_trip trip = parser.ParseDelimitedFrom(Console.OpenStandardInput()); - long st = trip.St; - long et = trip.Et; - var payload = new TaxiRide( - trip.Payload.Medallion, - trip.Payload.HackLicense, - trip.Payload.VendorId, - trip.Payload.RateCode, - trip.Payload.StoreAndFwdFlag, - this.datetime_base.AddSeconds(st), - this.datetime_base.AddSeconds(et), - trip.Payload.PassengerCount, - trip.Payload.TripTimeInSecs, - trip.Payload.TripDistance, - trip.Payload.PickupLongitude, - trip.Payload.PickupLatitude, - trip.Payload.DropoffLongitude, - trip.Payload.DropoffLatitude - ); - data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); - } + long st = s_event.St; + long et = s_event.Et; + Debug.Assert(s_event.PayloadCase == stream_event.PayloadOneofCase.TaxiTrip); + var payload = new TaxiRide( + s_event.TaxiTrip.Medallion, + s_event.TaxiTrip.HackLicense, + s_event.TaxiTrip.VendorId, + s_event.TaxiTrip.RateCode, + s_event.TaxiTrip.StoreAndFwdFlag, + this.datetime_base.AddSeconds(st), + this.datetime_base.AddSeconds(et), + s_event.TaxiTrip.PassengerCount, + s_event.TaxiTrip.TripTimeInSecs, + s_event.TaxiTrip.TripDistance, + s_event.TaxiTrip.PickupLongitude, + s_event.TaxiTrip.PickupLatitude, + s_event.TaxiTrip.DropoffLongitude, + s_event.TaxiTrip.DropoffLatitude + ); + data.Add(StreamEvent.CreateInterval(st, st + 1, payload)); } } } \ No newline at end of file From 061d2afad5e07ad8e41f0ec782daeaaa29589aa1 Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Thu, 17 Mar 2022 23:32:24 -0400 Subject: [PATCH 22/23] Remove debugging code --- tilt_bench/main.cpp | 15 --------------- trill_bench/bench/BenchUtil.cs | 15 --------------- 2 files changed, 30 deletions(-) diff --git a/tilt_bench/main.cpp b/tilt_bench/main.cpp index 10fb0c0..fabfafa 100644 --- a/tilt_bench/main.cpp +++ b/tilt_bench/main.cpp @@ -2,9 +2,6 @@ #include #include -#include -#include - #include "tilt_select.h" #include "tilt_where.h" #include "tilt_aggregate.h" @@ -25,8 +22,6 @@ using namespace std; int main(int argc, char** argv) { - GOOGLE_PROTOBUF_VERIFY_VERSION; - const rlim_t kStackSize = 2 * 1024 * 1024 * 1024; // min stack size = 2 GB struct rlimit rl; int result; @@ -48,15 +43,6 @@ int main(int argc, char** argv) double time = 0; - data_loader loader; - for (int i = 0; i < size; i++) { - stream::stream_event event; - if (!loader.load_data(event)) { - break; - } - cout << event << endl; - } - if (testcase == "select") { SelectBench bench(period, size); time = bench.run(); @@ -127,6 +113,5 @@ int main(int argc, char** argv) cout << "Testcase: " << testcase <<", Size: " << size << ", Time: " << setprecision(3) << time / 1000000 << endl; - google::protobuf::ShutdownProtobufLibrary(); return 0; } diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs index 7d93a8d..6c830f2 100644 --- a/trill_bench/bench/BenchUtil.cs +++ b/trill_bench/bench/BenchUtil.cs @@ -14,11 +14,6 @@ public static double RunTest(Func e.IsData) - .ForEach(e => Console.WriteLine(e)); - var sw = new Stopwatch(); sw.Start(); var s_obs = transform(stream); @@ -57,16 +52,6 @@ public static double RunTest( var stream = result.Item1; var stream2 = result.Item2; - stream - .ToStreamEventObservable() - .Where(e => e.IsData) - .ForEach(e => Console.WriteLine(e)); - - stream2 - .ToStreamEventObservable() - .Where(e => e.IsData) - .ForEach(e => Console.WriteLine(e)); - var sw = new Stopwatch(); sw.Start(); var s_obs = transform(stream,stream2); From 9537d61592762120dbd2bd35c5e5675ebadc938e Mon Sep 17 00:00:00 2001 From: WeiZhao <740286700@qq.com> Date: Tue, 3 May 2022 23:22:27 -0400 Subject: [PATCH 23/23] Add partition key to stream_event --- dataset_util/include/csv_parser.h | 65 ++++++++++++ dataset_util/include/data_parser.h | 56 +---------- dataset_util/include/data_printer.h | 1 + dataset_util/include/taxi_data_parser.h | 100 ++++++++++++------- dataset_util/include/util.h | 21 ++++ dataset_util/include/vibration_data_parser.h | 30 +++--- dataset_util/protos/stream_event.proto | 7 +- trill_bench/bench/BenchUtil.cs | 9 +- trill_bench/bench/DatasetData.cs | 4 +- 9 files changed, 183 insertions(+), 110 deletions(-) create mode 100644 dataset_util/include/csv_parser.h create mode 100644 dataset_util/include/util.h diff --git a/dataset_util/include/csv_parser.h b/dataset_util/include/csv_parser.h new file mode 100644 index 0000000..7e7c4ad --- /dev/null +++ b/dataset_util/include/csv_parser.h @@ -0,0 +1,65 @@ +#ifndef DATASET_UTIL_CSV_PARSER_H_ +#define DATASET_UTIL_CSV_PARSER_H_ + +#include + +#include + +using namespace std; + +class csv_parser : public data_parser +{ +protected: + typedef vector csv_row; + virtual void decode(csv_row&, stream::stream_event*) = 0; + + bool parse_csv_line(fstream &file, csv_row &row) + { + string line; + + if (getline(file, line)) { + string word; + stringstream ss(line); + + while (getline(ss, word, ',')) { + row.push_back(word); + } + + return true; + } + return false; + } + + void parse_csv_file(string file_name) + { + cerr << "Begin parsing " << file_name << endl; + + std::fstream csv_file(file_name); + string line; + getline(csv_file, line); + long line_cnt = 0; + + csv_row row; + while (true) { + if (!parse_csv_line(csv_file, row)) { + break; + } + + stream::stream_event data; + decode(row, &data); + if (!write_serialized_to_ostream(data)) { + break; + } + row.clear(); + line_cnt++; + } + + csv_file.close(); + cerr << "Parsing finished. Number of data points: " << line_cnt << endl; + } + +public: + csv_parser(){} +}; + +#endif // DATASET_UTIL_CSV_PARSER_H_ diff --git a/dataset_util/include/data_parser.h b/dataset_util/include/data_parser.h index 4980f02..4d473f0 100644 --- a/dataset_util/include/data_parser.h +++ b/dataset_util/include/data_parser.h @@ -1,10 +1,6 @@ #ifndef DATASET_UTIL_DATA_PARSER_H_ #define DATASET_UTIL_DATA_PARSER_H_ -#include - -#include - #include #include @@ -15,44 +11,9 @@ class data_parser { protected: virtual bool parse() = 0; - virtual void gen_data(vector&, stream::stream_event*, int flag = 0) = 0; - - bool parse_csv_line(fstream &file, vector &row) { - string line; - - if (getline(file, line)) { - string word; - stringstream ss(line); - - while (getline(ss, word, ',')) { - row.push_back(word); - } - - return true; - } - return false; - } - - void parse_csv_file(fstream &file, int flag = 0) { - string line; - getline(file, line); - - vector row; - while (true) { - if (!parse_csv_line(file, row)) { - break; - } - - stream::stream_event data; - gen_data(row, &data, flag); - if (!write_serialized_to_ostream(data)) { - break; - } - row.clear(); - } - } - bool write_serialized_to_ostream(stream::stream_event &t) { + bool write_serialized_to_ostream(stream::stream_event &t) + { if (!google::protobuf::util::SerializeDelimitedToOstream(t, &cout)) { cerr << "Fail to serialize data into output stream" << endl; return false; @@ -60,19 +21,6 @@ class data_parser return true; } - float stof_err_handle(string &str) - { - try { return stof(str); } - catch (std::exception& e) { return 0.0f; } - } - - int64_t parse_datetime_to_seconds(string &datetime, boost::posix_time::ptime &start_time) - { - auto time = boost::posix_time::time_from_string(datetime); - auto diff = time - start_time; - return diff.total_seconds(); - } - public: data_parser(){} }; diff --git a/dataset_util/include/data_printer.h b/dataset_util/include/data_printer.h index 955e9c0..c2f5c6c 100644 --- a/dataset_util/include/data_printer.h +++ b/dataset_util/include/data_printer.h @@ -51,6 +51,7 @@ ostream& operator<< (ostream& out, stream::taxi_fare const& fare) ostream& operator<< (ostream& out, stream::stream_event const& event) { + out << "(Partition: " << event.part_key() << ") "; out << "Event: [" << event.st() << ", " << event.et() << "]: "; if (event.has_taxi_trip()) { out << event.taxi_trip(); diff --git a/dataset_util/include/taxi_data_parser.h b/dataset_util/include/taxi_data_parser.h index 46d1951..8c55fc8 100644 --- a/dataset_util/include/taxi_data_parser.h +++ b/dataset_util/include/taxi_data_parser.h @@ -1,18 +1,19 @@ #ifndef DATASET_UTIL_TAXI_DATA_PARSER_H_ #define DATASET_UTIL_TAXI_DATA_PARSER_H_ -#include -#include +#include #include #include -#include +#include + +#include using namespace std; using namespace boost::filesystem; -class taxi_data_parser : public data_parser +class taxi_data_parser : public csv_parser { private: enum TAXI_FARE_DATA_INDEX { @@ -44,15 +45,33 @@ class taxi_data_parser : public data_parser TAXI_TRIP_DROPOFF_LONGITUDE, TAXI_TRIP_DROPOFF_LATITUDE }; -protected: + + const vector foil_folders = { + "FOIL2010", "FOIL2011", + "FOIL2012", "FOIL2013" + }; vector file_name_prefixes; string &dataset_dir; boost::posix_time::ptime start_time; - const vector foil_folders = {"FOIL2010", "FOIL2011", "FOIL2012", "FOIL2013"}; - void gen_taxi_trip(vector &row, stream::stream_event *event) { - int64_t st = this->parse_datetime_to_seconds(row[TAXI_TRIP_PICKUP_DATETIME], start_time); - int64_t et = this->parse_datetime_to_seconds(row[TAXI_TRIP_DROPOFF_DATETIME], start_time); + int taxi_trip_part_key; + int taxi_fare_part_key; + int part_key; + + void decode(csv_row &row, stream::stream_event *event) override + { + if (part_key == taxi_trip_part_key) { + decode_taxi_trip(row, event); + } else if (part_key == taxi_fare_part_key) { + decode_taxi_fare(row, event); + } else { + assert(false && "Partition key is unrecgonized"); + } + } + + void decode_taxi_trip(csv_row &row, stream::stream_event *event) { + int64_t st = parse_datetime_to_seconds(row[TAXI_TRIP_PICKUP_DATETIME], start_time); + int64_t et = parse_datetime_to_seconds(row[TAXI_TRIP_DROPOFF_DATETIME], start_time); int32_t medallion = stoi(row[TAXI_TRIP_MEDALLION]); int32_t hack_license = stoi(row[TAXI_TRIP_HACK_LICENSE]); string vendor_id = row[TAXI_TRIP_VENDOR_ID]; @@ -60,14 +79,15 @@ class taxi_data_parser : public data_parser bool store_and_fwd_flag = false; int32_t passenger_count = stoi(row[TAXI_TRIP_PASSENGER_COUNT]); int32_t trip_time_in_secs = stoi(row[TAXI_TRIP_TRIP_TIME_IN_SECS]); - float trip_distance = this->stof_err_handle(row[TAXI_TRIP_TRIP_DISTANCE]); - float pickup_longitude = this->stof_err_handle(row[TAXI_TRIP_PICKUP_LONGITUDE]); - float pickup_latitude = this->stof_err_handle(row[TAXI_TRIP_PICKUP_LATITUDE]); - float dropoff_longitude = this->stof_err_handle(row[TAXI_TRIP_DROPOFF_LONGITUDE]); - float dropoff_latitude = this->stof_err_handle(row[TAXI_TRIP_DROPOFF_LATITUDE]); + float trip_distance = stof_err_handle(row[TAXI_TRIP_TRIP_DISTANCE]); + float pickup_longitude = stof_err_handle(row[TAXI_TRIP_PICKUP_LONGITUDE]); + float pickup_latitude = stof_err_handle(row[TAXI_TRIP_PICKUP_LATITUDE]); + float dropoff_longitude = stof_err_handle(row[TAXI_TRIP_DROPOFF_LONGITUDE]); + float dropoff_latitude = stof_err_handle(row[TAXI_TRIP_DROPOFF_LATITUDE]); event->set_st(st); event->set_et(et); + event->set_part_key(part_key); event->mutable_taxi_trip()->set_medallion(medallion); event->mutable_taxi_trip()->set_hack_license(hack_license); event->mutable_taxi_trip()->set_vendor_id(vendor_id); @@ -82,22 +102,23 @@ class taxi_data_parser : public data_parser event->mutable_taxi_trip()->set_dropoff_latitude(dropoff_latitude); } - void gen_taxi_fare(vector &row, stream::stream_event *event) { - int64_t st = this->parse_datetime_to_seconds(row[TAXI_FARE_PICKUP_DATETIME], start_time); + void decode_taxi_fare(csv_row &row, stream::stream_event *event) { + int64_t st = parse_datetime_to_seconds(row[TAXI_FARE_PICKUP_DATETIME], start_time); int64_t et = st + 1; int32_t medallion = stoi(row[TAXI_FARE_MEDALLION]); int32_t hack_license = stoi(row[TAXI_FARE_HACK_LICENSE]); string vendor_id = row[TAXI_FARE_VENDOR_ID]; string payment_type = row[TAXI_FARE_PAYMENT_TYPE]; - float fare_amount = this->stof_err_handle(row[TAXI_FARE_FARE_AMOUNT]); - float surcharge = this->stof_err_handle(row[TAXI_FARE_SURCHARGE]); - float mta_tax = this->stof_err_handle(row[TAXI_FARE_MTA_TAX]); - float tip_amount = this->stof_err_handle(row[TAXI_FARE_TIP_AMOUNT]); - float tolls_amount = this->stof_err_handle(row[TAXI_FARE_TOLLS_AMOUNT]); - float total_amount = this->stof_err_handle(row[TAXI_FARE_TOTAL_AMOUNT]); + float fare_amount = stof_err_handle(row[TAXI_FARE_FARE_AMOUNT]); + float surcharge = stof_err_handle(row[TAXI_FARE_SURCHARGE]); + float mta_tax = stof_err_handle(row[TAXI_FARE_MTA_TAX]); + float tip_amount = stof_err_handle(row[TAXI_FARE_TIP_AMOUNT]); + float tolls_amount = stof_err_handle(row[TAXI_FARE_TOLLS_AMOUNT]); + float total_amount = stof_err_handle(row[TAXI_FARE_TOTAL_AMOUNT]); event->set_st(st); event->set_et(et); + event->set_part_key(part_key); event->mutable_taxi_fare()->set_medallion(medallion); event->mutable_taxi_fare()->set_hack_license(hack_license); event->mutable_taxi_fare()->set_vendor_id(vendor_id); @@ -111,10 +132,20 @@ class taxi_data_parser : public data_parser } public: - taxi_data_parser(string &dataset_name, string &dataset_dir) : + taxi_data_parser( + string &dataset_name, + string &dataset_dir, + int taxi_trip_part_key = 0, + int taxi_fare_part_key = 1 + ) : dataset_dir(dataset_dir), - start_time(boost::gregorian::date(1970, 1, 1)) + start_time(boost::gregorian::date(1970, 1, 1)), + taxi_trip_part_key(taxi_trip_part_key), + taxi_fare_part_key(taxi_fare_part_key) { + assert(taxi_trip_part_key != taxi_fare_part_key && + "Partition key for trip and fare data cannot be the same"); + if (dataset_name == "taxi_trip") { file_name_prefixes.push_back("trip_data_"); } else if (dataset_name == "taxi_fare") { @@ -150,11 +181,16 @@ class taxi_data_parser : public data_parser } else { continue; } - cerr << "Parsing " << trip_data_file << endl; - std::fstream trip_csv_file(trip_data_file.string()); - this->parse_csv_file(trip_csv_file, file_name_prefix == "trip_data_" ? 0 : 1); - trip_csv_file.close(); + if (file_name_prefix == "trip_data_") { + part_key = taxi_trip_part_key; + } else if (file_name_prefix == "trip_fare_") { + part_key = taxi_fare_part_key; + } else { + assert(false); + } + + this->parse_csv_file(trip_data_file.string()); } i++; if (!file_exists) { @@ -165,14 +201,6 @@ class taxi_data_parser : public data_parser return true; } - - void gen_data(vector &row, stream::stream_event *event, int flag) override { - if (flag == 0) { - gen_taxi_trip(row, event); - } else { - gen_taxi_fare(row, event); - } - } }; #endif // DATASET_UTIL_TAXI_DATA_PARSER_H_ \ No newline at end of file diff --git a/dataset_util/include/util.h b/dataset_util/include/util.h new file mode 100644 index 0000000..81de5ec --- /dev/null +++ b/dataset_util/include/util.h @@ -0,0 +1,21 @@ +#ifndef DATASET_UTIL_UTIL_H_ +#define DATASET_UTIL_UTIL_H_ + +#include + +using namespace std; + +float stof_err_handle(string &str) +{ + try { return stof(str); } + catch (std::exception& e) { return 0.0f; } +} + +int64_t parse_datetime_to_seconds(string &datetime, boost::posix_time::ptime &start_time) +{ + auto time = boost::posix_time::time_from_string(datetime); + auto diff = time - start_time; + return diff.total_seconds(); +} + +#endif // DATASET_UTIL_UTIL_H_ \ No newline at end of file diff --git a/dataset_util/include/vibration_data_parser.h b/dataset_util/include/vibration_data_parser.h index a4fb1f9..ba31ea3 100644 --- a/dataset_util/include/vibration_data_parser.h +++ b/dataset_util/include/vibration_data_parser.h @@ -1,19 +1,19 @@ #ifndef DATASET_UTIL_VIBRATION_DATA_PARSER_H_ #define DATASET_UTIL_VIBRATION_DATA_PARSER_H_ -#include -#include #include #include #include -#include +#include + +#include using namespace std; using namespace boost::filesystem; -class vibration_data_parser : public data_parser +class vibration_data_parser : public csv_parser { private: enum VIBRATION_DATA_INDEX { @@ -21,7 +21,7 @@ class vibration_data_parser : public data_parser CHANNEL_1, CHANNEL_2 }; -protected: + string &dataset_dir; const map folder_prefix_map = { {"1 Data collected from a healthy bearing", 'H'}, @@ -37,20 +37,24 @@ class vibration_data_parser : public data_parser "-D-1.csv", "-D-2.csv", "-D-3.csv" }; + int part_key; + public: - vibration_data_parser(string &dataset_dir) : - dataset_dir(dataset_dir) + vibration_data_parser(string &dataset_dir, int part_key = 0) : + dataset_dir(dataset_dir), + part_key(part_key) {} ~vibration_data_parser(){} - void gen_data(vector &row, stream::stream_event *event, int flag) override { + void decode(csv_row &row, stream::stream_event *event) override { int64_t st = stoi(row[TIMESTAMP]); int64_t et = st + 1; - float channel_1 = this->stof_err_handle(row[CHANNEL_1]); - float channel_2 = this->stof_err_handle(row[CHANNEL_2]); + float channel_1 = stof_err_handle(row[CHANNEL_1]); + float channel_2 = stof_err_handle(row[CHANNEL_2]); event->set_st(st); event->set_et(et); + event->set_part_key(part_key); event->mutable_vibration()->set_channel_1(channel_1); event->mutable_vibration()->set_channel_2(channel_2); } @@ -79,11 +83,7 @@ class vibration_data_parser : public data_parser continue; } - cerr << "Parsing " << data_file << endl; - std::fstream data_csv_file(data_file.string()); - this->parse_csv_file(data_csv_file); - - data_csv_file.close(); + this->parse_csv_file(data_file.string()); } } diff --git a/dataset_util/protos/stream_event.proto b/dataset_util/protos/stream_event.proto index e454d9d..e3273d8 100644 --- a/dataset_util/protos/stream_event.proto +++ b/dataset_util/protos/stream_event.proto @@ -12,9 +12,10 @@ option java_outer_classname = "StreamEventProtos"; message stream_event { int64 st = 1; int64 et = 2; + int32 part_key = 3; oneof payload { - taxi_trip taxi_trip = 3; - taxi_fare taxi_fare = 4; - vibration vibration = 5; + taxi_trip taxi_trip = 4; + taxi_fare taxi_fare = 5; + vibration vibration = 6; } } \ No newline at end of file diff --git a/trill_bench/bench/BenchUtil.cs b/trill_bench/bench/BenchUtil.cs index 6c830f2..b9d3cba 100644 --- a/trill_bench/bench/BenchUtil.cs +++ b/trill_bench/bench/BenchUtil.cs @@ -75,19 +75,26 @@ public static Func, IStreamable { var taxi_ride_data = new TaxiRideData(); var taxi_fare_data = new TaxiFareData(); + long cnt_trips = 0; + long cnt_fares = 0; + + Console.WriteLine("Start loading taxi trips and taxi fares ..."); MessageParser parser = new MessageParser(() => new stream_event()); - for (int i = 0; i < s; i++) + for (long i = 0; i < s; i++) { stream_event s_event = parser.ParseDelimitedFrom(Console.OpenStandardInput()); if (s_event.PayloadCase == stream_event.PayloadOneofCase.TaxiTrip) { taxi_ride_data.LoadDataPoint(s_event); + cnt_trips++; } else if (s_event.PayloadCase == stream_event.PayloadOneofCase.TaxiFare) { taxi_fare_data.LoadDataPoint(s_event); + cnt_fares++; } else { Debug.Assert(false); } } + Console.WriteLine("Finished loading {0} taxi trips and {1} taxi fares", cnt_trips, cnt_fares); return Tuple.Create( (IStreamable) taxi_ride_data.ToStreamable().Cache(), diff --git a/trill_bench/bench/DatasetData.cs b/trill_bench/bench/DatasetData.cs index 10118c2..fb86f90 100644 --- a/trill_bench/bench/DatasetData.cs +++ b/trill_bench/bench/DatasetData.cs @@ -24,7 +24,7 @@ public DatasetObs(long size) public void LoadData() { - for (int i = 0; i < size; i++) + for (long i = 0; i < size; i++) { this.LoadDataPoint(); } @@ -81,8 +81,10 @@ public override void LoadDataPoint() public VibrationObs(long size) : base(size) { + Console.WriteLine("Start loading vibration data points ..."); this.parser = new MessageParser(() => new stream_event()); LoadData(); + Console.WriteLine("Finished loading {0} vibration data points", size); } } } \ No newline at end of file