From f4b1d56149ba7e9c18a220a24df401c92e0a35eb Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Fri, 28 Feb 2025 13:23:20 +0100 Subject: [PATCH 01/16] Add dynamic reconfigure in main for serial and write --- Sinks/CMakeLists.txt | 2 +- Sinks/CSVSink/CSVSink.cpp | 248 ++++++++++-------- Sinks/main.cpp | 77 ++++-- .../EnvironmentManager.cpp | 162 ++++++++++++ .../EnvironmentManager.h | 105 ++++++++ xdbc/RuntimeEnv.h | 28 +- 6 files changed, 487 insertions(+), 135 deletions(-) create mode 100644 xdbc/EnvironmentReconfigure/EnvironmentManager.cpp create mode 100644 xdbc/EnvironmentReconfigure/EnvironmentManager.h diff --git a/Sinks/CMakeLists.txt b/Sinks/CMakeLists.txt index c38e07f..6620687 100644 --- a/Sinks/CMakeLists.txt +++ b/Sinks/CMakeLists.txt @@ -29,7 +29,7 @@ endif() # link_directories(${Boost_LIBRARY_DIRS}) #endif() -add_executable(xdbcsinks main.cpp CSVSink/CSVSink.cpp PQSink/PQSink.cpp ../xdbc/ControllerInterface/WebSocketClient.cpp) +add_executable(xdbcsinks main.cpp CSVSink/CSVSink.cpp PQSink/PQSink.cpp ../xdbc/ControllerInterface/WebSocketClient.cpp ../xdbc/EnvironmentReconfigure/EnvironmentManager.cpp) # Link dependencies set(USED_LIBS ${Boost_SYSTEM_LIBRARY} ${Boost_THREAD_LIBRARY} ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_REGEX_LIBRARY}) diff --git a/Sinks/CSVSink/CSVSink.cpp b/Sinks/CSVSink/CSVSink.cpp index fb39316..99bd9ac 100644 --- a/Sinks/CSVSink/CSVSink.cpp +++ b/Sinks/CSVSink/CSVSink.cpp @@ -7,44 +7,47 @@ #include #include #include -#include // For serialization/deserialization +#include // For serialization/deserialization #include #include "deserializers_parquet.h" CsvSink::CsvSink(std::string baseFilename, xdbc::RuntimeEnv *runtimeEnv) - : baseFilename(std::move(baseFilename)), runtimeEnv(runtimeEnv) { + : baseFilename(std::move(baseFilename)), runtimeEnv(runtimeEnv) +{ bufferPool = runtimeEnv->bp; auto console = spdlog::stdout_color_mt("XDBC.CSVSINK"); - } -void CsvSink::serialize(int thr) { +void CsvSink::serialize(int thr) +{ runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "start"}); size_t writtenBuffers = 0; size_t writtenTuples = 0; - spdlog::get("XDBC.CSVSINK")->info("CSV Serializer started thread {}", thr); + if (runtimeEnv->skip_serializer) + { - if (runtimeEnv->skip_serializer) { - - while (true) { + while (true) + { int bufferId = runtimeEnv->decompressedBufferIds->pop(); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - if (bufferId == -1) break; + if (bufferId == -1) + break; runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); runtimeEnv->serializedBufferIds->push(bufferId); writtenBuffers++; } - - } else { + } + else + { const auto &schema = runtimeEnv->schema; size_t schemaSize = schema.size(); @@ -58,24 +61,32 @@ void CsvSink::serialize(int thr) { std::vector parquetSerializers(schemaSize); size_t maxTupleSize = 0; - for (size_t i = 0; i < schemaSize; ++i) { + for (size_t i = 0; i < schemaSize; ++i) + { - if (schema[i].tpe[0] == 'I') { + if (schema[i].tpe[0] == 'I') + { sizes[i] = 4; // sizeof(int) serializers[i] = SerializeAttribute; parquetSerializers[i] = SerializeParquetAttribute; maxTupleSize += 12; // Pessimistic size for integer serialization - } else if (schema[i].tpe[0] == 'D') { + } + else if (schema[i].tpe[0] == 'D') + { sizes[i] = 8; // sizeof(double) serializers[i] = SerializeAttribute; parquetSerializers[i] = SerializeParquetAttribute; maxTupleSize += 24; // Pessimistic size for double serialization - } else if (schema[i].tpe[0] == 'C') { + } + else if (schema[i].tpe[0] == 'C') + { sizes[i] = 1; // sizeof(char) serializers[i] = SerializeAttribute; parquetSerializers[i] = SerializeParquetAttribute; maxTupleSize += 2; // Single character + delimiter - } else if (schema[i].tpe[0] == 'S') { + } + else if (schema[i].tpe[0] == 'S') + { sizes[i] = schema[i].size; serializers[i] = SerializeAttribute; parquetSerializers[i] = SerializeParquetAttribute; @@ -85,47 +96,46 @@ void CsvSink::serialize(int thr) { delimiters[i] = (i == schemaSize - 1) ? '\n' : ','; // Newline for the last attribute, commas for others } - - //TODO: only for format 1 + // TODO: only for format 1 std::vector columnOffsets(schemaSize); size_t totalRowSize = 0; - for (size_t j = 0; j < schemaSize; ++j) { + for (size_t j = 0; j < schemaSize; ++j) + { columnOffsets[j] = totalRowSize; totalRowSize += sizes[j]; } - //TODO: only for format 3 (arrow) + // TODO: only for format 3 (arrow) std::vector> dataExtractors(schemaSize); size_t bufferSizeInBytes = runtimeEnv->buffer_size * 1024; - while (true) { + while (true) + { int bufferId = runtimeEnv->decompressedBufferIds->pop(); - if (bufferId == -1) break; + if (bufferId == -1) + break; runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); const auto &inBufferPtr = (*bufferPool)[bufferId]; auto header = *reinterpret_cast(inBufferPtr.data()); if (header.totalTuples > runtimeEnv->tuples_per_buffer || header.totalSize > runtimeEnv->buffer_size * 1024) - spdlog::get("XDBC.CSVSINK")->error("Size of buffer larger than expected tuples:{}/{}, size {}/{}", - header.totalTuples, runtimeEnv->tuples_per_buffer, header.totalSize, - runtimeEnv->buffer_size * 1024); - + spdlog::get("XDBC.CSVSINK")->error("Size of buffer larger than expected tuples:{}/{}, size {}/{}", header.totalTuples, runtimeEnv->tuples_per_buffer, header.totalSize, runtimeEnv->buffer_size * 1024); const char *basePtr = reinterpret_cast(inBufferPtr.data() + sizeof(xdbc::Header)); - if (header.intermediateFormat == 1 || header.intermediateFormat == 2 || header.intermediateFormat == 3) { - //spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); - + if (header.intermediateFormat == 1 || header.intermediateFormat == 2 || header.intermediateFormat == 3) + { + // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); std::vector> arrays; // To store Arrow arrays for `iformat == 3` - if (runtimeEnv->iformat == 3) { + if (runtimeEnv->iformat == 3) + { // Deserialize Arrow RecordBatch from raw memory const auto *bufferData = reinterpret_cast(inBufferPtr.data() + sizeof(xdbc::Header)); @@ -143,38 +153,48 @@ void CsvSink::serialize(int thr) { arrays = recordBatch->columns(); // Precompute accessors for Arrow arrays - for (size_t j = 0; j < schemaSize; ++j) { - switch (arrays[j]->type_id()) { - case arrow::Type::INT32: { - auto intArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [intArray](int i) { - return reinterpret_cast(intArray->raw_values() + i); - }; - break; - } - case arrow::Type::DOUBLE: { - auto doubleArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [doubleArray](int i) { - return reinterpret_cast(doubleArray->raw_values() + i); - }; - break; - } - case arrow::Type::STRING: { - auto stringArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [stringArray](int i) { - return stringArray->GetString(i).c_str(); - }; - break; - } - case arrow::Type::FIXED_SIZE_BINARY: { - auto fixedArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [fixedArray](int i) { - return reinterpret_cast(fixedArray->GetValue(i)); - }; - break; - } - default: - throw std::runtime_error("Unsupported Arrow array type for serialization."); + for (size_t j = 0; j < schemaSize; ++j) + { + switch (arrays[j]->type_id()) + { + case arrow::Type::INT32: + { + auto intArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [intArray](int i) + { + return reinterpret_cast(intArray->raw_values() + i); + }; + break; + } + case arrow::Type::DOUBLE: + { + auto doubleArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [doubleArray](int i) + { + return reinterpret_cast(doubleArray->raw_values() + i); + }; + break; + } + case arrow::Type::STRING: + { + auto stringArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [stringArray](int i) + { + return stringArray->GetString(i).c_str(); + }; + break; + } + case arrow::Type::FIXED_SIZE_BINARY: + { + auto fixedArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [fixedArray](int i) + { + return reinterpret_cast(fixedArray->GetValue(i)); + }; + break; + } + default: + throw std::runtime_error("Unsupported Arrow array type for serialization."); } } } @@ -187,23 +207,25 @@ void CsvSink::serialize(int thr) { std::vector columnStartPointers(schemaSize); size_t cumulativeOffset = 0; - for (size_t k = 0; k < schemaSize; ++k) { + for (size_t k = 0; k < schemaSize; ++k) + { columnStartPointers[k] = basePtr + cumulativeOffset; - //TODO: check this, maybe write header.totalTuples instead of tuples_per_buffer + // TODO: check this, maybe write header.totalTuples instead of tuples_per_buffer cumulativeOffset += - runtimeEnv->tuples_per_buffer * sizes[k]; // Move by the total size of this column - + runtimeEnv->tuples_per_buffer * sizes[k]; // Move by the total size of this column } - for (size_t i = 0; i < header.totalTuples; ++i) { - if (totalSerializedBytes + maxTupleSize > bufferSizeInBytes) { + for (size_t i = 0; i < header.totalTuples; ++i) + { + if (totalSerializedBytes + maxTupleSize > bufferSizeInBytes) + { // Buffer is full, push it to the queue xdbc::Header head{}; head.totalSize = totalSerializedBytes; std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", - "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", + "push"}); runtimeEnv->serializedBufferIds->push(serializedBufferId); // Fetch a new buffer @@ -214,38 +236,45 @@ void CsvSink::serialize(int thr) { writtenBuffers++; } - - for (size_t j = 0; j < schemaSize; ++j) { + for (size_t j = 0; j < schemaSize; ++j) + { const char *dataPtr; - if (runtimeEnv->iformat == 1) { + if (runtimeEnv->iformat == 1) + { dataPtr = basePtr + i * totalRowSize + columnOffsets[j]; - } else if (runtimeEnv->iformat == 2) { + } + else if (runtimeEnv->iformat == 2) + { dataPtr = columnStartPointers[j] + i * sizes[j]; - } else if (runtimeEnv->iformat == 3) { + } + else if (runtimeEnv->iformat == 3) + { dataPtr = dataExtractors[j](i); } totalSerializedBytes += serializers[j]( - dataPtr, writePtr + totalSerializedBytes, sizes[j], delimiters[j]); + dataPtr, writePtr + totalSerializedBytes, sizes[j], delimiters[j]); } } writtenTuples += header.totalTuples; // Write any remaining data to the buffer - if (totalSerializedBytes > 0) { + if (totalSerializedBytes > 0) + { xdbc::Header head{}; head.totalSize = totalSerializedBytes; std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); runtimeEnv->serializedBufferIds->push(serializedBufferId); writtenBuffers++; } } - if (header.intermediateFormat == 4) { - //spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); + if (header.intermediateFormat == 4) + { + // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); auto writeBuff = runtimeEnv->freeBufferIds->pop(); char *writeBuffPtr = reinterpret_cast((*bufferPool)[writeBuff].data() + sizeof(xdbc::Header)); @@ -263,16 +292,19 @@ void CsvSink::serialize(int thr) { parquet::StreamReader stream{parquet::ParquetFileReader::Open(buffer_reader)}; size_t totalSerializedBytes = 0; - while (!stream.eof()) { + while (!stream.eof()) + { - for (size_t j = 0; j < schemaSize; ++j) { + for (size_t j = 0; j < schemaSize; ++j) + { totalSerializedBytes += parquetSerializers[j](stream, writeBuffPtr + totalSerializedBytes, sizes[j], delimiters[j]); } stream >> parquet::EndRow; numRows++; - if (totalSerializedBytes + 1000 > runtimeEnv->buffer_size * 1024) { + if (totalSerializedBytes + 1000 > runtimeEnv->buffer_size * 1024) + { xdbc::Header head{}; head.totalSize = totalSerializedBytes; head.totalTuples = numRows; @@ -280,8 +312,8 @@ void CsvSink::serialize(int thr) { std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", - "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", + "push"}); runtimeEnv->serializedBufferIds->push(writeBuff); @@ -293,8 +325,9 @@ void CsvSink::serialize(int thr) { } } - //write remaining - if (totalSerializedBytes > 0) { + // write remaining + if (totalSerializedBytes > 0) + { xdbc::Header head{}; head.totalSize = totalSerializedBytes; head.totalTuples = numRows; @@ -302,7 +335,6 @@ void CsvSink::serialize(int thr) { std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); runtimeEnv->serializedBufferIds->push(writeBuff); } - } // Release decompressed buffer back to freeBufferIds @@ -312,20 +344,20 @@ void CsvSink::serialize(int thr) { runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "end"}); - spdlog::get("XDBC.CSVSINK")->info("CSV Serializer stopping thread {}, written buffers: {}, tuples: {}", thr, - writtenBuffers, writtenTuples); - + spdlog::get("XDBC.CSVSINK")->info("CSV Serializer stopping thread {}, written buffers: {}, tuples: {}", thr, writtenBuffers, writtenTuples); runtimeEnv->finishedSerializerThreads.fetch_add(1); - if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) { - for (int i = 0; i < runtimeEnv->write_parallelism; ++i) { - runtimeEnv->serializedBufferIds->push(-1); // Termination signal + if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) + { + for (int i = 0; i < runtimeEnv->write_parallelism; ++i) + { + // runtimeEnv->serializedBufferIds->push(-1); // Termination signal } } } - -void CsvSink::write(int thr) { +void CsvSink::write(int thr) +{ runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "start"}); @@ -335,18 +367,21 @@ void CsvSink::write(int thr) { size_t buffersWritten = 0; outputFile.open(fileName, std::ios::out | std::ios::binary); - if (!outputFile.is_open()) { + if (!outputFile.is_open()) + { throw std::runtime_error("Failed to open output file: " + fileName); } - while (true) { + while (true) + { int bufferId = runtimeEnv->serializedBufferIds->pop(); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); - //spdlog::get("XDBC.CSVSINK")->info("CSV Writer {} got serialized buff {}", thr, bufferId); + // spdlog::get("XDBC.CSVSINK")->info("CSV Writer {} got serialized buff {}", thr, bufferId); - if (bufferId == -1) break; + if (bufferId == -1) + break; const auto &serializedBuffer = (*bufferPool)[bufferId]; auto header = *reinterpret_cast(serializedBuffer.data()); @@ -355,7 +390,7 @@ void CsvSink::write(int thr) { outputFile.write(dataPtr, header.totalSize); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); runtimeEnv->freeBufferIds->push(bufferId); buffersWritten++; @@ -365,5 +400,4 @@ void CsvSink::write(int thr) { runtimeEnv->finishedWriteThreads.fetch_add(1); runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "end"}); spdlog::get("XDBC.CSVSINK")->info("CSV Writer thread {} wrote buffers: {}", thr, buffersWritten); - } \ No newline at end of file diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 5a2b9c2..2d9dd13 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -8,6 +8,7 @@ #include #include #include "../xdbc/ControllerInterface/WebSocketClient.h" +#include "../xdbc/EnvironmentReconfigure/EnvironmentManager.h" #include "../xdbc/metrics_calculator.h" // Utility functions for schema handling @@ -252,6 +253,9 @@ int main(int argc, char *argv[]) } // *** Finished Setup websocket interface for controller *** + //*** Setting up EnvironmentReconfigure that handles threads during run-time + EnvironmentManager env_manager; + //*** // Initialize XClient xdbc::XClient xclient(env); xclient.startReceiving(env.table); @@ -260,16 +264,53 @@ int main(int argc, char *argv[]) { CsvSink csvSink(outputBasePath, &env); + env_manager.registerOperation("CSV_serial", [&](int thr) + { try { + if (thr >= env.max_threads) { + spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); + return; // Prevent out-of-bounds access + } + csvSink.serialize(thr); + } catch (const std::exception& e) { + spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); + } }, env.decompressedBufferIds); + // Start the reconfiguration manager + env_manager.start(); + env_manager.registerOperation("CSV_write", [&](int thr) + { try { + if (thr >= env.max_threads) { + spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); + return; // Prevent out-of-bounds access + } + csvSink.write(thr); + } catch (const std::exception& e) { + spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); + } }, env.serializedBufferIds); + + env_manager.configureThreads("CSV_serial", env.ser_parallelism); + env_manager.configureThreads("CSV_write", env.write_parallelism); + std::this_thread::sleep_for(std::chrono::milliseconds(6000)); + env.ser_parallelism = 2; + env_manager.configureThreads("CSV_serial", env.ser_parallelism); + // Wait for threads to finish + env_manager.joinThreads("CSV_serial"); + env_manager.configureThreads("CSV_write", 0); + env_manager.joinThreads("CSV_write"); + // Start serialization and writing threads - std::vector threads; - for (int i = 0; i < env.ser_parallelism; ++i) - { - xclient._serThreads[i] = std::thread(&CsvSink::serialize, &csvSink, i); - } - for (int i = 0; i < env.write_parallelism; ++i) - { - xclient._writeThreads[i] = std::thread(&CsvSink::write, &csvSink, i); - } + // std::vector threads; + // for (int i = 0; i < env.ser_parallelism; ++i) + // { + // xclient._serThreads[i] = std::thread(&CsvSink::serialize, &csvSink, i); + // } + // for (int i = 0; i < env.write_parallelism; ++i) + // { + // xclient._writeThreads[i] = std::thread(&CsvSink::write, &csvSink, i); + // } } else if (env.target == "parquet") { @@ -287,19 +328,19 @@ int main(int argc, char *argv[]) } } - // Wait for threads to finish - for (int i = 0; i < env.ser_parallelism; ++i) - { - xclient._serThreads[i].join(); - } - for (int i = 0; i < env.write_parallelism; ++i) - { - xclient._writeThreads[i].join(); - } + // for (int i = 0; i < env.ser_parallelism; ++i) + // { + // xclient._serThreads[i].join(); + // } + // for (int i = 0; i < env.write_parallelism; ++i) + // { + // xclient._writeThreads[i].join(); + // } xclient.finalize(); spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); + env_manager.stop(); // *** Stop Reconfigurration handler // *** Stop websocket client if (env.spawn_source == 1) { diff --git a/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp b/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp new file mode 100644 index 0000000..3cf9790 --- /dev/null +++ b/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp @@ -0,0 +1,162 @@ +// EnvironmentManager.cpp +#include "EnvironmentManager.h" + +EnvironmentManager::EnvironmentManager() : terminate_(false), config_update_(false), config_over_(false) {} + +EnvironmentManager::~EnvironmentManager() +{ + stop(); // Ensure all threads are stopped before destruction +} + +void EnvironmentManager::registerOperation(const std::string &name, Task task, std::shared_ptr> poisonQueue) +{ + std::unique_lock lock(mutex_); + operations_[name] = {task, poisonQueue, 0, 0}; + // cv_.notify_all(); // Notify that a new operation is registered +} + +void EnvironmentManager::configureThreads(const std::string &name, int new_thread_count) +{ + std::unique_lock lock(mutex_); + auto it = operations_.find(name); + if (it != operations_.end()) + { + it->second.desired_threads = new_thread_count; + config_update_ = true; + config_over_ = false; + cv_.notify_all(); + + // Wait until all requested threads are actually started + + cv_.wait(lock, [this] + { return config_over_.load(); }); + } +} + +void EnvironmentManager::start() +{ + reconfig_thread_ = std::thread(&EnvironmentManager::run, this); +} + +void EnvironmentManager::joinThreads(const std::string &name) +{ + std::unique_lock lock(mutex_); + auto it = operations_.find(name); + if (it != operations_.end()) + { + Operation &op = it->second; + + for (auto &thread : op.threads) + { + if (thread.joinable()) + { + thread.join(); // Wait for the thread to finish + } + else + { + spdlog::info("Thread with ID: {} is not joinable.", std::hash{}(thread.get_id())); + } + } + + op.threads.clear(); // Clear the threads after joining + op.active_threads = 0; // Reset the active thread count + op.desired_threads = 0; // Reset the desired thread count + } + else + { + spdlog::warn("Operation '{}' not found. No threads to join.", name); + } +} + +void EnvironmentManager::stop() +{ + { + std::unique_lock lock(mutex_); + terminate_ = true; + cv_.notify_all(); + } + + if (reconfig_thread_.joinable()) + { + reconfig_thread_.join(); + } + + // Join all threads before exiting + for (auto &op : operations_) + { + for (auto &t : op.second.threads) + { + if (t.joinable()) + { + t.join(); + } + } + } +} + +void EnvironmentManager::run() +{ + while (!terminate_) + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] + { return terminate_.load() || config_update_; }); + + config_update_ = false; + + for (auto &[name, operation] : operations_) + { + int delta_threads = operation.desired_threads - operation.active_threads; + + if (delta_threads > 0) + { + + for (int i = 0; i < delta_threads; ++i) + { + int thread_id = operation.active_threads + i; + + if (!operation.task) + { + spdlog::error("Task is null for operation {}", name); + continue; + } + + // Push a new thread instead of accessing via index + operation.threads.emplace_back([this, task = operation.task, thread_id, name] + { + try + { + task(thread_id); + } + catch (const std::exception &e) + { + spdlog::error("Exception in thread {}: {}", thread_id, e.what()); + } + catch (...) + { + spdlog::error("Unknown exception in thread {}", thread_id); + } }); + } + spdlog::info("Reconfigure thread for operation {0} by {1}", name, delta_threads); + } + else if (delta_threads < 0) + { + + for (int i = 0; i < -delta_threads; ++i) + { + if (!operation.poisonQueue) + { + spdlog::error("poisonQueue is null for operation {}", name); + continue; + } + operation.poisonQueue->push(-1); + } + spdlog::info("Reconfigure thread for operation {0} by {1}", name, delta_threads); + } + + operation.active_threads = operation.desired_threads; + } + config_over_ = true; + cv_.notify_all(); + } +} diff --git a/xdbc/EnvironmentReconfigure/EnvironmentManager.h b/xdbc/EnvironmentReconfigure/EnvironmentManager.h new file mode 100644 index 0000000..8368efc --- /dev/null +++ b/xdbc/EnvironmentReconfigure/EnvironmentManager.h @@ -0,0 +1,105 @@ +// EnvironmentManager.h +#ifndef ENVIRONMENTMANAGER_H +#define ENVIRONMENTMANAGER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../xclient.h" + +class EnvironmentManager +{ +public: + using Task = std::function; + + // Constructor + EnvironmentManager(); + + // Destructor + ~EnvironmentManager(); + + // Register an operation (e.g., write, decompress) + void registerOperation(const std::string &name, Task task, std::shared_ptr> poisonQueue); + + // Configure the number of threads for an operation + void configureThreads(const std::string &name, int new_thread_count); + + // Start the reconfiguration manager + void start(); + + // Stop the reconfiguration manager and all threads + void stop(); + + // Join all threads for a specific operation + void joinThreads(const std::string &name); + +private: + struct Operation + { + Task task; + std::shared_ptr> poisonQueue; + int active_threads = 0; + int desired_threads = 0; + std::vector threads; + }; + + void run(); // Main loop that handles thread creation and termination + + std::unordered_map operations_; + std::mutex mutex_; + std::condition_variable cv_; + std::atomic terminate_; + std::atomic config_update_; + std::atomic config_over_; + std::thread reconfig_thread_; +}; + +// Sample code for user: +/* + EnvironmentManager reconfig_manager; + +// Register operations with lambdas to bind arguments + +// Register ANALYTICS operation with specific arguments +reconfig_manager.registerOperation("ANALYTICS", + [&](int thr) { + int min = 0, max = 0; + long sum = 0, cnt = 0, totalcnt = 0; + analyticsThread(thr, min, max, sum, cnt, totalcnt); // Using thread index 'thr' and other arguments + }, + writeBufferIds); + +// Register STORAGE operation with specific arguments +reconfig_manager.registerOperation("STORAGE", + [&](int thr) { + std::string filename = "data_file"; // Dynamically generate filename based on thread ID + storageThread(thr, filename); // Using thread index 'thr' and filename + }, + decompressedBufferIds); + +// Start the reconfiguration manager +reconfig_manager.start(); + +// Configure threads dynamically +reconfig_manager.configureThreads("ANALYTICS", 5); // Start 5 threads for analytics +reconfig_manager.configureThreads("STORAGE", 3); // Start 3 threads for storage + +// Simulate reconfiguration at runtime +reconfig_manager.configureThreads("ANALYTICS", 2); // Reduce threads for ANALYTICS +reconfig_manager.configureThreads("STORAGE", 4); // Increase threads for STORAGE + +// Join threads for both operations +reconfig_manager.joinThreads("ANALYTICS"); +reconfig_manager.joinThreads("STORAGE"); + +// Stop the manager and all threads +reconfig_manager.stop(); +*/ + +#endif // EnvironmentManager_H diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index b4e4c6e..f97b002 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -10,17 +10,20 @@ #include #include -namespace xdbc { +namespace xdbc +{ constexpr size_t MAX_ATTRIBUTES = 230; - struct SchemaAttribute { + struct SchemaAttribute + { std::string name; std::string tpe; int size; }; - struct ProfilingTimestamps { + struct ProfilingTimestamps + { std::chrono::high_resolution_clock::time_point timestamp; int thread; std::string component; @@ -30,13 +33,15 @@ namespace xdbc { typedef std::shared_ptr> FBQ_ptr; typedef std::shared_ptr> PTQ_ptr; - struct transfer_details { + struct transfer_details + { float elapsed_time = 0.0f; // Default value for elapsed_time std::vector bufProcessed; // Default value: vector with one element, 0 std::tuple latest_queueSizes; }; - class RuntimeEnv { + class RuntimeEnv + { public: // Public members for configuration and state std::vector> *bp = nullptr; @@ -78,8 +83,10 @@ namespace xdbc { int spawn_source; transfer_details tf_paras; std::atomic enable_updation; + int max_threads = 16; - std::string toString() const { + std::string toString() const + { std::ostringstream oss; oss << "RuntimeEnv Configuration:\n"; @@ -127,9 +134,11 @@ namespace xdbc { RuntimeEnv() = default; // Utility to calculate tuple size based on schema - void calculateTupleSize() { + void calculateTupleSize() + { tuple_size = std::accumulate(schema.begin(), schema.end(), 0, - [](int acc, const SchemaAttribute &attr) { + [](int acc, const SchemaAttribute &attr) + { return acc + attr.size; }); tuples_per_buffer = (buffer_size * 1024 / tuple_size); @@ -139,7 +148,8 @@ namespace xdbc { typedef std::shared_ptr> FBQ_ptr; typedef std::shared_ptr> PTQ_ptr; - struct Header { + struct Header + { size_t compressionType; size_t totalSize; From 85f30d5fb1835c035d5e4f2ffc5d23423d974670 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Fri, 28 Feb 2025 15:12:59 +0100 Subject: [PATCH 02/16] Make env_manager part of runtime env --- CMakeLists.txt | 1 + Sinks/CMakeLists.txt | 2 +- Sinks/main.cpp | 93 +++-- .../EnvironmentManager.h | 3 +- xdbc/RuntimeEnv.h | 4 +- xdbc/xclient.cpp | 336 +++++++++--------- xdbc/xclient.h | 183 +++++----- 7 files changed, 315 insertions(+), 307 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b5b865..e1eecf1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ include(GNUInstallDirs) add_library(xdbc SHARED xdbc/xclient.cpp + xdbc/EnvironmentReconfigure/EnvironmentManager.cpp xdbc/EnvironmentReconfigure/EnvironmentManager.h xdbc/Decompression/Decompressor.cpp xdbc/Decompression/Decompressor.h) set_target_properties(xdbc PROPERTIES diff --git a/Sinks/CMakeLists.txt b/Sinks/CMakeLists.txt index 6620687..c38e07f 100644 --- a/Sinks/CMakeLists.txt +++ b/Sinks/CMakeLists.txt @@ -29,7 +29,7 @@ endif() # link_directories(${Boost_LIBRARY_DIRS}) #endif() -add_executable(xdbcsinks main.cpp CSVSink/CSVSink.cpp PQSink/PQSink.cpp ../xdbc/ControllerInterface/WebSocketClient.cpp ../xdbc/EnvironmentReconfigure/EnvironmentManager.cpp) +add_executable(xdbcsinks main.cpp CSVSink/CSVSink.cpp PQSink/PQSink.cpp ../xdbc/ControllerInterface/WebSocketClient.cpp) # Link dependencies set(USED_LIBS ${Boost_SYSTEM_LIBRARY} ${Boost_THREAD_LIBRARY} ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_REGEX_LIBRARY}) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 2d9dd13..32ff760 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -253,8 +253,6 @@ int main(int argc, char *argv[]) } // *** Finished Setup websocket interface for controller *** - //*** Setting up EnvironmentReconfigure that handles threads during run-time - EnvironmentManager env_manager; //*** // Initialize XClient xdbc::XClient xclient(env); @@ -264,22 +262,18 @@ int main(int argc, char *argv[]) { CsvSink csvSink(outputBasePath, &env); - env_manager.registerOperation("CSV_serial", [&](int thr) - { try { - if (thr >= env.max_threads) { - spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); - return; // Prevent out-of-bounds access - } + env.env_manager.registerOperation("serial", [&](int thr) + { try { + csvSink.serialize(thr); } catch (const std::exception& e) { spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); } catch (...) { spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.decompressedBufferIds); - // Start the reconfiguration manager - env_manager.start(); - env_manager.registerOperation("CSV_write", [&](int thr) - { try { + + env.env_manager.registerOperation("write", [&](int thr) + { try { if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); return; // Prevent out-of-bounds access @@ -291,56 +285,51 @@ int main(int argc, char *argv[]) spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.serializedBufferIds); - env_manager.configureThreads("CSV_serial", env.ser_parallelism); - env_manager.configureThreads("CSV_write", env.write_parallelism); - std::this_thread::sleep_for(std::chrono::milliseconds(6000)); - env.ser_parallelism = 2; - env_manager.configureThreads("CSV_serial", env.ser_parallelism); - // Wait for threads to finish - env_manager.joinThreads("CSV_serial"); - env_manager.configureThreads("CSV_write", 0); - env_manager.joinThreads("CSV_write"); - - // Start serialization and writing threads - // std::vector threads; - // for (int i = 0; i < env.ser_parallelism; ++i) - // { - // xclient._serThreads[i] = std::thread(&CsvSink::serialize, &csvSink, i); - // } - // for (int i = 0; i < env.write_parallelism; ++i) - // { - // xclient._writeThreads[i] = std::thread(&CsvSink::write, &csvSink, i); - // } + env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("write", env.write_parallelism); } else if (env.target == "parquet") { PQSink parquetSink(outputBasePath, &env); - // Start serialization and writing threads - std::vector threads; - for (int i = 0; i < env.ser_parallelism; ++i) - { - xclient._serThreads[i] = std::thread(&PQSink::serialize, &parquetSink, i); - } - for (int i = 0; i < env.write_parallelism; ++i) - { - xclient._writeThreads[i] = std::thread(&PQSink::write, &parquetSink, i); - } + env.env_manager.registerOperation("serial", [&](int thr) + { try { + + parquetSink.serialize(thr); + } catch (const std::exception& e) { + spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); + } }, env.decompressedBufferIds); + + env.env_manager.registerOperation("write", [&](int thr) + { try { + if (thr >= env.max_threads) { + spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); + return; // Prevent out-of-bounds access + } + parquetSink.write(thr); + } catch (const std::exception& e) { + spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); + } }, env.serializedBufferIds); + + env.env_manager.configureThreads("serial", env.ser_parallelism); // start serial component threads + env.env_manager.configureThreads("write", env.write_parallelism); // start write component threads } - // for (int i = 0; i < env.ser_parallelism; ++i) - // { - // xclient._serThreads[i].join(); - // } - // for (int i = 0; i < env.write_parallelism; ++i) - // { - // xclient._writeThreads[i].join(); - // } + std::this_thread::sleep_for(std::chrono::milliseconds(6000)); + env.ser_parallelism = 2; + env.env_manager.configureThreads("serial", env.ser_parallelism); + // Wait for threads to finish + env.env_manager.joinThreads("serial"); + env.env_manager.configureThreads("write", 0); + env.env_manager.joinThreads("write"); + xclient.finishReceiving(); xclient.finalize(); spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); - - env_manager.stop(); // *** Stop Reconfigurration handler // *** Stop websocket client if (env.spawn_source == 1) { diff --git a/xdbc/EnvironmentReconfigure/EnvironmentManager.h b/xdbc/EnvironmentReconfigure/EnvironmentManager.h index 8368efc..cc70038 100644 --- a/xdbc/EnvironmentReconfigure/EnvironmentManager.h +++ b/xdbc/EnvironmentReconfigure/EnvironmentManager.h @@ -11,7 +11,8 @@ #include #include #include -#include "../xclient.h" +#include "customQueue.h" +// #include "../xclient.h" class EnvironmentManager { diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index f97b002..8771b11 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -1,7 +1,7 @@ #ifndef XDBC_RUNTIMEENV_H #define XDBC_RUNTIMEENV_H -#include "customQueue.h" +// #include "customQueue.h" #include #include #include @@ -9,6 +9,7 @@ #include #include #include +#include "EnvironmentReconfigure/EnvironmentManager.h" namespace xdbc { @@ -84,6 +85,7 @@ namespace xdbc transfer_details tf_paras; std::atomic enable_updation; int max_threads = 16; + EnvironmentManager env_manager; std::string toString() const { diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 46723f2..8d7b34d 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -16,26 +16,27 @@ using namespace boost::asio; using ip::tcp; -namespace xdbc { - - - XClient::XClient(RuntimeEnv &env) : - _xdbcenv(&env), - _bufferPool(), - _totalBuffersRead(0), - _decompThreads(env.decomp_parallelism), - _rcvThreads(env.rcv_parallelism), - _serThreads(env.ser_parallelism), - _writeThreads(env.write_parallelism), - _readSockets(), - //_emptyDecompThreadCtr(env.write_parallelism), - _markedFreeCounter(0), - _emptyDecompThreadCtr(0), - _baseSocket(_ioContext) { +namespace xdbc +{ + + XClient::XClient(RuntimeEnv &env) : _xdbcenv(&env), + _bufferPool(), + _totalBuffersRead(0), + _decompThreads(env.decomp_parallelism), + _rcvThreads(env.rcv_parallelism), + _serThreads(env.ser_parallelism), + _writeThreads(env.write_parallelism), + _readSockets(), + //_emptyDecompThreadCtr(env.write_parallelism), + _markedFreeCounter(0), + _emptyDecompThreadCtr(0), + _baseSocket(_ioContext) + { auto console_logger = spdlog::get("XDBC.CLIENT"); - if (!console_logger) { + if (!console_logger) + { // Logger does not exist, create it console_logger = spdlog::stdout_color_mt("XDBC.CLIENT"); } @@ -43,35 +44,29 @@ namespace xdbc { PTQ_ptr pq(new customQueue); env.pts = pq; - - spdlog::get("XDBC.CLIENT")->info("Creating Client: {0}, BPS: {1}, BS: {2} KiB, TS: {3} bytes, iformat: {4} ", - _xdbcenv->env_name, env.buffers_in_bufferpool, env.buffer_size, env.tuple_size, - env.iformat); + spdlog::get("XDBC.CLIENT")->info("Creating Client: {0}, BPS: {1}, BS: {2} KiB, TS: {3} bytes, iformat: {4} ", _xdbcenv->env_name, env.buffers_in_bufferpool, env.buffer_size, env.tuple_size, env.iformat); // populate bufferpool with empty vectors (header + payload) _bufferPool.resize(env.buffers_in_bufferpool, std::vector(sizeof(Header) + env.tuples_per_buffer * env.tuple_size)); _xdbcenv->bp = &_bufferPool; - //calculate buffers per queue + // calculate buffers per queue int total_workers = _xdbcenv->rcv_parallelism + _xdbcenv->decomp_parallelism + _xdbcenv->ser_parallelism + _xdbcenv->write_parallelism; - //TODO: check and increase bufferpool size if necessary or exit + // TODO: check and increase bufferpool size if necessary or exit int available_buffers_for_queues = _xdbcenv->buffers_in_bufferpool - total_workers; if (_xdbcenv->buffers_in_bufferpool < total_workers || - available_buffers_for_queues < total_workers) { - - spdlog::get("XDBC.CLIENT")->error( - "Buffer allocation error: Total buffers: {0}. " - "\nRequired buffers: Total: {1}," - "\nAvailable for queues: {2}. " - "\nIncrease the buffer pool size to at least {1}.", - _xdbcenv->buffers_in_bufferpool, - total_workers, - available_buffers_for_queues); - + available_buffers_for_queues < total_workers) + { + + spdlog::get("XDBC.CLIENT")->error("Buffer allocation error: Total buffers: {0}. " + "\nRequired buffers: Total: {1}," + "\nAvailable for queues: {2}. " + "\nIncrease the buffer pool size to at least {1}.", + _xdbcenv->buffers_in_bufferpool, total_workers, available_buffers_for_queues); } int queueCapacityPerComp = available_buffers_for_queues / 4; @@ -94,37 +89,42 @@ namespace xdbc { _xdbcenv->finishedWriteThreads.store(0); // Initially populate the freeBufferIds (receive) queue with all buffer IDs - for (int i = 0; i < env.buffers_in_bufferpool; ++i) { + for (int i = 0; i < env.buffers_in_bufferpool; ++i) + { _xdbcenv->freeBufferIds->push(i); } - spdlog::get("XDBC.CLIENT")->info("Initialized queues, " "freeBuffersQ: {0}, " "compQ: {1}, " "decompQ: {1}, " "serQ: {2} ", env.buffers_in_bufferpool, queueCapacityPerComp, serQueueCapacity); - } - void XClient::finalize() { + void XClient::finishReceiving() + { - spdlog::get("XDBC.CLIENT")->info( - "Finalizing XClient: {0}, shutting down {1} receive threads & {2} decomp threads", - _xdbcenv->env_name, _xdbcenv->rcv_parallelism, _xdbcenv->decomp_parallelism); + spdlog::get("XDBC.CLIENT")->info("Finalizing XClient: {0}, shutting down {1} receive threads & {2} decomp threads", _xdbcenv->env_name, _xdbcenv->rcv_parallelism, _xdbcenv->decomp_parallelism); - _xdbcenv->monitor.store(false); - _monitorThread.join(); - - for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) { + for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) + { _decompThreads[i].join(); } - for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) { + for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) + { _rcvThreads[i].join(); } + _xdbcenv->env_manager.stop(); // *** Stop Reconfigurration handler + } + + void XClient::finalize() + { + _xdbcenv->monitor.store(false); + _monitorThread.join(); + _baseSocket.close(); spdlog::get("XDBC.CLIENT")->info("Finalizing: basesocket closed"); @@ -132,7 +132,6 @@ namespace xdbc { auto total_time = std::chrono::duration_cast(end - _xdbcenv->startTime).count(); spdlog::get("XDBC.CLIENT")->info("Total elapsed time: {0} ms", total_time); - auto pts = std::vector(_xdbcenv->pts->size()); while (_xdbcenv->pts->size() != 0) pts.push_back(_xdbcenv->pts->pop()); @@ -144,22 +143,20 @@ namespace xdbc { std::ostringstream totalThroughput; std::ostringstream perBufferThroughput; - for (const auto &[component, metrics]: component_metrics) { + for (const auto &[component, metrics] : component_metrics) + { - if (!component.empty()) { + if (!component.empty()) + { totalTimes << component << ":\t" << metrics.overall_time_ms << "ms, "; procTimes << component << ":\t" << metrics.processing_time_ms << "ms, "; waitingTimes << component << ":\t" << metrics.waiting_time_ms << "ms, "; totalThroughput << component << ":\t" << metrics.total_throughput << "mb/s, "; perBufferThroughput << component << ":\t" << metrics.per_buffer_throughput << "mb/s, "; } - } - spdlog::get("XDBC.CLIENT")->info( - "xdbc client | \n all:\t {} \n proc:\t{} \n wait:\t{} \n thr:\t {} \n thr/b:\t {}", - totalTimes.str(), procTimes.str(), waitingTimes.str(), totalThroughput.str(), - perBufferThroughput.str()); + spdlog::get("XDBC.CLIENT")->info("xdbc client | \n all:\t {} \n proc:\t{} \n wait:\t{} \n thr:\t {} \n thr/b:\t {}", totalTimes.str(), procTimes.str(), waitingTimes.str(), totalThroughput.str(), perBufferThroughput.str()); auto loads = printAndReturnAverageLoad(*_xdbcenv); @@ -205,56 +202,63 @@ namespace xdbc { << component_metrics["write"].per_buffer_throughput << "," << std::get<3>(loads) << "\n"; csv_file.close(); - } - - std::string XClient::get_name() const { + std::string XClient::get_name() const + { return _xdbcenv->env_name; } - std::string read_(tcp::socket &socket) { + std::string read_(tcp::socket &socket) + { boost::asio::streambuf buf; boost::system::error_code error; size_t bytes = boost::asio::read_until(socket, buf, "\n", error); - if (error) { + if (error) + { spdlog::get("XDBC.CLIENT")->warn("Boost error while reading: {0} ", error.message()); } std::string data = boost::asio::buffer_cast(buf.data()); return data; } - int XClient::startReceiving(const std::string &tableName) { + int XClient::startReceiving(const std::string &tableName) + { + // Start the reconfiguration manager + _xdbcenv->env_manager.start(); - //establish base connection with server + // establish base connection with server XClient::initialize(tableName); _xdbcenv->monitor.store(true); _monitorThread = std::thread(&XClient::monitorQueues, this, _xdbcenv->profilingInterval); - //create rcv threads - for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) { + // create rcv threads + for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) + { _rcvThreads[i] = std::thread(&XClient::receive, this, i); } - //create decomp threads - for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) { + // create decomp threads + for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) + { _decompThreads[i] = std::thread(&XClient::decompress, this, i); } spdlog::get("XDBC.CLIENT")->info("Initialized receiver & decomp threads"); - return 1; } - void XClient::monitorQueues(int interval_ms) { + void XClient::monitorQueues(int interval_ms) + { long long curTimeInterval = interval_ms / 1000; - while (_xdbcenv->monitor) { + while (_xdbcenv->monitor) + { auto now = std::chrono::high_resolution_clock::now(); auto timestamp = std::chrono::duration_cast(now.time_since_epoch()).count(); @@ -288,17 +292,17 @@ namespace xdbc { } } + void XClient::initialize(const std::string &tableName) + { - void XClient::initialize(const std::string &tableName) { - - //this is for IP address + // this is for IP address /*boost::asio::io_service io_service; //socket creation ip::tcp::socket socket(io_service); socket.connect(tcp::endpoint(boost::asio::ip::address::from_string("127.0.0.1"), 1234)); */ - //this is for hostname + // this is for hostname boost::asio::ip::tcp::resolver resolver(_ioContext); boost::asio::ip::tcp::resolver::query query(_xdbcenv->server_host, _xdbcenv->server_port); @@ -311,22 +315,22 @@ namespace xdbc { _baseSocket.connect(endpoint, ec); int tries = 0; - while (ec && tries < 3) { + while (ec && tries < 3) + { spdlog::get("XDBC.CLIENT")->warn("Basesocket not connecting, trying to reconnect..."); tries++; _baseSocket.close(); std::this_thread::sleep_for(_xdbcenv->sleep_time * 10); _baseSocket.connect(endpoint, ec); - } - if (ec) { + if (ec) + { spdlog::get("XDBC.CLIENT")->error("Failed to connect after retries: {0}", ec.message()); - throw boost::system::system_error(ec); // Explicitly throw if connection fails + throw boost::system::system_error(ec); // Explicitly throw if connection fails } - spdlog::get("XDBC.CLIENT")->info("Basesocket: connected to {0}:{1}", - endpoint.address().to_string(), endpoint.port()); + spdlog::get("XDBC.CLIENT")->info("Basesocket: connected to {0}:{1}", endpoint.address().to_string(), endpoint.port()); boost::system::error_code error; const std::string &msg = tableName; @@ -338,7 +342,6 @@ namespace xdbc { boost::asio::write(_baseSocket, tableNameBuffers, error); - std::uint32_t data_size = _xdbcenv->schemaJSON.size(); std::vector buffers; buffers.emplace_back(boost::asio::buffer(&data_size, sizeof(data_size))); @@ -346,19 +349,18 @@ namespace xdbc { boost::asio::write(_baseSocket, buffers, error); - //std::this_thread::sleep_for(_xdbcenv->sleep_time*10); + // std::this_thread::sleep_for(_xdbcenv->sleep_time*10); std::string ready = read_(_baseSocket); - //TODO: make a check that server is actually ready and try again until ready - //ready.erase(std::remove(ready.begin(), ready.end(), '\n'), ready.cend()); + // TODO: make a check that server is actually ready and try again until ready + // ready.erase(std::remove(ready.begin(), ready.end(), '\n'), ready.cend()); spdlog::get("XDBC.CLIENT")->info("Basesocket: Server signaled: {0}", ready); - //return socket; - + // return socket; } - - void XClient::receive(int thr) { + void XClient::receive(int thr) + { _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "rcv", "start"}); spdlog::get("XDBC.CLIENT")->info("Entered receive thread {0} ", thr); boost::asio::io_service io_service; @@ -371,25 +373,30 @@ namespace xdbc { bool connected = false; - try { + try + { socket.connect(endpoint); connected = true; - spdlog::get("XDBC.CLIENT")->info("Receive thread {0} connected to {1}:{2}", - thr, endpoint.address().to_string(), endpoint.port()); - - } catch (const boost::system::system_error &error) { + spdlog::get("XDBC.CLIENT")->info("Receive thread {0} connected to {1}:{2}", thr, endpoint.address().to_string(), endpoint.port()); + } + catch (const boost::system::system_error &error) + { spdlog::get("XDBC.CLIENT")->warn("Server error: {0}", error.what()); - //std::this_thread::sleep_for(_xdbcenv->sleep_time); + // std::this_thread::sleep_for(_xdbcenv->sleep_time); } - if (connected) { + if (connected) + { const std::string msg = std::to_string(thr) + "\n"; boost::system::error_code error; - try { + try + { size_t b = boost::asio::write(socket, boost::asio::buffer(msg), error); - } catch (const boost::system::system_error &e) { + } + catch (const boost::system::system_error &e) + { spdlog::get("XDBC.CLIENT")->warn("Could not write thread no, error: {0}", e.what()); } @@ -401,50 +408,46 @@ namespace xdbc { size_t headerBytes; size_t readBytes; - while (error != boost::asio::error::eof) { + while (error != boost::asio::error::eof) + { bpi = _xdbcenv->freeBufferIds->pop(); _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "rcv", "pop"}); - //spdlog::get("XDBC.CLIENT")->info("Receive thread {0} got buff {1}", thr, bpi); + // spdlog::get("XDBC.CLIENT")->info("Receive thread {0} got buff {1}", thr, bpi); // getting response from server, first the header headerBytes = boost::asio::read(socket, boost::asio::buffer(_bufferPool[bpi].data(), sizeof(Header)), boost::asio::transfer_exactly(sizeof(Header)), error); Header header = *reinterpret_cast
(_bufferPool[bpi].data()); - //TODO: handle error types (e.g., EOF) + // TODO: handle error types (e.g., EOF) if (error || header.compressionType > 6 || - header.totalSize > _xdbcenv->tuples_per_buffer * _xdbcenv->tuple_size) { - - if (error) { - spdlog::get("XDBC.CLIENT")->error("Receive thread {0}: boost error while reading header: {1}", - thr, - error.message()); - if (error == boost::asio::error::eof) { + header.totalSize > _xdbcenv->tuples_per_buffer * _xdbcenv->tuple_size) + { + + if (error) + { + spdlog::get("XDBC.CLIENT")->error("Receive thread {0}: boost error while reading header: {1}", thr, error.message()); + if (error == boost::asio::error::eof) + { spdlog::get("XDBC.CLIENT")->error("EOF"); } break; } - spdlog::get("XDBC.CLIENT")->error( - "Client: corrupt body: comp: {0}, size: {1}/{2}, headerbytes: {3}", - header.compressionType, header.totalSize, - _xdbcenv->tuples_per_buffer * _xdbcenv->tuple_size, headerBytes); - + spdlog::get("XDBC.CLIENT")->error("Client: corrupt body: comp: {0}, size: {1}/{2}, headerbytes: {3}", header.compressionType, header.totalSize, _xdbcenv->tuples_per_buffer * _xdbcenv->tuple_size, headerBytes); } // all good, read incoming body and measure time - readBytes = boost::asio::read(socket, boost::asio::buffer(_bufferPool[bpi].data() + sizeof(Header), - header.totalSize), + readBytes = boost::asio::read(socket, boost::asio::buffer(_bufferPool[bpi].data() + sizeof(Header), header.totalSize), boost::asio::transfer_exactly(header.totalSize), error); - //TODO: handle errors correctly - if (error) { - spdlog::get("XDBC.CLIENT")->error( - "Client: boost error while reading body: readBytes {0}, error: {1}", - readBytes, error.message()); - if (error == boost::asio::error::eof) { - + // TODO: handle errors correctly + if (error) + { + spdlog::get("XDBC.CLIENT")->error("Client: boost error while reading body: readBytes {0}, error: {1}", readBytes, error.message()); + if (error == boost::asio::error::eof) + { } break; } @@ -454,30 +457,33 @@ namespace xdbc { _xdbcenv->compressedBufferIds->push(bpi); buffers++; - } _xdbcenv->finishedRcvThreads.fetch_add(1); - if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) { + if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) + { for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) _xdbcenv->compressedBufferIds->push(-1); } socket.close(); spdlog::get("XDBC.CLIENT")->info("Receive thread {0} finished, #buffers: {1}", thr, buffers); - } else + } + else spdlog::get("XDBC.CLIENT")->error("Receive thread {0} could not connect", thr); _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "rcv", "end"}); } - void XClient::decompress(int thr) { + void XClient::decompress(int thr) + { _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "start"}); int decompError; int buffersDecompressed = 0; - while (true) { + while (true) + { int compBuffId = _xdbcenv->compressedBufferIds->pop(); @@ -487,46 +493,49 @@ namespace xdbc { Header *header = reinterpret_cast
(_bufferPool[compBuffId].data()); std::byte *compressed_buffer = _bufferPool[compBuffId].data() + sizeof(Header); - //spdlog::get("XDBC.CLIENT")->info("decompress thread total tuples {}", header->totalTuples); + // spdlog::get("XDBC.CLIENT")->info("decompress thread total tuples {}", header->totalTuples); - //just forward buffer if not compressed - if (header->compressionType == 0) { + // just forward buffer if not compressed + if (header->compressionType == 0) + { _xdbcenv->pts->push( - ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "push"}); + ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "push"}); _xdbcenv->decompressedBufferIds->push(compBuffId); + } + else if (header->compressionType > 0) + { - - } else if (header->compressionType > 0) { - - //we need a free buffer to decompress + // we need a free buffer to decompress int decompBuffId = _xdbcenv->freeBufferIds->pop(); - //spdlog::get("XDBC.CLIENT")->info("Decompressor thr {} got free buff {}", thr, decompBuffId); + // spdlog::get("XDBC.CLIENT")->info("Decompressor thr {} got free buff {}", thr, decompBuffId); auto &decompressed_buffer = _bufferPool[decompBuffId]; - //TODO: refactor decompress_cols with schema in Decompressor - if (header->compressionType == 6) { - - //TODO: decompress every column individually + // TODO: refactor decompress_cols with schema in Decompressor + if (header->compressionType == 6) + { - } else + // TODO: decompress every column individually + } + else decompError = Decompressor::decompress(header->compressionType, decompressed_buffer.data() + sizeof(Header), compressed_buffer, header->totalSize, _xdbcenv->tuples_per_buffer * _xdbcenv->tuple_size); - if (decompError == 1) { + if (decompError == 1) + { - //TODO: check error handling - spdlog::get("XDBC.CLIENT")->warn("decompress error: header: comp: {0}, size: {1}", - header->compressionType, header->totalSize); + // TODO: check error handling + spdlog::get("XDBC.CLIENT")->warn("decompress error: header: comp: {0}, size: {1}", header->compressionType, header->totalSize); - //since there was an error return both buffers + // since there was an error return both buffers _xdbcenv->freeBufferIds->push(compBuffId); _xdbcenv->freeBufferIds->push(decompBuffId); - - } else { + } + else + { Header newHeader{}; newHeader.totalTuples = header->totalTuples; @@ -534,10 +543,10 @@ namespace xdbc { newHeader.intermediateFormat = header->intermediateFormat; memcpy(decompressed_buffer.data(), &newHeader, sizeof(Header)); - //spdlog::get("XDBC.CLIENT")->warn("read totalTuples: {}", header->totalTuples); + // spdlog::get("XDBC.CLIENT")->warn("read totalTuples: {}", header->totalTuples); _xdbcenv->pts->push( - ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "push"}); + ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "push"}); _xdbcenv->decompressedBufferIds->push(decompBuffId); _xdbcenv->freeBufferIds->push(compBuffId); @@ -545,11 +554,11 @@ namespace xdbc { } buffersDecompressed++; - } _xdbcenv->finishedDecompThreads.fetch_add(1); - if (_xdbcenv->finishedDecompThreads == _xdbcenv->decomp_parallelism) { + if (_xdbcenv->finishedDecompThreads == _xdbcenv->decomp_parallelism) + { for (int i = 0; i < _xdbcenv->ser_parallelism; i++) _xdbcenv->decompressedBufferIds->push(-1); } @@ -557,20 +566,22 @@ namespace xdbc { _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "end"}); } - //TODO: handle parallelism internally - bool XClient::hasNext(int readThreadId) { + // TODO: handle parallelism internally + bool XClient::hasNext(int readThreadId) + { if (_emptyDecompThreadCtr == _xdbcenv->write_parallelism) return false; return true; } - //TODO: handle parallelism internally - buffWithId XClient::getBuffer(int readThreadId) { + // TODO: handle parallelism internally + buffWithId XClient::getBuffer(int readThreadId) + { int buffId = _xdbcenv->decompressedBufferIds->pop(); _xdbcenv->pts->push( - ProfilingTimestamps{std::chrono::high_resolution_clock::now(), readThreadId, "write", "pop"}); + ProfilingTimestamps{std::chrono::high_resolution_clock::now(), readThreadId, "write", "pop"}); buffWithId curBuf{}; if (buffId == -1) @@ -579,7 +590,8 @@ namespace xdbc { size_t totalTuples = 0; size_t totalSize = 0; - if (buffId > -1) { + if (buffId > -1) + { auto header = reinterpret_cast
(_bufferPool[buffId].data()); totalTuples = header->totalTuples; totalSize = header->totalSize; @@ -589,21 +601,23 @@ namespace xdbc { curBuf.totalTuples = totalTuples; curBuf.totalSize = totalSize; - //TODO: set intermediate format dynamically + // TODO: set intermediate format dynamically curBuf.iformat = _xdbcenv->iformat; - //spdlog::get("XDBC.CLIENT")->warn("Sending buffer {0} to read thread {1}", buffId, readThreadId); + // spdlog::get("XDBC.CLIENT")->warn("Sending buffer {0} to read thread {1}", buffId, readThreadId); return curBuf; } - int XClient::getBufferPoolSize() const { + int XClient::getBufferPoolSize() const + { return _xdbcenv->buffers_in_bufferpool; } - void XClient::markBufferAsRead(int buffId) { - //TODO: ensure equal distribution - //spdlog::get("XDBC.CLIENT")->warn("freeing {0} for {1}", buffId, _markedFreeCounter % _xdbcenv->rcv_parallelism); + void XClient::markBufferAsRead(int buffId) + { + // TODO: ensure equal distribution + // spdlog::get("XDBC.CLIENT")->warn("freeing {0} for {1}", buffId, _markedFreeCounter % _xdbcenv->rcv_parallelism); _xdbcenv->freeBufferIds->push(buffId); _markedFreeCounter.fetch_add(1); } diff --git a/xdbc/xclient.h b/xdbc/xclient.h index 2b3343d..ef73d0e 100644 --- a/xdbc/xclient.h +++ b/xdbc/xclient.h @@ -16,10 +16,11 @@ using namespace boost::asio; using ip::tcp; -namespace xdbc { +namespace xdbc +{ - - struct buffWithId { + struct buffWithId + { int id; int iformat; size_t totalTuples; @@ -27,9 +28,9 @@ namespace xdbc { std::byte *buff; }; - class XClient { + class XClient + { private: - RuntimeEnv *_xdbcenv; std::vector> _bufferPool; std::vector> _consumedAll; @@ -68,76 +69,75 @@ namespace xdbc { [[nodiscard]] std::string get_name() const; -/* - * Purpose: Retrieves the name of the XClient environment. - * Input: None. - * Output: Returns the environment name as a string. - * Process: Fetches and returns the `env_name` from the `_xdbcenv` object. - */ + /* + * Purpose: Retrieves the name of the XClient environment. + * Input: None. + * Output: Returns the environment name as a string. + * Process: Fetches and returns the `env_name` from the `_xdbcenv` object. + */ void receive(int threadno); -/* - * Purpose: Handles receiving data from the server on a specific thread. - * Input: The thread index (`thr`) to uniquely identify the thread. - * Output: None. - * Process: - * - Establishes a socket connection to the server (specific to the thread index). - * - Sends the thread index to the server to indicate which thread is handling the data. - * - Reads header and body data from the server, handling potential errors. - * - Processes the data (e.g., checks headers for correctness) and pushes it to the decompression queue. - * - Tracks and logs profiling timestamps throughout the process. - * - Closes the socket when finished and logs the number of buffers processed. - */ + /* + * Purpose: Handles receiving data from the server on a specific thread. + * Input: The thread index (`thr`) to uniquely identify the thread. + * Output: None. + * Process: + * - Establishes a socket connection to the server (specific to the thread index). + * - Sends the thread index to the server to indicate which thread is handling the data. + * - Reads header and body data from the server, handling potential errors. + * - Processes the data (e.g., checks headers for correctness) and pushes it to the decompression queue. + * - Tracks and logs profiling timestamps throughout the process. + * - Closes the socket when finished and logs the number of buffers processed. + */ void decompress(int threadno); -/* - void XClient::decompress(int thr) { - Purpose: Decompresses buffers received by a specific thread and processes them. - Input: The thread ID (`thr`) specifying which decompression thread is calling the function. - Output: Decompressed data is written back into the buffer pool for further processing. - Data Processing: The function pops compressed buffer IDs, checks the compression type, decompresses the data (either via specific column decompression or general decompression), and writes the decompressed data into a buffer. The process involves handling different attribute types and error checking. Profiling timestamps are logged during various stages of decompression. - } - */ + /* + void XClient::decompress(int thr) { + Purpose: Decompresses buffers received by a specific thread and processes them. + Input: The thread ID (`thr`) specifying which decompression thread is calling the function. + Output: Decompressed data is written back into the buffer pool for further processing. + Data Processing: The function pops compressed buffer IDs, checks the compression type, decompresses the data (either via specific column decompression or general decompression), and writes the decompressed data into a buffer. The process involves handling different attribute types and error checking. Profiling timestamps are logged during various stages of decompression. + } + */ void initialize(const std::string &tableName); -/* - * Purpose: Initializes the base connection to the server and sends the table schema. - * Input: The table name (`tableName`) to identify the target table. - * Output: None. - * Process: - * - Resolves the server's address and establishes a connection to the server using Boost.Asio. - * - Sends the table name and schema data to the server. - * - Reads the server's "ready" signal to confirm that it is prepared for communication. - * - Handles retries if the connection fails. - */ + /* + * Purpose: Initializes the base connection to the server and sends the table schema. + * Input: The table name (`tableName`) to identify the target table. + * Output: None. + * Process: + * - Resolves the server's address and establishes a connection to the server using Boost.Asio. + * - Sends the table name and schema data to the server. + * - Reads the server's "ready" signal to confirm that it is prepared for communication. + * - Handles retries if the connection fails. + */ int startReceiving(const std::string &tableName); -/* - * Purpose: Initializes the client to start receiving data from the server. - * Input: A table name string (`tableName`) to identify the target table. - * Output: Returns 1 if successful, indicating that receiving is initialized. - * Process: - * - Calls the `initialize()` function to establish a base connection with the server. - * - Starts a monitor thread to track queue sizes at intervals. - * - Creates and starts multiple threads to handle receiving, decompression, and writing operations. - * - Sets up necessary buffer pools for each thread to operate on. - */ + /* + * Purpose: Initializes the client to start receiving data from the server. + * Input: A table name string (`tableName`) to identify the target table. + * Output: Returns 1 if successful, indicating that receiving is initialized. + * Process: + * - Calls the `initialize()` function to establish a base connection with the server. + * - Starts a monitor thread to track queue sizes at intervals. + * - Creates and starts multiple threads to handle receiving, decompression, and writing operations. + * - Sets up necessary buffer pools for each thread to operate on. + */ bool hasNext(int readThread); -/* - bool XClient::hasNext(int readThreadId) { - Purpose: Checks if there are more buffers available for processing by the specified read thread. - Input: The read thread ID (`readThreadId`). - Output: Returns `true` if there are still decompressed buffers for the read thread to process, `false` otherwise. - Data Processing: Compares the number of empty decompression threads with the total parallelism to decide if the read thread can continue fetching buffers. - } - */ - + /* + bool XClient::hasNext(int readThreadId) { + Purpose: Checks if there are more buffers available for processing by the specified read thread. + Input: The read thread ID (`readThreadId`). + Output: Returns `true` if there are still decompressed buffers for the read thread to process, `false` otherwise. + Data Processing: Compares the number of empty decompression threads with the total parallelism to decide if the read thread can continue fetching buffers. + } + */ buffWithId getBuffer(int readThread); @@ -164,43 +164,44 @@ namespace xdbc { void finalize(); /* - * Purpose: Cleans up and finalizes the client by shutting down threads and closing connections. - * Input: None. - * Output: None. - * Process: - * - Logs the finalization of the client with thread counts. - * - Joins all active threads (receive, decompress) to ensure all tasks complete. - * - Closes the base socket connection. - * - Logs the total elapsed time for the client run. - * - Collects profiling timestamps, calculates component metrics (e.g., receive, decompress, write times), - * and formats them into strings. - * - Writes performance metrics to a CSV file. - */ + * Purpose: Cleans up and finalizes the client by shutting down threads and closing connections. + * Input: None. + * Output: None. + * Process: + * - Logs the finalization of the client with thread counts. + * - Joins all active threads (receive, decompress) to ensure all tasks complete. + * - Closes the base socket connection. + * - Logs the total elapsed time for the client run. + * - Collects profiling timestamps, calculates component metrics (e.g., receive, decompress, write times), + * and formats them into strings. + * - Writes performance metrics to a CSV file. + */ void markBufferAsRead(int buffId); -/* - void XClient::markBufferAsRead(int buffId) { - Purpose: Marks a buffer as read and frees it for reuse by other threads. - Input: The buffer ID (`buffId`) to be marked as read. - Output: None (the buffer is freed for reuse). - Data Processing: Pushes the buffer ID onto a free buffer queue (`freeBufferIds`) for reuse and increments a counter to ensure buffer distribution among threads. - } - */ + /* + void XClient::markBufferAsRead(int buffId) { + Purpose: Marks a buffer as read and frees it for reuse by other threads. + Input: The buffer ID (`buffId`) to be marked as read. + Output: None (the buffer is freed for reuse). + Data Processing: Pushes the buffer ID onto a free buffer queue (`freeBufferIds`) for reuse and increments a counter to ensure buffer distribution among threads. + } + */ void monitorQueues(int interval_ms); -/* - * Purpose: Monitors and logs the size of various queues (free, compressed, decompressed) periodically. - * Input: Interval in milliseconds (`interval_ms`) to check queue sizes. - * Output: None. - * Process: - * - Continuously monitors the size of the free, compressed, and decompressed buffers. - * - Logs the size of the queues at regular intervals. - * - Stores queue size data as a tuple for later analysis. - */ - + /* + * Purpose: Monitors and logs the size of various queues (free, compressed, decompressed) periodically. + * Input: Interval in milliseconds (`interval_ms`) to check queue sizes. + * Output: None. + * Process: + * - Continuously monitors the size of the free, compressed, and decompressed buffers. + * - Logs the size of the queues at regular intervals. + * - Stores queue size data as a tuple for later analysis. + */ + + void finishReceiving(); }; } -#endif //XDBC_XCLIENT_H +#endif // XDBC_XCLIENT_H From 5408e7588359965de2988b82794e34b71ee7fba8 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Fri, 28 Feb 2025 15:39:08 +0100 Subject: [PATCH 03/16] Add dynamic thread reconfiguration for decompress --- Sinks/main.cpp | 4 +++- xdbc/xclient.cpp | 55 +++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 32ff760..1502230 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -322,12 +322,14 @@ int main(int argc, char *argv[]) std::this_thread::sleep_for(std::chrono::milliseconds(6000)); env.ser_parallelism = 2; env.env_manager.configureThreads("serial", env.ser_parallelism); + // Wait for threads to finish + xclient.finishReceiving(); + env.env_manager.configureThreads("serial", 0); env.env_manager.joinThreads("serial"); env.env_manager.configureThreads("write", 0); env.env_manager.joinThreads("write"); - xclient.finishReceiving(); xclient.finalize(); spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); // *** Stop websocket client diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 8d7b34d..1a2c3b1 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -105,19 +105,13 @@ namespace xdbc void XClient::finishReceiving() { - spdlog::get("XDBC.CLIENT")->info("Finalizing XClient: {0}, shutting down {1} receive threads & {2} decomp threads", _xdbcenv->env_name, _xdbcenv->rcv_parallelism, _xdbcenv->decomp_parallelism); - - for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) - { - _decompThreads[i].join(); - } - for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) { _rcvThreads[i].join(); } - - _xdbcenv->env_manager.stop(); // *** Stop Reconfigurration handler + _xdbcenv->env_manager.configureThreads("decompress", 0); + _xdbcenv->env_manager.joinThreads("decompress"); + spdlog::get("XDBC.CLIENT")->info("Finalizing XClient: {0}, shutting down {1} receive threads & {2} decomp threads", _xdbcenv->env_name, _xdbcenv->rcv_parallelism, _xdbcenv->decomp_parallelism); } void XClient::finalize() @@ -202,6 +196,8 @@ namespace xdbc << component_metrics["write"].per_buffer_throughput << "," << std::get<3>(loads) << "\n"; csv_file.close(); + + _xdbcenv->env_manager.stop(); // *** Stop Reconfigurration handler } std::string XClient::get_name() const @@ -241,11 +237,20 @@ namespace xdbc _rcvThreads[i] = std::thread(&XClient::receive, this, i); } - // create decomp threads - for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) - { - _decompThreads[i] = std::thread(&XClient::decompress, this, i); - } + _xdbcenv->env_manager.registerOperation("decompress", [&](int thr) + { try { + if (thr >= _xdbcenv->max_threads) { + spdlog::get("XDBC.XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, _xdbcenv->max_threads); + return; // Prevent out-of-bounds access + } + decompress(thr); + } catch (const std::exception& e) { + spdlog::get("XDBC.XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XDBC.XCLIENT")->error("Unknown exception in thread {}", thr); + } }, _xdbcenv->compressedBufferIds); + + _xdbcenv->env_manager.configureThreads("decompress", _xdbcenv->decomp_parallelism); // start serial component threads spdlog::get("XDBC.CLIENT")->info("Initialized receiver & decomp threads"); @@ -460,11 +465,12 @@ namespace xdbc } _xdbcenv->finishedRcvThreads.fetch_add(1); - if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) - { - for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) - _xdbcenv->compressedBufferIds->push(-1); - } + // Not needed anymore + // if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) + // { + // for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) + // _xdbcenv->compressedBufferIds->push(-1); + // } socket.close(); spdlog::get("XDBC.CLIENT")->info("Receive thread {0} finished, #buffers: {1}", thr, buffers); @@ -557,11 +563,12 @@ namespace xdbc } _xdbcenv->finishedDecompThreads.fetch_add(1); - if (_xdbcenv->finishedDecompThreads == _xdbcenv->decomp_parallelism) - { - for (int i = 0; i < _xdbcenv->ser_parallelism; i++) - _xdbcenv->decompressedBufferIds->push(-1); - } + // *****************Not needed anymore ***************** + // if (_xdbcenv->finishedDecompThreads == _xdbcenv->decomp_parallelism) + // { + // for (int i = 0; i < _xdbcenv->ser_parallelism; i++) + // _xdbcenv->decompressedBufferIds->push(-1); + // } spdlog::get("XDBC.CLIENT")->warn("Decomp thread {0} finished, {1} buffers", thr, buffersDecompressed); _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "end"}); } From e2a2e298d787c93e64583076229d9d540203c2cf Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Fri, 28 Feb 2025 16:50:27 +0100 Subject: [PATCH 04/16] Create and join threads of receive using env manager --- Sinks/main.cpp | 8 +-- docker-xdbc.yml | 38 +++++++------- .../EnvironmentManager.h | 2 +- xdbc/RuntimeEnv.h | 2 +- xdbc/customQueue.h | 49 +++++++++++-------- xdbc/xclient.cpp | 24 +++++---- 6 files changed, 61 insertions(+), 62 deletions(-) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 1502230..2f828a2 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -8,7 +8,6 @@ #include #include #include "../xdbc/ControllerInterface/WebSocketClient.h" -#include "../xdbc/EnvironmentReconfigure/EnvironmentManager.h" #include "../xdbc/metrics_calculator.h" // Utility functions for schema handling @@ -275,8 +274,6 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("write", [&](int thr) { try { if (thr >= env.max_threads) { - spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); - return; // Prevent out-of-bounds access } csvSink.write(thr); } catch (const std::exception& e) { @@ -304,10 +301,7 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("write", [&](int thr) { try { - if (thr >= env.max_threads) { - spdlog::get("XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, env.max_threads); - return; // Prevent out-of-bounds access - } + parquetSink.write(thr); } catch (const std::exception& e) { spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); diff --git a/docker-xdbc.yml b/docker-xdbc.yml index 43bde76..7e0e4f3 100644 --- a/docker-xdbc.yml +++ b/docker-xdbc.yml @@ -1,23 +1,23 @@ services: - xdbc-server: - image: xdbc-server:latest - container_name: xdbcserver - restart: always - volumes: - - /dev/shm:/dev/shm - ports: - - 1234:1234 - - 1235:1235 - - 1236:1236 - - 1237:1237 - - 1238:1238 - shm_size: '16gb' - cap_add: - - NET_ADMIN - networks: - - xdbc-net - labels: - com.docker-tc.enabled: 1 + # xdbc-server: + # image: xdbc-server:latest + # container_name: xdbcserver + # restart: always + # volumes: + # - /dev/shm:/dev/shm + # ports: + # - 1234:1234 + # - 1235:1235 + # - 1236:1236 + # - 1237:1237 + # - 1238:1238 + # shm_size: '16gb' + # cap_add: + # - NET_ADMIN + # networks: + # - xdbc-net + # labels: + # com.docker-tc.enabled: 1 xdbc-client: image: xdbc-client:latest diff --git a/xdbc/EnvironmentReconfigure/EnvironmentManager.h b/xdbc/EnvironmentReconfigure/EnvironmentManager.h index cc70038..15bf1d9 100644 --- a/xdbc/EnvironmentReconfigure/EnvironmentManager.h +++ b/xdbc/EnvironmentReconfigure/EnvironmentManager.h @@ -11,7 +11,7 @@ #include #include #include -#include "customQueue.h" +#include "../customQueue.h" // #include "../xclient.h" class EnvironmentManager diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index 8771b11..c225d02 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -1,7 +1,6 @@ #ifndef XDBC_RUNTIMEENV_H #define XDBC_RUNTIMEENV_H -// #include "customQueue.h" #include #include #include @@ -10,6 +9,7 @@ #include #include #include "EnvironmentReconfigure/EnvironmentManager.h" +// #include "customQueue.h" namespace xdbc { diff --git a/xdbc/customQueue.h b/xdbc/customQueue.h index 462058d..8c83d8e 100644 --- a/xdbc/customQueue.h +++ b/xdbc/customQueue.h @@ -2,8 +2,9 @@ #include #include -template -class customQueue { +template +class customQueue +{ private: std::mutex d_mutex; std::condition_variable d_condition; @@ -15,30 +16,36 @@ class customQueue { public: explicit customQueue(size_t max_capacity = 0) : capacity(max_capacity) {} - void push(T const &value) { + void push(T const &value) + { { std::unique_lock lock(this->d_mutex); - this->d_space_available.wait(lock, [=] { return capacity == 0 || d_queue.size() < capacity; }); + this->d_space_available.wait(lock, [=] + { return capacity == 0 || d_queue.size() < capacity; }); d_queue.push_front(value); } this->d_condition.notify_all(); } - T pop() { + T pop() + { std::unique_lock lock(this->d_mutex); - this->d_condition.wait(lock, [=] { return !this->d_queue.empty(); }); + this->d_condition.wait(lock, [=] + { return !this->d_queue.empty(); }); T rc(std::move(this->d_queue.back())); this->d_queue.pop_back(); this->d_space_available.notify_all(); // Notify threads waiting for space return rc; } - [[nodiscard]] size_t size() { + [[nodiscard]] size_t size() + { std::unique_lock lock(this->d_mutex); return d_queue.size(); } - void setCapacity(size_t new_capacity) { + void setCapacity(size_t new_capacity) + { { std::unique_lock lock(this->d_mutex); capacity = new_capacity; @@ -47,23 +54,23 @@ class customQueue { } // Get the current capacity - [[nodiscard]] size_t getCapacity() const { + [[nodiscard]] size_t getCapacity() const + { return capacity; } - std::vector copy_newElements() { - static size_t lastCopiedIndex = 0; // Tracks the last copied position - std::vector new_elements; // To store new elements - auto current_index = d_queue.size(); + std::vector copy_newElements() + { + static size_t lastCopiedIndex = 0; + std::vector new_elements; + auto current_index = this->d_queue.size(); // Use this->d_queue + + if (lastCopiedIndex < current_index) { - // std::unique_lock lock(this->d_mutex); // Lock for thread safety - if (lastCopiedIndex < - current_index) { // Check if there are new elements - new_elements.assign(d_queue.rbegin(), d_queue.rbegin() + (d_queue.size() - - lastCopiedIndex)); // Reverse copy the new elements - lastCopiedIndex = current_index; // Update the index for the next call - } + new_elements.assign(this->d_queue.rbegin(), // Use this->d_queue + this->d_queue.rbegin() + (this->d_queue.size() - lastCopiedIndex)); + lastCopiedIndex = current_index; } - return new_elements; // Return new elements in reverse order + return new_elements; } }; diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 1a2c3b1..f7af655 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -105,10 +105,7 @@ namespace xdbc void XClient::finishReceiving() { - for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) - { - _rcvThreads[i].join(); - } + _xdbcenv->env_manager.joinThreads("receive"); _xdbcenv->env_manager.configureThreads("decompress", 0); _xdbcenv->env_manager.joinThreads("decompress"); spdlog::get("XDBC.CLIENT")->info("Finalizing XClient: {0}, shutting down {1} receive threads & {2} decomp threads", _xdbcenv->env_name, _xdbcenv->rcv_parallelism, _xdbcenv->decomp_parallelism); @@ -231,18 +228,19 @@ namespace xdbc _monitorThread = std::thread(&XClient::monitorQueues, this, _xdbcenv->profilingInterval); - // create rcv threads - for (int i = 0; i < _xdbcenv->rcv_parallelism; i++) - { - _rcvThreads[i] = std::thread(&XClient::receive, this, i); - } + _xdbcenv->env_manager.registerOperation("receive", [&](int thr) + { try { + receive(thr); + } catch (const std::exception& e) { + spdlog::get("XDBC.XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); + } catch (...) { + spdlog::get("XDBC.XCLIENT")->error("Unknown exception in thread {}", thr); + } }, _xdbcenv->freeBufferIds); + + _xdbcenv->env_manager.configureThreads("receive", _xdbcenv->rcv_parallelism); // start serial component threads _xdbcenv->env_manager.registerOperation("decompress", [&](int thr) { try { - if (thr >= _xdbcenv->max_threads) { - spdlog::get("XDBC.XCLIENT")->error("Thread index {} exceeds preallocated size {}", thr, _xdbcenv->max_threads); - return; // Prevent out-of-bounds access - } decompress(thr); } catch (const std::exception& e) { spdlog::get("XDBC.XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); From 6c96e65157bc2ad54ac220f1df7c499fc4e5445c Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sat, 1 Mar 2025 07:40:38 +0100 Subject: [PATCH 05/16] Modify to work with Controller --- README.md | 7 ++----- Sinks/main.cpp | 41 +++++++++++++++++++++++------------------ xdbc/xclient.cpp | 8 ++++++++ 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index a434da9..f2d3f8a 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ docker exec -it xdbcserver bash -c "./xdbc-server/build/xdbc-server" ``` The XDBC Server supports multiple options. For example, to transfer from a CSV source with a buffer size of 256 kb, a buffer pool size of 16384, and the parallelism for deserialization at 16, read at 1 and compression at 2 with a row format and snappy, run: ``` -docker exec -it xdbcserver bash -c "./xdbc-server/build/xdbc-server \ ---system csv -b 256 -p 16384 --deser-parallelism 16 --read-parallelism 1 \ ---compression-parallelism=2 -f1 -csnappy" +docker exec -it xdbcserver bash -c "./xdbc-server/build/xdbc-server --system csv -b 1024 -p 32000 --deser-parallelism 8 --read-parallelism 1 --compression-parallelism=2 --network-parallelism=1 -f1 -csnappy" ``` Currently, XDBC assumes your data is placed in `/dev/shm`, which is also mapped to the containers' `/dev/shm`. ### Then initiate the transfer through a client @@ -35,8 +33,7 @@ docker exec -it xdbcserver bash -c "./xdbc-server/tests/build/test --table ss13h ``` The XDBC Client also supports multiple options. For example to transfer the ss13husallm dataset with a buffer size of, a buffer pool size of 16384, and the parallelism for writing at 16, decompression at 1, run: ``` -docker exec -it xdbcclient bash -c "/xdbc-client/tests/build/test_xclient --table ss13husallm \ --b 256 -p 16384 --write-parallelism 16 --decomp-parallelism=2" +docker exec -it xdbcclient bash -c "/xdbc-client/Sinks/build/xdbcsinks --server-host="xdbcserver" --table ss13husallm -f1 -b 1024 -p 32000 -n1 -w1 -d1 -s1 --skip-serializer=0 --target=csv" ``` Please make sure that you have the `ss13husallm.csv` file in your `/dev/shm/` directory. Your output will be located at `/dev/shm/` ## Optimizer diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 2f828a2..15590f1 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -143,7 +143,7 @@ void handleSinkCMDParams(int argc, char *argv[], xdbc::RuntimeEnv &env, std::str nlohmann::json metrics_convert(xdbc::RuntimeEnv &env) { nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object - if ((env.pts)) + if ((env.pts) && (env.enable_updation == 1)) { std::vector env_pts; env_pts = env.pts->copy_newElements(); @@ -196,22 +196,18 @@ void env_convert(xdbc::RuntimeEnv &env, const nlohmann::json &env_json) env_.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); env_.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); env_.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); + env_.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); // Update the actual environment object if updates are allowed if (env.enable_updation == 1) { // std::lock_guard lock(env.env_mutex); - - env.transfer_id = env_.transfer_id; - env.table = env_.table; - env.server_host = env_.server_host; - env.iformat = env_.iformat; - env.sleep_time = env_.sleep_time; - env.buffer_size = env_.buffer_size; - env.buffers_in_bufferpool = env_.buffers_in_bufferpool; - env.rcv_parallelism = env_.rcv_parallelism; env.write_parallelism = env_.write_parallelism; env.decomp_parallelism = env_.decomp_parallelism; + // env.ser_parallelism = env_.ser_parallelism; + env.env_manager.configureThreads("decompress", env.write_parallelism); + // env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("write", env.write_parallelism); // Notify waiting threads about the update // env.env_condition.notify_all(); @@ -263,7 +259,10 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("serial", [&](int thr) { try { - + if (thr >= env.buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; + } csvSink.serialize(thr); } catch (const std::exception& e) { spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); @@ -273,7 +272,9 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("write", [&](int thr) { try { - if (thr >= env.max_threads) { + if (thr >= env.buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; } csvSink.write(thr); } catch (const std::exception& e) { @@ -291,7 +292,10 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("serial", [&](int thr) { try { - + if (thr >= env.buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; + } parquetSink.serialize(thr); } catch (const std::exception& e) { spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); @@ -301,7 +305,10 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("write", [&](int thr) { try { - + if (thr >= env.buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; + } parquetSink.write(thr); } catch (const std::exception& e) { spdlog::get("XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); @@ -313,10 +320,6 @@ int main(int argc, char *argv[]) env.env_manager.configureThreads("write", env.write_parallelism); // start write component threads } - std::this_thread::sleep_for(std::chrono::milliseconds(6000)); - env.ser_parallelism = 2; - env.env_manager.configureThreads("serial", env.ser_parallelism); - // Wait for threads to finish xclient.finishReceiving(); env.env_manager.configureThreads("serial", 0); @@ -324,6 +327,8 @@ int main(int argc, char *argv[]) env.env_manager.configureThreads("write", 0); env.env_manager.joinThreads("write"); + env.enable_updation = 0; + xclient.finalize(); spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); // *** Stop websocket client diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index f7af655..a9467aa 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -230,6 +230,10 @@ namespace xdbc _xdbcenv->env_manager.registerOperation("receive", [&](int thr) { try { + if (thr >= _xdbcenv->buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; + } receive(thr); } catch (const std::exception& e) { spdlog::get("XDBC.XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); @@ -241,6 +245,10 @@ namespace xdbc _xdbcenv->env_manager.registerOperation("decompress", [&](int thr) { try { + if (thr >= _xdbcenv->buffers_in_bufferpool) { + spdlog::get("XCLIENT")->error("No of threads exceed limit"); + return; + } decompress(thr); } catch (const std::exception& e) { spdlog::get("XDBC.XCLIENT")->error("Exception in thread {}: {}", thr, e.what()); From 1c887916deb9588103519011a0ac6d2c99f6f996 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sat, 1 Mar 2025 09:31:28 +0100 Subject: [PATCH 06/16] Modify main to reconfigure parallelism outside env_convert --- Sinks/CSVSink/CSVSink.cpp | 14 +-- Sinks/PQSink/PQSink.cpp | 243 +++++++++++++++++++++++--------------- Sinks/main.cpp | 55 +++++---- xdbc/RuntimeEnv.h | 2 +- xdbc/xclient.cpp | 12 +- 5 files changed, 192 insertions(+), 134 deletions(-) diff --git a/Sinks/CSVSink/CSVSink.cpp b/Sinks/CSVSink/CSVSink.cpp index 99bd9ac..a16f9f7 100644 --- a/Sinks/CSVSink/CSVSink.cpp +++ b/Sinks/CSVSink/CSVSink.cpp @@ -347,13 +347,13 @@ void CsvSink::serialize(int thr) spdlog::get("XDBC.CSVSINK")->info("CSV Serializer stopping thread {}, written buffers: {}, tuples: {}", thr, writtenBuffers, writtenTuples); runtimeEnv->finishedSerializerThreads.fetch_add(1); - if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) - { - for (int i = 0; i < runtimeEnv->write_parallelism; ++i) - { - // runtimeEnv->serializedBufferIds->push(-1); // Termination signal - } - } + // if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) + // { + // for (int i = 0; i < runtimeEnv->write_parallelism; ++i) + // { + // // runtimeEnv->serializedBufferIds->push(-1); // Termination signal + // } + // } } void CsvSink::write(int thr) diff --git a/Sinks/PQSink/PQSink.cpp b/Sinks/PQSink/PQSink.cpp index 87e5c8e..a87590d 100644 --- a/Sinks/PQSink/PQSink.cpp +++ b/Sinks/PQSink/PQSink.cpp @@ -13,106 +13,135 @@ #include "spdlog/sinks/stdout_color_sinks.h" std::shared_ptr -CreateParquetSchema(const std::vector &schemaAttributes) { +CreateParquetSchema(const std::vector &schemaAttributes) +{ parquet::schema::NodeVector fields; - for (const auto &attr: schemaAttributes) { - if (attr.tpe == "INT") { + for (const auto &attr : schemaAttributes) + { + if (attr.tpe == "INT") + { fields.push_back(parquet::schema::PrimitiveNode::Make( - attr.name, parquet::Repetition::REQUIRED, parquet::Type::INT32, parquet::ConvertedType::INT_32)); - } else if (attr.tpe == "DOUBLE") { + attr.name, parquet::Repetition::REQUIRED, parquet::Type::INT32, parquet::ConvertedType::INT_32)); + } + else if (attr.tpe == "DOUBLE") + { fields.push_back(parquet::schema::PrimitiveNode::Make( - attr.name, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); - } else if (attr.tpe == "STRING") { + attr.name, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); + } + else if (attr.tpe == "STRING") + { fields.push_back(parquet::schema::PrimitiveNode::Make( - attr.name, parquet::Repetition::REQUIRED, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8)); - } else if (attr.tpe == "CHAR") { - if (attr.size <= 0) { + attr.name, parquet::Repetition::REQUIRED, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8)); + } + else if (attr.tpe == "CHAR") + { + if (attr.size <= 0) + { throw std::invalid_argument("Fixed-size STRING/CHAR must have a positive size."); } fields.push_back(parquet::schema::PrimitiveNode::Make( - attr.name, parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, - parquet::ConvertedType::NONE, attr.size)); - } else { + attr.name, parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::NONE, attr.size)); + } + else + { throw std::invalid_argument("Unsupported type: " + attr.tpe); } } return std::static_pointer_cast( - parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); } - PQSink::PQSink(const std::string &baseFilename, xdbc::RuntimeEnv *runtimeEnv) - : baseFilename(baseFilename), runtimeEnv(runtimeEnv) { + : baseFilename(baseFilename), runtimeEnv(runtimeEnv) +{ bufferPool = runtimeEnv->bp; auto console = spdlog::stdout_color_mt("XDBC.PQSINK"); std::string folderPath = baseFilename + "_" + runtimeEnv->table; - try { + try + { // Check if the folder exists - if (std::filesystem::exists(folderPath)) { + if (std::filesystem::exists(folderPath)) + { spdlog::get("XDBC.PQSINK")->info("Folder exists, deleting: {}", folderPath); std::filesystem::remove_all(folderPath); // Delete the folder and its contents } std::filesystem::create_directories(folderPath); spdlog::get("XDBC.PQSINK")->info("Created directory: {}", folderPath); - } - catch (const std::filesystem::filesystem_error &e) { + catch (const std::filesystem::filesystem_error &e) + { spdlog::get("XDBC.PQSINK")->error("Error managing folder: {}", e.what()); } } -void PQSink::serialize(int thr) { +void PQSink::serialize(int thr) +{ std::vector sizes(runtimeEnv->schema.size()); - for (size_t i = 0; i < runtimeEnv->schema.size(); ++i) { + for (size_t i = 0; i < runtimeEnv->schema.size(); ++i) + { - if (runtimeEnv->schema[i].tpe[0] == 'I') { + if (runtimeEnv->schema[i].tpe[0] == 'I') + { sizes[i] = 4; // sizeof(int) - } else if (runtimeEnv->schema[i].tpe[0] == 'D') { + } + else if (runtimeEnv->schema[i].tpe[0] == 'D') + { sizes[i] = 8; // sizeof(double) - } else if (runtimeEnv->schema[i].tpe[0] == 'C') { + } + else if (runtimeEnv->schema[i].tpe[0] == 'C') + { sizes[i] = 1; // sizeof(char) - } else if (runtimeEnv->schema[i].tpe[0] == 'S') { + } + else if (runtimeEnv->schema[i].tpe[0] == 'S') + { sizes[i] = runtimeEnv->schema[i].size; } } runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "start"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "start"}); size_t writtenBuffers = 0; - if (runtimeEnv->skip_serializer) { + if (runtimeEnv->skip_serializer) + { - while (true) { + while (true) + { int bufferId = runtimeEnv->decompressedBufferIds->pop(); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - if (bufferId == -1) break; + if (bufferId == -1) + break; runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); runtimeEnv->serializedBufferIds->push(bufferId); writtenBuffers++; } - - } else if (runtimeEnv->iformat == 3) { // Format == 3: Arrow - //TODO: check + } + else if (runtimeEnv->iformat == 3) + { // Format == 3: Arrow + // TODO: check spdlog::get("XDBC.PQSINK")->info("PQSINK Parquet serialization started."); - while (true) { + while (true) + { int bufferId = runtimeEnv->decompressedBufferIds->pop(); int outBufferId = runtimeEnv->freeBufferIds->pop(); auto &outBuffer = (*bufferPool)[outBufferId]; runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - if (bufferId == -1) break; + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); + if (bufferId == -1) + break; const auto &inBufferPtr = (*bufferPool)[bufferId]; auto header = *reinterpret_cast(inBufferPtr.data()); @@ -122,7 +151,8 @@ void PQSink::serialize(int thr) { auto bufferReader = std::make_shared(arrowBuffer); std::shared_ptr fileReader; auto fileReaderResult = arrow::ipc::RecordBatchFileReader::Open(bufferReader); - if (!fileReaderResult.ok()) { + if (!fileReaderResult.ok()) + { spdlog::error("Error opening RecordBatchFileReader: {}", fileReaderResult.status().ToString()); return; } @@ -131,7 +161,8 @@ void PQSink::serialize(int thr) { // Assuming the file contains one RecordBatch, read it std::shared_ptr recordBatch; auto recordBatchResult = fileReader->ReadRecordBatch(0); - if (!recordBatchResult.ok()) { + if (!recordBatchResult.ok()) + { spdlog::error("Error reading RecordBatch: {}", recordBatchResult.status().ToString()); return; } @@ -139,52 +170,58 @@ void PQSink::serialize(int thr) { // Step 2: Prepare to write the RecordBatch as a Parquet file into memory auto outputBuffer = std::make_shared( - std::make_shared(reinterpret_cast(outBuffer.data()), - runtimeEnv->buffer_size * 1024 - sizeof(xdbc::Header)) - ); + std::make_shared(reinterpret_cast(outBuffer.data()), + runtimeEnv->buffer_size * 1024 - sizeof(xdbc::Header))); // Step 3: Configure Parquet writer properties auto writerProperties = parquet::WriterProperties::Builder() - .compression(arrow::Compression::SNAPPY)->build(); + .compression(arrow::Compression::SNAPPY) + ->build(); auto arrowWriterProperties = parquet::ArrowWriterProperties::Builder() - .store_schema()->build(); + .store_schema() + ->build(); // Step 4: Write the RecordBatch as a Parquet file auto tableResult = arrow::Table::FromRecordBatches({recordBatch}); - if (!tableResult.ok()) { + if (!tableResult.ok()) + { spdlog::error("Error converting RecordBatch to Table: {}", tableResult.status().ToString()); return; } auto table = tableResult.ValueOrDie(); auto writeStatus = parquet::arrow::WriteTable( - *table, // Arrow table - arrow::default_memory_pool(), // Memory pool - outputBuffer, // Output stream - /*chunk_size=*/3, // Chunk size - writerProperties, // Writer properties - arrowWriterProperties // Arrow writer properties + *table, // Arrow table + arrow::default_memory_pool(), // Memory pool + outputBuffer, // Output stream + /*chunk_size=*/3, // Chunk size + writerProperties, // Writer properties + arrowWriterProperties // Arrow writer properties ); - if (!writeStatus.ok()) { + if (!writeStatus.ok()) + { spdlog::error("Error writing Parquet file: {}", writeStatus.ToString()); } // Step 5: Finalize the buffer writer auto closeStatus = outputBuffer->Close(); - if (!closeStatus.ok()) { + if (!closeStatus.ok()) + { spdlog::error("Error finalizing output buffer: {}", closeStatus.ToString()); } runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); runtimeEnv->serializedBufferIds->push(outBufferId); runtimeEnv->freeBufferIds->push(bufferId); } + } + else if (runtimeEnv->iformat == 2) + { - } else if (runtimeEnv->iformat == 2) { - - while (true) { + while (true) + { int inBufferId = runtimeEnv->decompressedBufferIds->pop(); if (inBufferId == -1) @@ -198,65 +235,79 @@ void PQSink::serialize(int thr) { std::vector columnOffsets(runtimeEnv->schema.size()); size_t totalRowSize = 0; - for (size_t j = 0; j < runtimeEnv->schema.size(); ++j) { + for (size_t j = 0; j < runtimeEnv->schema.size(); ++j) + { columnOffsets[j] = totalRowSize; totalRowSize += sizes[j]; } std::vector columnStartPointers(runtimeEnv->schema.size()); size_t cumulativeOffset = 0; - for (size_t k = 0; k < runtimeEnv->schema.size(); ++k) { + for (size_t k = 0; k < runtimeEnv->schema.size(); ++k) + { columnStartPointers[k] = basePtr + cumulativeOffset; cumulativeOffset += runtimeEnv->tuples_per_buffer * sizes[k]; // Move by the total size of this column - } { auto outputBuffer = std::make_shared( - reinterpret_cast(outBuffer.data()), - runtimeEnv->buffer_size * 1024); + reinterpret_cast(outBuffer.data()), + runtimeEnv->buffer_size * 1024); auto bufferWriter = std::make_shared(outputBuffer); auto parquetSchema = CreateParquetSchema(runtimeEnv->schema); parquet::WriterProperties::Builder writerPropertiesBuilder; - //writerPropertiesBuilder.compression(parquet::Compression::SNAPPY); + // writerPropertiesBuilder.compression(parquet::Compression::SNAPPY); parquet::StreamWriter streamWriter{ - parquet::ParquetFileWriter::Open(bufferWriter, parquetSchema, writerPropertiesBuilder.build())}; + parquet::ParquetFileWriter::Open(bufferWriter, parquetSchema, writerPropertiesBuilder.build())}; - //streamWriter.SetMaxRowGroupSize(header.totalTuples); - for (size_t i = 0; i < header.totalTuples; ++i) { + // streamWriter.SetMaxRowGroupSize(header.totalTuples); + for (size_t i = 0; i < header.totalTuples; ++i) + { - for (size_t j = 0; j < runtimeEnv->schema.size(); ++j) { + for (size_t j = 0; j < runtimeEnv->schema.size(); ++j) + { const char *dataPtr; - if (runtimeEnv->iformat == 1) { + if (runtimeEnv->iformat == 1) + { dataPtr = basePtr + i * totalRowSize + columnOffsets[j]; - } else if (runtimeEnv->iformat == 2) { + } + else if (runtimeEnv->iformat == 2) + { dataPtr = columnStartPointers[j] + i * sizes[j]; } const auto &attr = runtimeEnv->schema[j]; - if (attr.tpe[0] == 'I') { + if (attr.tpe[0] == 'I') + { int32_t value = *reinterpret_cast(dataPtr); streamWriter << 1; - } else if (attr.tpe[0] == 'D') { + } + else if (attr.tpe[0] == 'D') + { double value = *reinterpret_cast(dataPtr); streamWriter << 2.5; - } else if (attr.tpe[0] == 'S') { + } + else if (attr.tpe[0] == 'S') + { std::string value(dataPtr, attr.size); // Fixed-size string streamWriter << "test"; - } else if (attr.tpe[0] == 'C') { + } + else if (attr.tpe[0] == 'C') + { char t; // Single character as string streamWriter << 'a'; - } else { + } + else + { throw std::invalid_argument("Unsupported type: " + attr.tpe); } - } streamWriter << parquet::EndRow; } - //streamWriter << parquet::EndRowGroup; + // streamWriter << parquet::EndRowGroup; // Finalize and close the buffer writer auto closeStatus = bufferWriter->Close(); @@ -270,24 +321,26 @@ void PQSink::serialize(int thr) { runtimeEnv->serializedBufferIds->push(outBufferId); runtimeEnv->freeBufferIds->push(inBufferId); } - - } else { + } + else + { spdlog::get("XDBC.PQSINK")->error("PQSINK currently does not support serialization"); } runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "end"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "end"}); runtimeEnv->finishedSerializerThreads.fetch_add(1); - if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) { - for (int i = 0; i < runtimeEnv->write_parallelism; ++i) { - runtimeEnv->serializedBufferIds->push(-1); // Termination signal - } - } + // if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) { + // for (int i = 0; i < runtimeEnv->write_parallelism; ++i) { + // runtimeEnv->serializedBufferIds->push(-1); // Termination signal + // } + // } } -void PQSink::write(int thr) { +void PQSink::write(int thr) +{ - //for now each buffer is a pq file + // for now each buffer is a pq file runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "start"}); spdlog::get("XDBC.PQSINK")->info("PQ Writer started thread {}", thr); @@ -295,20 +348,22 @@ void PQSink::write(int thr) { size_t buffersWritten = 0; int fileSuffix = 0; - while (true) { + while (true) + { int bufferId = runtimeEnv->serializedBufferIds->pop(); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); - if (bufferId == -1) break; + if (bufferId == -1) + break; std::string folderPath = baseFilename + "_" + runtimeEnv->table; - std::string fileName = folderPath + "/" + runtimeEnv->table + "_" + std::to_string(fileSuffix) + ".parquet"; std::ofstream outputFile; outputFile.open(fileName, std::ios::out | std::ios::binary); - if (!outputFile.is_open()) { + if (!outputFile.is_open()) + { throw std::runtime_error("Failed to open output file: " + fileName); } fileSuffix++; @@ -320,16 +375,14 @@ void PQSink::write(int thr) { outputFile.write(dataPtr, header.totalSize); runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); runtimeEnv->freeBufferIds->push(bufferId); buffersWritten++; outputFile.close(); } - runtimeEnv->finishedWriteThreads.fetch_add(1); runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "end"}); spdlog::get("XDBC.PQSINK")->info("PQ Writer thread {} wrote buffers: {}", thr, buffersWritten); - } \ No newline at end of file diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 15590f1..4168f6f 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -196,18 +196,20 @@ void env_convert(xdbc::RuntimeEnv &env, const nlohmann::json &env_json) env_.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); env_.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); env_.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); - env_.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); + // env_.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); // Update the actual environment object if updates are allowed if (env.enable_updation == 1) { // std::lock_guard lock(env.env_mutex); env.write_parallelism = env_.write_parallelism; - env.decomp_parallelism = env_.decomp_parallelism; + // env.decomp_parallelism = env_.decomp_parallelism; + + // env.env_manager.configureThreads("decompress", env.decomp_parallelism); + // env.env_manager.configureThreads("write", env.write_parallelism); + // env.ser_parallelism = env_.ser_parallelism; - env.env_manager.configureThreads("decompress", env.write_parallelism); // env.env_manager.configureThreads("serial", env.ser_parallelism); - env.env_manager.configureThreads("write", env.write_parallelism); // Notify waiting threads about the update // env.env_condition.notify_all(); @@ -229,25 +231,6 @@ int main(int argc, char *argv[]) handleSinkCMDParams(argc, argv, env, outputBasePath); - // *** Setup websocket interface for controller *** - env.enable_updation = 1; - std::thread io_thread; - WebSocketClient ws_client("xdbc-controller", "8002"); - if (env.spawn_source == 1) - { - ws_client.start(); - io_thread = std::thread([&]() - { ws_client.run( - std::bind(&metrics_convert, std::ref(env)), - std::bind(&additional_msg, std::ref(env)), - std::bind(&env_convert, std::ref(env), std::placeholders::_1)); }); - while (!ws_client.is_active()) - { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - // *** Finished Setup websocket interface for controller *** - //*** // Initialize XClient xdbc::XClient xclient(env); @@ -320,15 +303,37 @@ int main(int argc, char *argv[]) env.env_manager.configureThreads("write", env.write_parallelism); // start write component threads } + // *** Setup websocket interface for controller *** + env.enable_updation = 1; + std::thread io_thread; + WebSocketClient ws_client("xdbc-controller", "8002"); + if (env.spawn_source == 1) + { + ws_client.start(); + io_thread = std::thread([&]() + { ws_client.run( + std::bind(&metrics_convert, std::ref(env)), + std::bind(&additional_msg, std::ref(env)), + std::bind(&env_convert, std::ref(env), std::placeholders::_1)); }); + while (!ws_client.is_active()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + // *** Finished Setup websocket interface for controller *** // Wait for threads to finish + while (env.enable_updation == 1) + { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + env.env_manager.configureThreads("write", env.write_parallelism); + } + xclient.finishReceiving(); env.env_manager.configureThreads("serial", 0); env.env_manager.joinThreads("serial"); env.env_manager.configureThreads("write", 0); env.env_manager.joinThreads("write"); - env.enable_updation = 0; - xclient.finalize(); spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); // *** Stop websocket client diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index c225d02..5214e53 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -83,7 +83,7 @@ namespace xdbc int spawn_source; transfer_details tf_paras; - std::atomic enable_updation; + std::atomic enable_updation = 0; int max_threads = 16; EnvironmentManager env_manager; diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index a9467aa..7ff2ff2 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -259,7 +259,6 @@ namespace xdbc _xdbcenv->env_manager.configureThreads("decompress", _xdbcenv->decomp_parallelism); // start serial component threads spdlog::get("XDBC.CLIENT")->info("Initialized receiver & decomp threads"); - return 1; } @@ -472,11 +471,12 @@ namespace xdbc _xdbcenv->finishedRcvThreads.fetch_add(1); // Not needed anymore - // if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) - // { - // for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) - // _xdbcenv->compressedBufferIds->push(-1); - // } + if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) + { + // for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) + // _xdbcenv->compressedBufferIds->push(-1); + _xdbcenv->enable_updation = 0; + } socket.close(); spdlog::get("XDBC.CLIENT")->info("Receive thread {0} finished, #buffers: {1}", thr, buffers); From a5ecf560b5323c14cb5a7b8f8fd21cf7338ecd89 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sat, 1 Mar 2025 15:33:19 +0100 Subject: [PATCH 07/16] Remove unused codes --- Sinks/main.cpp | 53 ++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 4168f6f..beb0ebb 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -182,37 +182,24 @@ void env_convert(xdbc::RuntimeEnv &env, const nlohmann::json &env_json) { try { - // Temporary environment object to load values from JSON - xdbc::RuntimeEnv env_; - - // Parse JSON values into the temporary object - env_.transfer_id = std::stoll(env_json.at("transferID").get()); - env_.table = env_json.at("table").get(); - env_.server_host = env_json.at("serverHost").get(); - env_.iformat = std::stoi(env_json.at("intermediateFormat").get()); - env_.sleep_time = std::chrono::milliseconds(std::stoll(env_json.at("sleepTime").get())); - env_.buffer_size = std::stoi(env_json.at("bufferSize").get()); - env_.buffers_in_bufferpool = std::stoi(env_json.at("bufferpoolSize").get()) / env_.buffer_size; - env_.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); - env_.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); - env_.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); - // env_.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); + // env.transfer_id = std::stoll(env_json.at("transferID").get()); + // env.table = env_json.at("table").get(); + // env.server_host = env_json.at("serverHost").get(); + // env.iformat = std::stoi(env_json.at("intermediateFormat").get()); + // env.sleep_time = std::chrono::milliseconds(std::stoll(env_json.at("sleepTime").get())); + // env.buffer_size = std::stoi(env_json.at("bufferSize").get()); + // env.buffers_in_bufferpool = std::stoi(env_json.at("bufferpoolSize").get()) / env_.buffer_size; + // env.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); + // env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); + // env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); + // env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); // Update the actual environment object if updates are allowed if (env.enable_updation == 1) { - // std::lock_guard lock(env.env_mutex); - env.write_parallelism = env_.write_parallelism; - // env.decomp_parallelism = env_.decomp_parallelism; - - // env.env_manager.configureThreads("decompress", env.decomp_parallelism); - // env.env_manager.configureThreads("write", env.write_parallelism); - - // env.ser_parallelism = env_.ser_parallelism; - // env.env_manager.configureThreads("serial", env.ser_parallelism); - - // Notify waiting threads about the update - // env.env_condition.notify_all(); + env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); + env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); + env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); } } catch (const std::exception &e) @@ -315,19 +302,17 @@ int main(int argc, char *argv[]) std::bind(&metrics_convert, std::ref(env)), std::bind(&additional_msg, std::ref(env)), std::bind(&env_convert, std::ref(env), std::placeholders::_1)); }); - while (!ws_client.is_active()) - { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } } - // *** Finished Setup websocket interface for controller *** - // Wait for threads to finish - while (env.enable_updation == 1) + while (env.enable_updation == 1) // Reconfigure threads as long as it is allowed { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); env.env_manager.configureThreads("write", env.write_parallelism); + env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("decompress", env.decomp_parallelism); } + // *** Finished Setup websocket interface for controller *** + // Wait for receive threads to finish, then kill the remaining components in proper sequence : decompress-serial-write xclient.finishReceiving(); env.env_manager.configureThreads("serial", 0); env.env_manager.joinThreads("serial"); From cf7c4554e91502610fcbe094b4555bbf70a82b5a Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sat, 1 Mar 2025 17:58:55 +0100 Subject: [PATCH 08/16] Add transfer info providing additional details to controller --- Sinks/CSVSink/CSVSink.cpp | 1 + Sinks/main.cpp | 6 ++++-- xdbc/RuntimeEnv.h | 2 +- xdbc/xclient.cpp | 9 +++++++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Sinks/CSVSink/CSVSink.cpp b/Sinks/CSVSink/CSVSink.cpp index a16f9f7..8e6b7a2 100644 --- a/Sinks/CSVSink/CSVSink.cpp +++ b/Sinks/CSVSink/CSVSink.cpp @@ -394,6 +394,7 @@ void CsvSink::write(int thr) runtimeEnv->freeBufferIds->push(bufferId); buffersWritten++; + runtimeEnv->tf_paras.bufProcessed.at(thr) = buffersWritten; } outputFile.close(); diff --git a/Sinks/main.cpp b/Sinks/main.cpp index beb0ebb..f3e2dd1 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -135,6 +135,7 @@ void handleSinkCMDParams(int argc, char *argv[], xdbc::RuntimeEnv &env, std::str }); env.tuples_per_buffer = (env.buffer_size * 1024) / env.tuple_size; + env.max_threads = env.buffers_in_bufferpool; env.startTime = std::chrono::steady_clock::now(); spdlog::get("XDBC.SINK")->info("Table: {0}, Tuple size: {1}, Schema:\n{2}", env.table, env.tuple_size, formatSchema(env.schema)); @@ -174,6 +175,7 @@ nlohmann::json additional_msg(xdbc::RuntimeEnv &env) metrics_json["freeBufferQ_load"] = std::get<0>(env.tf_paras.latest_queueSizes); metrics_json["compressedBufferQ_load"] = std::get<1>(env.tf_paras.latest_queueSizes); metrics_json["decompressedBufferQ_load"] = std::get<2>(env.tf_paras.latest_queueSizes); + metrics_json["serializedBufferQ_load"] = std::get<3>(env.tf_paras.latest_queueSizes); return metrics_json; } @@ -229,7 +231,7 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("serial", [&](int thr) { try { - if (thr >= env.buffers_in_bufferpool) { + if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } @@ -242,7 +244,7 @@ int main(int argc, char *argv[]) env.env_manager.registerOperation("write", [&](int thr) { try { - if (thr >= env.buffers_in_bufferpool) { + if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index 5214e53..15e16fd 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -38,7 +38,7 @@ namespace xdbc { float elapsed_time = 0.0f; // Default value for elapsed_time std::vector bufProcessed; // Default value: vector with one element, 0 - std::tuple latest_queueSizes; + std::tuple latest_queueSizes; }; class RuntimeEnv diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 7ff2ff2..33ef880 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -94,6 +94,8 @@ namespace xdbc _xdbcenv->freeBufferIds->push(i); } + _xdbcenv->tf_paras.bufProcessed.resize(_xdbcenv->max_threads); + spdlog::get("XDBC.CLIENT")->info("Initialized queues, " "freeBuffersQ: {0}, " "compQ: {1}, " @@ -121,6 +123,7 @@ namespace xdbc auto end = std::chrono::steady_clock::now(); auto total_time = std::chrono::duration_cast(end - _xdbcenv->startTime).count(); + _xdbcenv->tf_paras.elapsed_time = static_cast(total_time); spdlog::get("XDBC.CLIENT")->info("Total elapsed time: {0} ms", total_time); auto pts = std::vector(_xdbcenv->pts->size()); @@ -230,7 +233,7 @@ namespace xdbc _xdbcenv->env_manager.registerOperation("receive", [&](int thr) { try { - if (thr >= _xdbcenv->buffers_in_bufferpool) { + if (thr >= _xdbcenv->max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } @@ -245,7 +248,7 @@ namespace xdbc _xdbcenv->env_manager.registerOperation("decompress", [&](int thr) { try { - if (thr >= _xdbcenv->buffers_in_bufferpool) { + if (thr >= _xdbcenv->max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } @@ -297,6 +300,8 @@ namespace xdbc _xdbcenv->queueSizes.emplace_back(curTimeInterval, freeBufferTotalSize, compressedBufferTotalSize, decompressedBufferTotalSize, serializedBufferTotalSize); + _xdbcenv->tf_paras.latest_queueSizes = std::make_tuple(freeBufferTotalSize, compressedBufferTotalSize, decompressedBufferTotalSize, serializedBufferTotalSize); + std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); curTimeInterval += interval_ms / 1000; } From 487f6767d057fa3fea97f6ce8678441b69736aa3 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sun, 2 Mar 2025 22:11:36 +0100 Subject: [PATCH 09/16] Remove commented code --- Sinks/CSVSink/CSVSink.cpp | 731 +++++++++--------- Sinks/main.cpp | 474 ++++++------ .../EnvironmentManager.cpp | 4 +- xdbc/RuntimeEnv.h | 1 - xdbc/xclient.cpp | 9 - 5 files changed, 597 insertions(+), 622 deletions(-) diff --git a/Sinks/CSVSink/CSVSink.cpp b/Sinks/CSVSink/CSVSink.cpp index 8e6b7a2..9488094 100644 --- a/Sinks/CSVSink/CSVSink.cpp +++ b/Sinks/CSVSink/CSVSink.cpp @@ -12,393 +12,386 @@ #include "deserializers_parquet.h" CsvSink::CsvSink(std::string baseFilename, xdbc::RuntimeEnv *runtimeEnv) - : baseFilename(std::move(baseFilename)), runtimeEnv(runtimeEnv) + : baseFilename(std::move(baseFilename)), runtimeEnv(runtimeEnv) { - bufferPool = runtimeEnv->bp; - auto console = spdlog::stdout_color_mt("XDBC.CSVSINK"); + bufferPool = runtimeEnv->bp; + auto console = spdlog::stdout_color_mt("XDBC.CSVSINK"); } void CsvSink::serialize(int thr) { - runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "start"}); - - size_t writtenBuffers = 0; - size_t writtenTuples = 0; - - spdlog::get("XDBC.CSVSINK")->info("CSV Serializer started thread {}", thr); - - if (runtimeEnv->skip_serializer) - { - - while (true) - { - int bufferId = runtimeEnv->decompressedBufferIds->pop(); - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - - if (bufferId == -1) - break; - - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); - - runtimeEnv->serializedBufferIds->push(bufferId); - writtenBuffers++; - } - } - else - { - const auto &schema = runtimeEnv->schema; - size_t schemaSize = schema.size(); - - // Precompute sizes, serializers, and maximum tuple size - std::vector sizes(schemaSize); - std::vector delimiters(schemaSize); - using SerializeFunc = size_t (*)(const void *, char *, size_t, char); - std::vector serializers(schemaSize); - - using ParquetSerializeFunc = size_t (*)(parquet::StreamReader &stream, char *, size_t, char); - std::vector parquetSerializers(schemaSize); - - size_t maxTupleSize = 0; - for (size_t i = 0; i < schemaSize; ++i) - { - - if (schema[i].tpe[0] == 'I') - { - sizes[i] = 4; // sizeof(int) - serializers[i] = SerializeAttribute; - parquetSerializers[i] = SerializeParquetAttribute; - maxTupleSize += 12; // Pessimistic size for integer serialization - } - else if (schema[i].tpe[0] == 'D') - { - sizes[i] = 8; // sizeof(double) - serializers[i] = SerializeAttribute; - parquetSerializers[i] = SerializeParquetAttribute; - maxTupleSize += 24; // Pessimistic size for double serialization - } - else if (schema[i].tpe[0] == 'C') - { - sizes[i] = 1; // sizeof(char) - serializers[i] = SerializeAttribute; - parquetSerializers[i] = SerializeParquetAttribute; - maxTupleSize += 2; // Single character + delimiter - } - else if (schema[i].tpe[0] == 'S') - { - sizes[i] = schema[i].size; - serializers[i] = SerializeAttribute; - parquetSerializers[i] = SerializeParquetAttribute; - maxTupleSize += schema[i].size + 1; // Fixed string size + delimiter - } - - delimiters[i] = (i == schemaSize - 1) ? '\n' : ','; // Newline for the last attribute, commas for others - } - - // TODO: only for format 1 - std::vector columnOffsets(schemaSize); - size_t totalRowSize = 0; - for (size_t j = 0; j < schemaSize; ++j) - { - columnOffsets[j] = totalRowSize; - totalRowSize += sizes[j]; - } - - // TODO: only for format 3 (arrow) - std::vector> dataExtractors(schemaSize); - - size_t bufferSizeInBytes = runtimeEnv->buffer_size * 1024; - - while (true) - { - - int bufferId = runtimeEnv->decompressedBufferIds->pop(); - - if (bufferId == -1) - break; - - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); - - const auto &inBufferPtr = (*bufferPool)[bufferId]; - auto header = *reinterpret_cast(inBufferPtr.data()); - if (header.totalTuples > runtimeEnv->tuples_per_buffer || header.totalSize > runtimeEnv->buffer_size * 1024) - spdlog::get("XDBC.CSVSINK")->error("Size of buffer larger than expected tuples:{}/{}, size {}/{}", header.totalTuples, runtimeEnv->tuples_per_buffer, header.totalSize, runtimeEnv->buffer_size * 1024); - - const char *basePtr = reinterpret_cast(inBufferPtr.data() + sizeof(xdbc::Header)); - - if (header.intermediateFormat == 1 || header.intermediateFormat == 2 || header.intermediateFormat == 3) - { - // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); - - std::vector> arrays; // To store Arrow arrays for `iformat == 3` - - if (runtimeEnv->iformat == 3) - { - // Deserialize Arrow RecordBatch from raw memory - const auto *bufferData = reinterpret_cast(inBufferPtr.data() + - sizeof(xdbc::Header)); - - auto arrowBuffer = std::make_shared(bufferData, header.totalSize); - auto bufferReader = std::make_shared(arrowBuffer); - - // Open a FileReader or StreamReader - auto reader = arrow::ipc::RecordBatchFileReader::Open(bufferReader).ValueOrDie(); - auto arrowSchema = reader->schema(); - - auto recordBatch = reader->ReadRecordBatch(0).ValueOrDie(); - - // Extract column arrays - arrays = recordBatch->columns(); - - // Precompute accessors for Arrow arrays - for (size_t j = 0; j < schemaSize; ++j) - { - switch (arrays[j]->type_id()) - { - case arrow::Type::INT32: - { - auto intArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [intArray](int i) - { - return reinterpret_cast(intArray->raw_values() + i); - }; - break; - } - case arrow::Type::DOUBLE: - { - auto doubleArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [doubleArray](int i) - { - return reinterpret_cast(doubleArray->raw_values() + i); - }; - break; - } - case arrow::Type::STRING: - { - auto stringArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [stringArray](int i) - { - return stringArray->GetString(i).c_str(); - }; - break; - } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixedArray = std::static_pointer_cast(arrays[j]); - dataExtractors[j] = [fixedArray](int i) - { - return reinterpret_cast(fixedArray->GetValue(i)); - }; - break; - } - default: - throw std::runtime_error("Unsupported Arrow array type for serialization."); - } - } - } - - int serializedBufferId = runtimeEnv->freeBufferIds->pop(); - - auto &outBuffer = (*bufferPool)[serializedBufferId]; - char *writePtr = reinterpret_cast(outBuffer.data() + sizeof(xdbc::Header)); - size_t totalSerializedBytes = 0; - - std::vector columnStartPointers(schemaSize); - size_t cumulativeOffset = 0; - for (size_t k = 0; k < schemaSize; ++k) - { - columnStartPointers[k] = basePtr + cumulativeOffset; - // TODO: check this, maybe write header.totalTuples instead of tuples_per_buffer - cumulativeOffset += - runtimeEnv->tuples_per_buffer * sizes[k]; // Move by the total size of this column - } - - for (size_t i = 0; i < header.totalTuples; ++i) - { - if (totalSerializedBytes + maxTupleSize > bufferSizeInBytes) - { - // Buffer is full, push it to the queue - xdbc::Header head{}; - head.totalSize = totalSerializedBytes; - std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", - "push"}); - runtimeEnv->serializedBufferIds->push(serializedBufferId); - - // Fetch a new buffer - serializedBufferId = runtimeEnv->freeBufferIds->pop(); - outBuffer = (*bufferPool)[serializedBufferId]; - writePtr = reinterpret_cast(outBuffer.data() + sizeof(xdbc::Header)); - totalSerializedBytes = 0; - writtenBuffers++; - } - - for (size_t j = 0; j < schemaSize; ++j) - { - const char *dataPtr; - if (runtimeEnv->iformat == 1) - { - dataPtr = basePtr + i * totalRowSize + columnOffsets[j]; - } - else if (runtimeEnv->iformat == 2) - { - dataPtr = columnStartPointers[j] + i * sizes[j]; - } - else if (runtimeEnv->iformat == 3) - { - dataPtr = dataExtractors[j](i); - } - - totalSerializedBytes += serializers[j]( - dataPtr, writePtr + totalSerializedBytes, sizes[j], delimiters[j]); - } - } - writtenTuples += header.totalTuples; - - // Write any remaining data to the buffer - if (totalSerializedBytes > 0) - { - - xdbc::Header head{}; - head.totalSize = totalSerializedBytes; - std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); - - runtimeEnv->serializedBufferIds->push(serializedBufferId); - writtenBuffers++; - } - } - if (header.intermediateFormat == 4) - { - // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); - auto writeBuff = runtimeEnv->freeBufferIds->pop(); - - char *writeBuffPtr = reinterpret_cast((*bufferPool)[writeBuff].data() + sizeof(xdbc::Header)); - - int numRows = 0; - size_t parquetFileSize; - - parquetFileSize = header.totalSize; - - auto arrow_buffer = std::make_shared(reinterpret_cast(basePtr), - parquetFileSize); - auto buffer_reader = std::make_shared(arrow_buffer); - - // Initialize the StreamReader - parquet::StreamReader stream{parquet::ParquetFileReader::Open(buffer_reader)}; - - size_t totalSerializedBytes = 0; - while (!stream.eof()) - { - - for (size_t j = 0; j < schemaSize; ++j) - { - totalSerializedBytes += parquetSerializers[j](stream, writeBuffPtr + totalSerializedBytes, - sizes[j], delimiters[j]); - } - stream >> parquet::EndRow; - numRows++; - - if (totalSerializedBytes + 1000 > runtimeEnv->buffer_size * 1024) - { - xdbc::Header head{}; - head.totalSize = totalSerializedBytes; - head.totalTuples = numRows; - - std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); - - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", - "push"}); - - runtimeEnv->serializedBufferIds->push(writeBuff); - - writeBuff = runtimeEnv->freeBufferIds->pop(); - writeBuffPtr = reinterpret_cast((*bufferPool)[writeBuff].data() + sizeof(xdbc::Header)); - - totalSerializedBytes = 0; - numRows = 0; - } - } - - // write remaining - if (totalSerializedBytes > 0) - { - xdbc::Header head{}; - head.totalSize = totalSerializedBytes; - head.totalTuples = numRows; - - std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); - runtimeEnv->serializedBufferIds->push(writeBuff); - } - } - - // Release decompressed buffer back to freeBufferIds - runtimeEnv->freeBufferIds->push(bufferId); - } - } - - runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "end"}); - - spdlog::get("XDBC.CSVSINK")->info("CSV Serializer stopping thread {}, written buffers: {}, tuples: {}", thr, writtenBuffers, writtenTuples); - - runtimeEnv->finishedSerializerThreads.fetch_add(1); - // if (runtimeEnv->finishedSerializerThreads == runtimeEnv->ser_parallelism) - // { - // for (int i = 0; i < runtimeEnv->write_parallelism; ++i) - // { - // // runtimeEnv->serializedBufferIds->push(-1); // Termination signal - // } - // } + runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "start"}); + + size_t writtenBuffers = 0; + size_t writtenTuples = 0; + + spdlog::get("XDBC.CSVSINK")->info("CSV Serializer started thread {}", thr); + + if (runtimeEnv->skip_serializer) + { + + while (true) + { + int bufferId = runtimeEnv->decompressedBufferIds->pop(); + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); + + if (bufferId == -1) + break; + + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + + runtimeEnv->serializedBufferIds->push(bufferId); + writtenBuffers++; + } + } + else + { + const auto &schema = runtimeEnv->schema; + size_t schemaSize = schema.size(); + + // Precompute sizes, serializers, and maximum tuple size + std::vector sizes(schemaSize); + std::vector delimiters(schemaSize); + using SerializeFunc = size_t (*)(const void *, char *, size_t, char); + std::vector serializers(schemaSize); + + using ParquetSerializeFunc = size_t (*)(parquet::StreamReader &stream, char *, size_t, char); + std::vector parquetSerializers(schemaSize); + + size_t maxTupleSize = 0; + for (size_t i = 0; i < schemaSize; ++i) + { + + if (schema[i].tpe[0] == 'I') + { + sizes[i] = 4; // sizeof(int) + serializers[i] = SerializeAttribute; + parquetSerializers[i] = SerializeParquetAttribute; + maxTupleSize += 12; // Pessimistic size for integer serialization + } + else if (schema[i].tpe[0] == 'D') + { + sizes[i] = 8; // sizeof(double) + serializers[i] = SerializeAttribute; + parquetSerializers[i] = SerializeParquetAttribute; + maxTupleSize += 24; // Pessimistic size for double serialization + } + else if (schema[i].tpe[0] == 'C') + { + sizes[i] = 1; // sizeof(char) + serializers[i] = SerializeAttribute; + parquetSerializers[i] = SerializeParquetAttribute; + maxTupleSize += 2; // Single character + delimiter + } + else if (schema[i].tpe[0] == 'S') + { + sizes[i] = schema[i].size; + serializers[i] = SerializeAttribute; + parquetSerializers[i] = SerializeParquetAttribute; + maxTupleSize += schema[i].size + 1; // Fixed string size + delimiter + } + + delimiters[i] = (i == schemaSize - 1) ? '\n' : ','; // Newline for the last attribute, commas for others + } + + // TODO: only for format 1 + std::vector columnOffsets(schemaSize); + size_t totalRowSize = 0; + for (size_t j = 0; j < schemaSize; ++j) + { + columnOffsets[j] = totalRowSize; + totalRowSize += sizes[j]; + } + + // TODO: only for format 3 (arrow) + std::vector> dataExtractors(schemaSize); + + size_t bufferSizeInBytes = runtimeEnv->buffer_size * 1024; + + while (true) + { + + int bufferId = runtimeEnv->decompressedBufferIds->pop(); + + if (bufferId == -1) + break; + + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "pop"}); + + const auto &inBufferPtr = (*bufferPool)[bufferId]; + auto header = *reinterpret_cast(inBufferPtr.data()); + if (header.totalTuples > runtimeEnv->tuples_per_buffer || header.totalSize > runtimeEnv->buffer_size * 1024) + spdlog::get("XDBC.CSVSINK")->error("Size of buffer larger than expected tuples:{}/{}, size {}/{}", header.totalTuples, runtimeEnv->tuples_per_buffer, header.totalSize, runtimeEnv->buffer_size * 1024); + + const char *basePtr = reinterpret_cast(inBufferPtr.data() + sizeof(xdbc::Header)); + + if (header.intermediateFormat == 1 || header.intermediateFormat == 2 || header.intermediateFormat == 3) + { + // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); + + std::vector> arrays; // To store Arrow arrays for `iformat == 3` + + if (runtimeEnv->iformat == 3) + { + // Deserialize Arrow RecordBatch from raw memory + const auto *bufferData = reinterpret_cast(inBufferPtr.data() + + sizeof(xdbc::Header)); + + auto arrowBuffer = std::make_shared(bufferData, header.totalSize); + auto bufferReader = std::make_shared(arrowBuffer); + + // Open a FileReader or StreamReader + auto reader = arrow::ipc::RecordBatchFileReader::Open(bufferReader).ValueOrDie(); + auto arrowSchema = reader->schema(); + + auto recordBatch = reader->ReadRecordBatch(0).ValueOrDie(); + + // Extract column arrays + arrays = recordBatch->columns(); + + // Precompute accessors for Arrow arrays + for (size_t j = 0; j < schemaSize; ++j) + { + switch (arrays[j]->type_id()) + { + case arrow::Type::INT32: + { + auto intArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [intArray](int i) + { + return reinterpret_cast(intArray->raw_values() + i); + }; + break; + } + case arrow::Type::DOUBLE: + { + auto doubleArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [doubleArray](int i) + { + return reinterpret_cast(doubleArray->raw_values() + i); + }; + break; + } + case arrow::Type::STRING: + { + auto stringArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [stringArray](int i) + { + return stringArray->GetString(i).c_str(); + }; + break; + } + case arrow::Type::FIXED_SIZE_BINARY: + { + auto fixedArray = std::static_pointer_cast(arrays[j]); + dataExtractors[j] = [fixedArray](int i) + { + return reinterpret_cast(fixedArray->GetValue(i)); + }; + break; + } + default: + throw std::runtime_error("Unsupported Arrow array type for serialization."); + } + } + } + + int serializedBufferId = runtimeEnv->freeBufferIds->pop(); + + auto &outBuffer = (*bufferPool)[serializedBufferId]; + char *writePtr = reinterpret_cast(outBuffer.data() + sizeof(xdbc::Header)); + size_t totalSerializedBytes = 0; + + std::vector columnStartPointers(schemaSize); + size_t cumulativeOffset = 0; + for (size_t k = 0; k < schemaSize; ++k) + { + columnStartPointers[k] = basePtr + cumulativeOffset; + // TODO: check this, maybe write header.totalTuples instead of tuples_per_buffer + cumulativeOffset += + runtimeEnv->tuples_per_buffer * sizes[k]; // Move by the total size of this column + } + + for (size_t i = 0; i < header.totalTuples; ++i) + { + if (totalSerializedBytes + maxTupleSize > bufferSizeInBytes) + { + // Buffer is full, push it to the queue + xdbc::Header head{}; + head.totalSize = totalSerializedBytes; + std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", + "push"}); + runtimeEnv->serializedBufferIds->push(serializedBufferId); + + // Fetch a new buffer + serializedBufferId = runtimeEnv->freeBufferIds->pop(); + outBuffer = (*bufferPool)[serializedBufferId]; + writePtr = reinterpret_cast(outBuffer.data() + sizeof(xdbc::Header)); + totalSerializedBytes = 0; + writtenBuffers++; + } + + for (size_t j = 0; j < schemaSize; ++j) + { + const char *dataPtr; + if (runtimeEnv->iformat == 1) + { + dataPtr = basePtr + i * totalRowSize + columnOffsets[j]; + } + else if (runtimeEnv->iformat == 2) + { + dataPtr = columnStartPointers[j] + i * sizes[j]; + } + else if (runtimeEnv->iformat == 3) + { + dataPtr = dataExtractors[j](i); + } + + totalSerializedBytes += serializers[j]( + dataPtr, writePtr + totalSerializedBytes, sizes[j], delimiters[j]); + } + } + writtenTuples += header.totalTuples; + + // Write any remaining data to the buffer + if (totalSerializedBytes > 0) + { + + xdbc::Header head{}; + head.totalSize = totalSerializedBytes; + std::memcpy(outBuffer.data(), &head, sizeof(xdbc::Header)); + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "push"}); + + runtimeEnv->serializedBufferIds->push(serializedBufferId); + writtenBuffers++; + } + } + if (header.intermediateFormat == 4) + { + // spdlog::get("XDBC.CSVSINK")->info("using iformat 1,2,3"); + auto writeBuff = runtimeEnv->freeBufferIds->pop(); + + char *writeBuffPtr = reinterpret_cast((*bufferPool)[writeBuff].data() + sizeof(xdbc::Header)); + + int numRows = 0; + size_t parquetFileSize; + + parquetFileSize = header.totalSize; + + auto arrow_buffer = std::make_shared(reinterpret_cast(basePtr), + parquetFileSize); + auto buffer_reader = std::make_shared(arrow_buffer); + + // Initialize the StreamReader + parquet::StreamReader stream{parquet::ParquetFileReader::Open(buffer_reader)}; + + size_t totalSerializedBytes = 0; + while (!stream.eof()) + { + + for (size_t j = 0; j < schemaSize; ++j) + { + totalSerializedBytes += parquetSerializers[j](stream, writeBuffPtr + totalSerializedBytes, + sizes[j], delimiters[j]); + } + stream >> parquet::EndRow; + numRows++; + + if (totalSerializedBytes + 1000 > runtimeEnv->buffer_size * 1024) + { + xdbc::Header head{}; + head.totalSize = totalSerializedBytes; + head.totalTuples = numRows; + + std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); + + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", + "push"}); + + runtimeEnv->serializedBufferIds->push(writeBuff); + + writeBuff = runtimeEnv->freeBufferIds->pop(); + writeBuffPtr = reinterpret_cast((*bufferPool)[writeBuff].data() + sizeof(xdbc::Header)); + + totalSerializedBytes = 0; + numRows = 0; + } + } + + // write remaining + if (totalSerializedBytes > 0) + { + xdbc::Header head{}; + head.totalSize = totalSerializedBytes; + head.totalTuples = numRows; + + std::memcpy((*bufferPool)[writeBuff].data(), &head, sizeof(xdbc::Header)); + runtimeEnv->serializedBufferIds->push(writeBuff); + } + } + + // Release decompressed buffer back to freeBufferIds + runtimeEnv->freeBufferIds->push(bufferId); + } + } + + runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "ser", "end"}); + + spdlog::get("XDBC.CSVSINK")->info("CSV Serializer stopping thread {}, written buffers: {}, tuples: {}", thr, writtenBuffers, writtenTuples); + + runtimeEnv->finishedSerializerThreads.fetch_add(1); } void CsvSink::write(int thr) { - runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "start"}); + runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "start"}); - spdlog::get("XDBC.CSVSINK")->info("CSV Writer started thread {}", thr); - std::ofstream outputFile; - std::string fileName = baseFilename + "_thread_" + std::to_string(thr) + ".csv"; - size_t buffersWritten = 0; + spdlog::get("XDBC.CSVSINK")->info("CSV Writer started thread {}", thr); + std::ofstream outputFile; + std::string fileName = baseFilename + "_thread_" + std::to_string(thr) + ".csv"; + size_t buffersWritten = 0; - outputFile.open(fileName, std::ios::out | std::ios::binary); - if (!outputFile.is_open()) - { - throw std::runtime_error("Failed to open output file: " + fileName); - } + outputFile.open(fileName, std::ios::out | std::ios::binary); + if (!outputFile.is_open()) + { + throw std::runtime_error("Failed to open output file: " + fileName); + } - while (true) - { - int bufferId = runtimeEnv->serializedBufferIds->pop(); - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); + while (true) + { + int bufferId = runtimeEnv->serializedBufferIds->pop(); + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "pop"}); - // spdlog::get("XDBC.CSVSINK")->info("CSV Writer {} got serialized buff {}", thr, bufferId); + // spdlog::get("XDBC.CSVSINK")->info("CSV Writer {} got serialized buff {}", thr, bufferId); - if (bufferId == -1) - break; + if (bufferId == -1) + break; - const auto &serializedBuffer = (*bufferPool)[bufferId]; - auto header = *reinterpret_cast(serializedBuffer.data()); + const auto &serializedBuffer = (*bufferPool)[bufferId]; + auto header = *reinterpret_cast(serializedBuffer.data()); - const char *dataPtr = reinterpret_cast(serializedBuffer.data() + sizeof(xdbc::Header)); - outputFile.write(dataPtr, header.totalSize); + const char *dataPtr = reinterpret_cast(serializedBuffer.data() + sizeof(xdbc::Header)); + outputFile.write(dataPtr, header.totalSize); - runtimeEnv->pts->push( - xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); + runtimeEnv->pts->push( + xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "push"}); - runtimeEnv->freeBufferIds->push(bufferId); - buffersWritten++; - runtimeEnv->tf_paras.bufProcessed.at(thr) = buffersWritten; - } + runtimeEnv->freeBufferIds->push(bufferId); + buffersWritten++; + runtimeEnv->tf_paras.bufProcessed.at(thr) = buffersWritten; + } - outputFile.close(); - runtimeEnv->finishedWriteThreads.fetch_add(1); - runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "end"}); - spdlog::get("XDBC.CSVSINK")->info("CSV Writer thread {} wrote buffers: {}", thr, buffersWritten); + outputFile.close(); + runtimeEnv->finishedWriteThreads.fetch_add(1); + runtimeEnv->pts->push(xdbc::ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "write", "end"}); + spdlog::get("XDBC.CSVSINK")->info("CSV Writer thread {} wrote buffers: {}", thr, buffersWritten); } \ No newline at end of file diff --git a/Sinks/main.cpp b/Sinks/main.cpp index f3e2dd1..9107b16 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -13,224 +13,216 @@ // Utility functions for schema handling static xdbc::SchemaAttribute createSchemaAttribute(std::string name, std::string tpe, int size) { - xdbc::SchemaAttribute att; - att.name = std::move(name); - att.tpe = std::move(tpe); - att.size = size; - return att; + xdbc::SchemaAttribute att; + att.name = std::move(name); + att.tpe = std::move(tpe); + att.size = size; + return att; } std::string formatSchema(const std::vector &schema) { - std::stringstream ss; - ss << std::setw(20) << std::left << "Name" - << std::setw(15) << std::left << "Type" - << std::setw(10) << std::left << "Size" << '\n'; - - for (const auto &tuple : schema) - { - ss << std::setw(20) << std::left << tuple.name - << std::setw(15) << std::left << tuple.tpe - << std::setw(10) << std::left << tuple.size << '\n'; - } - return ss.str(); + std::stringstream ss; + ss << std::setw(20) << std::left << "Name" + << std::setw(15) << std::left << "Type" + << std::setw(10) << std::left << "Size" << '\n'; + + for (const auto &tuple : schema) + { + ss << std::setw(20) << std::left << tuple.name + << std::setw(15) << std::left << tuple.tpe + << std::setw(10) << std::left << tuple.size << '\n'; + } + return ss.str(); } std::vector createSchemaFromConfig(const std::string &configFile) { - std::ifstream file(configFile); - if (!file.is_open()) - { - spdlog::get("XDBC.SINK")->error("Failed to open schema file: {0}", configFile); - exit(EXIT_FAILURE); - } - - nlohmann::json schemaJson; - file >> schemaJson; - - std::vector schema; - for (const auto &item : schemaJson) - { - schema.emplace_back(xdbc::SchemaAttribute{ - item["name"], item["type"], item["size"]}); - } - return schema; + std::ifstream file(configFile); + if (!file.is_open()) + { + spdlog::get("XDBC.SINK")->error("Failed to open schema file: {0}", configFile); + exit(EXIT_FAILURE); + } + + nlohmann::json schemaJson; + file >> schemaJson; + + std::vector schema; + for (const auto &item : schemaJson) + { + schema.emplace_back(xdbc::SchemaAttribute{ + item["name"], item["type"], item["size"]}); + } + return schema; } std::string readJsonFileIntoString(const std::string &filePath) { - std::ifstream file(filePath); - if (!file.is_open()) - { - spdlog::get("XDBC.SINK")->error("Failed to open schema file: {0}", filePath); - exit(EXIT_FAILURE); - } - - std::stringstream buffer; - buffer << file.rdbuf(); - return buffer.str(); + std::ifstream file(filePath); + if (!file.is_open()) + { + spdlog::get("XDBC.SINK")->error("Failed to open schema file: {0}", filePath); + exit(EXIT_FAILURE); + } + + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); } void handleSinkCMDParams(int argc, char *argv[], xdbc::RuntimeEnv &env, std::string &outputBasePath) { - namespace po = boost::program_options; - - po::options_description desc("Usage: ./csvsink [options]\n\nAllowed options"); - desc.add_options()("help,h", "Produce help message.")("server-host,a", po::value()->default_value("xdbcserver"), - "Server Host: \nDefault:\n xdbcserver")("server-port", po::value()->default_value("1234"), - "Server port: \nDefault:\n 1234")("table,e", po::value()->default_value("lineitem_sf10"), "Input table name.")("output,o", po::value()->default_value("/dev/shm/output"), "Output CSV base file path.")("buffer-size,b", po::value()->default_value(64), "Buffer size in KiB.")("bufferpool-size,p", po::value()->default_value(4096), "Buffer pool size in KiB.")("net-parallelism,n", po::value()->default_value(1), "Set the network parallelism grade.\nDefault: 1")("decomp-parallelism,d", po::value()->default_value(1), "Decompression Parallelism.\nDefault: 1")("serialize-parallelism,s", po::value()->default_value(1), "Number of serializer threads.")("write-parallelism,w", po::value()->default_value(1), "Number of write threads.")("intermediate-format,f", po::value()->default_value(1), - "Intermediate format: 1 (row) or 2 (column).")("transfer-id,tid", po::value()->default_value(0), - "Set the transfer id.\nDefault: 0")("profiling-interval", po::value()->default_value(1000), - "Set profiling interval.\nDefault: 1000")("skip-serializer", po::value()->default_value(0), - "Skip serialization (0/1).\nDefault: false")("target", po::value()->default_value("csv"), - "Target (csv, parquet).\nDefault: csv")("spawn-source", po::value()->default_value(0), - "Set spawn source (0 means direct launch or 1 means spawned using controller).\nDefault: 0"); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - - if (vm.count("help")) - { - std::cout << desc << std::endl; - exit(0); - } - - try - { - po::notify(vm); - } - catch (po::required_option &e) - { - spdlog::get("XDBC.SINK")->error("Missing required options: {0}", e.what()); - exit(EXIT_FAILURE); - } - - env.env_name = "Sink"; - env.server_host = vm["server-host"].as(); - env.server_port = vm["server-port"].as(); - env.transfer_id = vm["transfer-id"].as(); - env.table = vm["table"].as(); - env.buffer_size = vm["buffer-size"].as(); - env.buffers_in_bufferpool = vm["bufferpool-size"].as() / vm["buffer-size"].as(); - env.rcv_parallelism = vm["net-parallelism"].as(); - env.decomp_parallelism = vm["decomp-parallelism"].as(); - env.ser_parallelism = vm["serialize-parallelism"].as(); - env.write_parallelism = vm["write-parallelism"].as(); - env.iformat = vm["intermediate-format"].as(); - env.target = vm["target"].as(); - env.profilingInterval = vm["profiling-interval"].as(); - outputBasePath = vm["output"].as(); - - env.skip_serializer = vm["skip-serializer"].as(); - env.spawn_source = vm["spawn-source"].as(); - - std::string schemaFile = "/xdbc-client/tests/schemas/" + env.table + ".json"; - - env.schema = createSchemaFromConfig(schemaFile); - env.schemaJSON = readJsonFileIntoString(schemaFile); - env.tuple_size = std::accumulate(env.schema.begin(), env.schema.end(), 0, - [](int acc, const xdbc::SchemaAttribute &attr) - { - return acc + attr.size; - }); - - env.tuples_per_buffer = (env.buffer_size * 1024) / env.tuple_size; - env.max_threads = env.buffers_in_bufferpool; - env.startTime = std::chrono::steady_clock::now(); - - spdlog::get("XDBC.SINK")->info("Table: {0}, Tuple size: {1}, Schema:\n{2}", env.table, env.tuple_size, formatSchema(env.schema)); + namespace po = boost::program_options; + + po::options_description desc("Usage: ./csvsink [options]\n\nAllowed options"); + desc.add_options()("help,h", "Produce help message.")("server-host,a", po::value()->default_value("xdbcserver"), + "Server Host: \nDefault:\n xdbcserver")("server-port", po::value()->default_value("1234"), + "Server port: \nDefault:\n 1234")("table,e", po::value()->default_value("lineitem_sf10"), "Input table name.")("output,o", po::value()->default_value("/dev/shm/output"), "Output CSV base file path.")("buffer-size,b", po::value()->default_value(64), "Buffer size in KiB.")("bufferpool-size,p", po::value()->default_value(4096), "Buffer pool size in KiB.")("net-parallelism,n", po::value()->default_value(1), "Set the network parallelism grade.\nDefault: 1")("decomp-parallelism,d", po::value()->default_value(1), "Decompression Parallelism.\nDefault: 1")("serialize-parallelism,s", po::value()->default_value(1), "Number of serializer threads.")("write-parallelism,w", po::value()->default_value(1), "Number of write threads.")("intermediate-format,f", po::value()->default_value(1), + "Intermediate format: 1 (row) or 2 (column).")("transfer-id,tid", po::value()->default_value(0), + "Set the transfer id.\nDefault: 0")("profiling-interval", po::value()->default_value(1000), + "Set profiling interval.\nDefault: 1000")("skip-serializer", po::value()->default_value(0), + "Skip serialization (0/1).\nDefault: false")("target", po::value()->default_value("csv"), + "Target (csv, parquet).\nDefault: csv")("spawn-source", po::value()->default_value(0), + "Set spawn source (0 means direct launch or 1 means spawned using controller).\nDefault: 0"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + + if (vm.count("help")) + { + std::cout << desc << std::endl; + exit(0); + } + + try + { + po::notify(vm); + } + catch (po::required_option &e) + { + spdlog::get("XDBC.SINK")->error("Missing required options: {0}", e.what()); + exit(EXIT_FAILURE); + } + + env.env_name = "Sink"; + env.server_host = vm["server-host"].as(); + env.server_port = vm["server-port"].as(); + env.transfer_id = vm["transfer-id"].as(); + env.table = vm["table"].as(); + env.buffer_size = vm["buffer-size"].as(); + env.buffers_in_bufferpool = vm["bufferpool-size"].as() / vm["buffer-size"].as(); + env.rcv_parallelism = vm["net-parallelism"].as(); + env.decomp_parallelism = vm["decomp-parallelism"].as(); + env.ser_parallelism = vm["serialize-parallelism"].as(); + env.write_parallelism = vm["write-parallelism"].as(); + env.iformat = vm["intermediate-format"].as(); + env.target = vm["target"].as(); + env.profilingInterval = vm["profiling-interval"].as(); + outputBasePath = vm["output"].as(); + + env.skip_serializer = vm["skip-serializer"].as(); + env.spawn_source = vm["spawn-source"].as(); + + std::string schemaFile = "/xdbc-client/tests/schemas/" + env.table + ".json"; + + env.schema = createSchemaFromConfig(schemaFile); + env.schemaJSON = readJsonFileIntoString(schemaFile); + env.tuple_size = std::accumulate(env.schema.begin(), env.schema.end(), 0, + [](int acc, const xdbc::SchemaAttribute &attr) + { + return acc + attr.size; + }); + + env.tuples_per_buffer = (env.buffer_size * 1024) / env.tuple_size; + env.max_threads = env.buffers_in_bufferpool; + env.startTime = std::chrono::steady_clock::now(); + + spdlog::get("XDBC.SINK")->info("Table: {0}, Tuple size: {1}, Schema:\n{2}", env.table, env.tuple_size, formatSchema(env.schema)); } nlohmann::json metrics_convert(xdbc::RuntimeEnv &env) { - nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object - if ((env.pts) && (env.enable_updation == 1)) - { - std::vector env_pts; - env_pts = env.pts->copy_newElements(); - auto component_metrics_ = calculate_metrics(env_pts, env.buffer_size); - for (const auto &pair : component_metrics_) - { - nlohmann::json metric_object = nlohmann::json::object(); - const Metrics &metric = pair.second; - - metric_object["waitingTime_ms"] = metric.waiting_time_ms; - metric_object["processingTime_ms"] = metric.processing_time_ms; - metric_object["totalTime_ms"] = metric.overall_time_ms; - - metric_object["totalThroughput"] = metric.total_throughput; - metric_object["perBufferThroughput"] = metric.per_buffer_throughput; - - metrics_json[pair.first] = metric_object; - } - } - return metrics_json; + nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object + if ((env.pts) && (env.enable_updation == 1)) + { + std::vector env_pts; + env_pts = env.pts->copy_newElements(); + auto component_metrics_ = calculate_metrics(env_pts, env.buffer_size); + for (const auto &pair : component_metrics_) + { + nlohmann::json metric_object = nlohmann::json::object(); + const Metrics &metric = pair.second; + + metric_object["waitingTime_ms"] = metric.waiting_time_ms; + metric_object["processingTime_ms"] = metric.processing_time_ms; + metric_object["totalTime_ms"] = metric.overall_time_ms; + + metric_object["totalThroughput"] = metric.total_throughput; + metric_object["perBufferThroughput"] = metric.per_buffer_throughput; + + metrics_json[pair.first] = metric_object; + } + } + return metrics_json; } nlohmann::json additional_msg(xdbc::RuntimeEnv &env) { - nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object - metrics_json["totalTime_ms"] = env.tf_paras.elapsed_time; - metrics_json["bufTransferred"] = std::accumulate(env.tf_paras.bufProcessed.begin(), env.tf_paras.bufProcessed.end(), 0); - metrics_json["freeBufferQ_load"] = std::get<0>(env.tf_paras.latest_queueSizes); - metrics_json["compressedBufferQ_load"] = std::get<1>(env.tf_paras.latest_queueSizes); - metrics_json["decompressedBufferQ_load"] = std::get<2>(env.tf_paras.latest_queueSizes); - metrics_json["serializedBufferQ_load"] = std::get<3>(env.tf_paras.latest_queueSizes); - - return metrics_json; + nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object + metrics_json["totalTime_ms"] = env.tf_paras.elapsed_time; + metrics_json["bufTransferred"] = std::accumulate(env.tf_paras.bufProcessed.begin(), env.tf_paras.bufProcessed.end(), 0); + metrics_json["freeBufferQ_load"] = std::get<0>(env.tf_paras.latest_queueSizes); + metrics_json["compressedBufferQ_load"] = std::get<1>(env.tf_paras.latest_queueSizes); + metrics_json["decompressedBufferQ_load"] = std::get<2>(env.tf_paras.latest_queueSizes); + metrics_json["serializedBufferQ_load"] = std::get<3>(env.tf_paras.latest_queueSizes); + + return metrics_json; } void env_convert(xdbc::RuntimeEnv &env, const nlohmann::json &env_json) { - try - { - // env.transfer_id = std::stoll(env_json.at("transferID").get()); - // env.table = env_json.at("table").get(); - // env.server_host = env_json.at("serverHost").get(); - // env.iformat = std::stoi(env_json.at("intermediateFormat").get()); - // env.sleep_time = std::chrono::milliseconds(std::stoll(env_json.at("sleepTime").get())); - // env.buffer_size = std::stoi(env_json.at("bufferSize").get()); - // env.buffers_in_bufferpool = std::stoi(env_json.at("bufferpoolSize").get()) / env_.buffer_size; - // env.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); - // env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); - // env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); - // env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); - - // Update the actual environment object if updates are allowed - if (env.enable_updation == 1) - { - env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); - env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); - env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); - } - } - catch (const std::exception &e) - { - std::cerr << "Error converting env JSON: " << e.what() << std::endl; - } + try + { + // env.buffer_size = std::stoi(env_json.at("bufferSize").get()); + // env.buffers_in_bufferpool = std::stoi(env_json.at("bufferpoolSize").get()) / env_.buffer_size; + // env.rcv_parallelism = std::stoi(env_json.at("netParallelism").get()); + + // Update the actual environment object if updates are allowed + if (env.enable_updation == 1) + { + env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); + env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); + env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); + } + } + catch (const std::exception &e) + { + std::cerr << "Error converting env JSON: " << e.what() << std::endl; + } } int main(int argc, char *argv[]) { - auto console = spdlog::stdout_color_mt("XDBC.SINK"); - spdlog::set_level(spdlog::level::info); + auto console = spdlog::stdout_color_mt("XDBC.SINK"); + spdlog::set_level(spdlog::level::info); - xdbc::RuntimeEnv env; - std::string outputBasePath; + xdbc::RuntimeEnv env; + std::string outputBasePath; - handleSinkCMDParams(argc, argv, env, outputBasePath); + handleSinkCMDParams(argc, argv, env, outputBasePath); - //*** - // Initialize XClient - xdbc::XClient xclient(env); - xclient.startReceiving(env.table); + //*** + // Initialize XClient + xdbc::XClient xclient(env); + xclient.startReceiving(env.table); - if (env.target == "csv") - { - CsvSink csvSink(outputBasePath, &env); + if (env.target == "csv") + { + CsvSink csvSink(outputBasePath, &env); - env.env_manager.registerOperation("serial", [&](int thr) - { try { + env.env_manager.registerOperation("serial", [&](int thr) + { try { if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; @@ -242,8 +234,8 @@ int main(int argc, char *argv[]) spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.decompressedBufferIds); - env.env_manager.registerOperation("write", [&](int thr) - { try { + env.env_manager.registerOperation("write", [&](int thr) + { try { if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; @@ -255,16 +247,16 @@ int main(int argc, char *argv[]) spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.serializedBufferIds); - env.env_manager.configureThreads("serial", env.ser_parallelism); - env.env_manager.configureThreads("write", env.write_parallelism); - } - else if (env.target == "parquet") - { - PQSink parquetSink(outputBasePath, &env); + env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("write", env.write_parallelism); + } + else if (env.target == "parquet") + { + PQSink parquetSink(outputBasePath, &env); - env.env_manager.registerOperation("serial", [&](int thr) - { try { - if (thr >= env.buffers_in_bufferpool) { + env.env_manager.registerOperation("serial", [&](int thr) + { try { + if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } @@ -275,9 +267,9 @@ int main(int argc, char *argv[]) spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.decompressedBufferIds); - env.env_manager.registerOperation("write", [&](int thr) - { try { - if (thr >= env.buffers_in_bufferpool) { + env.env_manager.registerOperation("write", [&](int thr) + { try { + if (thr >= env.max_threads) { spdlog::get("XCLIENT")->error("No of threads exceed limit"); return; } @@ -288,50 +280,50 @@ int main(int argc, char *argv[]) spdlog::get("XCLIENT")->error("Unknown exception in thread {}", thr); } }, env.serializedBufferIds); - env.env_manager.configureThreads("serial", env.ser_parallelism); // start serial component threads - env.env_manager.configureThreads("write", env.write_parallelism); // start write component threads - } - - // *** Setup websocket interface for controller *** - env.enable_updation = 1; - std::thread io_thread; - WebSocketClient ws_client("xdbc-controller", "8002"); - if (env.spawn_source == 1) - { - ws_client.start(); - io_thread = std::thread([&]() - { ws_client.run( - std::bind(&metrics_convert, std::ref(env)), - std::bind(&additional_msg, std::ref(env)), - std::bind(&env_convert, std::ref(env), std::placeholders::_1)); }); - } - while (env.enable_updation == 1) // Reconfigure threads as long as it is allowed - { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - env.env_manager.configureThreads("write", env.write_parallelism); - env.env_manager.configureThreads("serial", env.ser_parallelism); - env.env_manager.configureThreads("decompress", env.decomp_parallelism); - } - // *** Finished Setup websocket interface for controller *** - - // Wait for receive threads to finish, then kill the remaining components in proper sequence : decompress-serial-write - xclient.finishReceiving(); - env.env_manager.configureThreads("serial", 0); - env.env_manager.joinThreads("serial"); - env.env_manager.configureThreads("write", 0); - env.env_manager.joinThreads("write"); - - xclient.finalize(); - spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); - // *** Stop websocket client - if (env.spawn_source == 1) - { - ws_client.stop(); - if (io_thread.joinable()) - { - io_thread.join(); - } - } - - return 0; + env.env_manager.configureThreads("serial", env.ser_parallelism); // start serial component threads + env.env_manager.configureThreads("write", env.write_parallelism); // start write component threads + } + + // *** Setup websocket interface for controller *** + std::thread io_thread; + WebSocketClient ws_client("xdbc-controller", "8002"); + if (env.spawn_source == 1) + { + env.enable_updation = 1; + ws_client.start(); + io_thread = std::thread([&]() + { ws_client.run( + std::bind(&metrics_convert, std::ref(env)), + std::bind(&additional_msg, std::ref(env)), + std::bind(&env_convert, std::ref(env), std::placeholders::_1)); }); + } + while (env.enable_updation == 1) // Reconfigure threads as long as it is allowed + { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + env.env_manager.configureThreads("write", env.write_parallelism); + env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("decompress", env.decomp_parallelism); + } + // *** Finished Setup websocket interface for controller *** + + // Wait for receive threads to finish, then kill the remaining components in proper sequence : decompress-serial-write + xclient.finishReceiving(); + env.env_manager.configureThreads("serial", 0); + env.env_manager.joinThreads("serial"); + env.env_manager.configureThreads("write", 0); + env.env_manager.joinThreads("write"); + + xclient.finalize(); + spdlog::get("XDBC.CSVSINK")->info("{} serialization completed. Output files are available at: {}", env.target, outputBasePath); + // *** Stop websocket client + if (env.spawn_source == 1) + { + ws_client.stop(); + if (io_thread.joinable()) + { + io_thread.join(); + } + } + + return 0; } diff --git a/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp b/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp index 3cf9790..9abdacf 100644 --- a/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp +++ b/xdbc/EnvironmentReconfigure/EnvironmentManager.cpp @@ -137,7 +137,7 @@ void EnvironmentManager::run() spdlog::error("Unknown exception in thread {}", thread_id); } }); } - spdlog::info("Reconfigure thread for operation {0} by {1}", name, delta_threads); + spdlog::info("Reconfigure threads for operation {0} by +{1}", name, delta_threads); } else if (delta_threads < 0) { @@ -151,7 +151,7 @@ void EnvironmentManager::run() } operation.poisonQueue->push(-1); } - spdlog::info("Reconfigure thread for operation {0} by {1}", name, delta_threads); + spdlog::info("Reconfigure threads for operation {0} by {1}", name, delta_threads); } operation.active_threads = operation.desired_threads; diff --git a/xdbc/RuntimeEnv.h b/xdbc/RuntimeEnv.h index 15e16fd..638b003 100644 --- a/xdbc/RuntimeEnv.h +++ b/xdbc/RuntimeEnv.h @@ -9,7 +9,6 @@ #include #include #include "EnvironmentReconfigure/EnvironmentManager.h" -// #include "customQueue.h" namespace xdbc { diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 33ef880..8be3ae3 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -475,11 +475,8 @@ namespace xdbc } _xdbcenv->finishedRcvThreads.fetch_add(1); - // Not needed anymore if (_xdbcenv->finishedRcvThreads == _xdbcenv->rcv_parallelism) { - // for (int i = 0; i < _xdbcenv->decomp_parallelism; i++) - // _xdbcenv->compressedBufferIds->push(-1); _xdbcenv->enable_updation = 0; } socket.close(); @@ -574,12 +571,6 @@ namespace xdbc } _xdbcenv->finishedDecompThreads.fetch_add(1); - // *****************Not needed anymore ***************** - // if (_xdbcenv->finishedDecompThreads == _xdbcenv->decomp_parallelism) - // { - // for (int i = 0; i < _xdbcenv->ser_parallelism; i++) - // _xdbcenv->decompressedBufferIds->push(-1); - // } spdlog::get("XDBC.CLIENT")->warn("Decomp thread {0} finished, {1} buffers", thr, buffersDecompressed); _xdbcenv->pts->push(ProfilingTimestamps{std::chrono::high_resolution_clock::now(), thr, "decomp", "end"}); } From b2375a00da10cf1fdfd9d74c45876df0cdad51e0 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Sun, 9 Mar 2025 22:31:32 +0100 Subject: [PATCH 10/16] Reconfigure inside websocket --- Sinks/main.cpp | 18 +++++++++-------- xdbc/customQueue.h | 49 ++++++++++++++++++++-------------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 9107b16..9cdc2c2 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -172,11 +172,13 @@ nlohmann::json additional_msg(xdbc::RuntimeEnv &env) nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object metrics_json["totalTime_ms"] = env.tf_paras.elapsed_time; metrics_json["bufTransferred"] = std::accumulate(env.tf_paras.bufProcessed.begin(), env.tf_paras.bufProcessed.end(), 0); - metrics_json["freeBufferQ_load"] = std::get<0>(env.tf_paras.latest_queueSizes); - metrics_json["compressedBufferQ_load"] = std::get<1>(env.tf_paras.latest_queueSizes); - metrics_json["decompressedBufferQ_load"] = std::get<2>(env.tf_paras.latest_queueSizes); - metrics_json["serializedBufferQ_load"] = std::get<3>(env.tf_paras.latest_queueSizes); - + if (env.enable_updation == 1) + { + metrics_json["freeBufferQ_load"] = std::get<0>(env.tf_paras.latest_queueSizes); + metrics_json["compressedBufferQ_load"] = std::get<1>(env.tf_paras.latest_queueSizes); + metrics_json["decompressedBufferQ_load"] = std::get<2>(env.tf_paras.latest_queueSizes); + metrics_json["serializedBufferQ_load"] = std::get<3>(env.tf_paras.latest_queueSizes); + } return metrics_json; } @@ -194,6 +196,9 @@ void env_convert(xdbc::RuntimeEnv &env, const nlohmann::json &env_json) env.write_parallelism = std::stoi(env_json.at("writeParallelism").get()); env.decomp_parallelism = std::stoi(env_json.at("decompParallelism").get()); env.ser_parallelism = std::stoi(env_json.at("serParallelism").get()); + env.env_manager.configureThreads("write", env.write_parallelism); + env.env_manager.configureThreads("serial", env.ser_parallelism); + env.env_manager.configureThreads("decompress", env.decomp_parallelism); } } catch (const std::exception &e) @@ -300,9 +305,6 @@ int main(int argc, char *argv[]) while (env.enable_updation == 1) // Reconfigure threads as long as it is allowed { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - env.env_manager.configureThreads("write", env.write_parallelism); - env.env_manager.configureThreads("serial", env.ser_parallelism); - env.env_manager.configureThreads("decompress", env.decomp_parallelism); } // *** Finished Setup websocket interface for controller *** diff --git a/xdbc/customQueue.h b/xdbc/customQueue.h index 8c83d8e..462058d 100644 --- a/xdbc/customQueue.h +++ b/xdbc/customQueue.h @@ -2,9 +2,8 @@ #include #include -template -class customQueue -{ +template +class customQueue { private: std::mutex d_mutex; std::condition_variable d_condition; @@ -16,36 +15,30 @@ class customQueue public: explicit customQueue(size_t max_capacity = 0) : capacity(max_capacity) {} - void push(T const &value) - { + void push(T const &value) { { std::unique_lock lock(this->d_mutex); - this->d_space_available.wait(lock, [=] - { return capacity == 0 || d_queue.size() < capacity; }); + this->d_space_available.wait(lock, [=] { return capacity == 0 || d_queue.size() < capacity; }); d_queue.push_front(value); } this->d_condition.notify_all(); } - T pop() - { + T pop() { std::unique_lock lock(this->d_mutex); - this->d_condition.wait(lock, [=] - { return !this->d_queue.empty(); }); + this->d_condition.wait(lock, [=] { return !this->d_queue.empty(); }); T rc(std::move(this->d_queue.back())); this->d_queue.pop_back(); this->d_space_available.notify_all(); // Notify threads waiting for space return rc; } - [[nodiscard]] size_t size() - { + [[nodiscard]] size_t size() { std::unique_lock lock(this->d_mutex); return d_queue.size(); } - void setCapacity(size_t new_capacity) - { + void setCapacity(size_t new_capacity) { { std::unique_lock lock(this->d_mutex); capacity = new_capacity; @@ -54,23 +47,23 @@ class customQueue } // Get the current capacity - [[nodiscard]] size_t getCapacity() const - { + [[nodiscard]] size_t getCapacity() const { return capacity; } - std::vector copy_newElements() - { - static size_t lastCopiedIndex = 0; - std::vector new_elements; - auto current_index = this->d_queue.size(); // Use this->d_queue - - if (lastCopiedIndex < current_index) + std::vector copy_newElements() { + static size_t lastCopiedIndex = 0; // Tracks the last copied position + std::vector new_elements; // To store new elements + auto current_index = d_queue.size(); { - new_elements.assign(this->d_queue.rbegin(), // Use this->d_queue - this->d_queue.rbegin() + (this->d_queue.size() - lastCopiedIndex)); - lastCopiedIndex = current_index; + // std::unique_lock lock(this->d_mutex); // Lock for thread safety + if (lastCopiedIndex < + current_index) { // Check if there are new elements + new_elements.assign(d_queue.rbegin(), d_queue.rbegin() + (d_queue.size() - + lastCopiedIndex)); // Reverse copy the new elements + lastCopiedIndex = current_index; // Update the index for the next call + } } - return new_elements; + return new_elements; // Return new elements in reverse order } }; From e66248009e5042d4caef216d7b86c285bb736db9 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Tue, 18 Mar 2025 21:48:51 +0100 Subject: [PATCH 11/16] Modify websocket to wait for command from controller --- xdbc/ControllerInterface/WebSocketClient.cpp | 117 ++++++++++++------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/xdbc/ControllerInterface/WebSocketClient.cpp b/xdbc/ControllerInterface/WebSocketClient.cpp index 8dcc538..16dcf7d 100644 --- a/xdbc/ControllerInterface/WebSocketClient.cpp +++ b/xdbc/ControllerInterface/WebSocketClient.cpp @@ -3,11 +3,13 @@ #include WebSocketClient::WebSocketClient(const std::string &host, const std::string &port) - : host_(host), port_(port), resolver_(ioc_), ws_(ioc_), timer_(ioc_), active_(false), stop_thread_(false), - operation_started_(false) {} + : host_(host), port_(port), resolver_(ioc_), ws_(ioc_), timer_(ioc_), active_(false), stop_thread_(false), + operation_started_(false) {} -void WebSocketClient::start() { - try { +void WebSocketClient::start() +{ + try + { // Resolve host and port auto results = resolver_.resolve(host_, port_); // Connect to the first resolved endpoint @@ -26,62 +28,59 @@ void WebSocketClient::start() { auto start_time = std::chrono::steady_clock::now(); const std::chrono::seconds timeout(10); // Set a timeout duration - while (!acknowledged) { + while (!acknowledged) + { // Check for timeout auto elapsed = std::chrono::steady_clock::now() - start_time; - if (elapsed > timeout) { + if (elapsed > timeout) + { spdlog::error("Timeout waiting for server acknowledgment."); throw std::runtime_error("Server acknowledgment timeout"); } // Attempt to read the acknowledgment - try { + try + { ws_.read(buffer); std::string ack_response = beast::buffers_to_string(buffer.data()); spdlog::info("Received acknowledgment: {}", ack_response); // Parse and check acknowledgment json ack_json = json::parse(ack_response); - if (ack_json["operation"] == "acknowledged") { + if (ack_json["operation"] == "acknowledged") + { acknowledged = true; operation_started_ = true; // Set flag indicating acknowledgment received spdlog::info("Server acknowledged the start request."); - } else { + } + else + { spdlog::warn("Server response does not acknowledge start: {}", ack_json.dump()); // throw std::runtime_error("Server rejected start request"); } } - catch (const std::exception &e) { + catch (const std::exception &e) + { spdlog::error("Error while waiting for acknowledgment: {}", e.what()); // Optional: Retry after a short delay std::this_thread::sleep_for(std::chrono::milliseconds(500)); } } } - catch (const std::exception &e) { + catch (const std::exception &e) + { spdlog::error("WebSocket Client Error during start: {}", e.what()); throw; // Rethrow the exception to notify the caller } } -void WebSocketClient::periodic_communication() { - try { - while (!stop_thread_) { - // Convert metrics to JSON and send it - json metrics_json = metrics_convert_(); - json addtnl_info = additional_msg_(); - json combined_payload = metrics_json; - for (auto &[key, value]: addtnl_info.items()) { - combined_payload[key] = value; - } - // json metrics_json = {{"waiting_time", "100ms"}}; - json request_json = { - {"operation", "get_environment"}, - {"payload", combined_payload} // Include metrics in the payload - }; - ws_.write(asio::buffer(request_json.dump())); - - // Read response from server +void WebSocketClient::periodic_communication() +{ + try + { + while (!stop_thread_) + { + // Read command from server beast::flat_buffer buffer; ws_.read(buffer); std::string env_response = beast::buffers_to_string(buffer.data()); @@ -89,28 +88,50 @@ void WebSocketClient::periodic_communication() { // Parse and process the response json env_json = json::parse(env_response); - if (env_json["operation"] == "set_environment") { + if (env_json["operation"] == "set_environment") + { json payload = env_json["payload"]; env_convert_(payload); // Process environment data from payload - } else { + } + else + { spdlog::warn("Unexpected operation received: {}", env_json["operation"]); } - // Wait for 1 second before next communication - std::this_thread::sleep_for(std::chrono::seconds(2)); + // Convert metrics to JSON and send it + json metrics_json = metrics_convert_(); + json addtnl_info = additional_msg_(); + json combined_payload = metrics_json; + for (auto &[key, value] : addtnl_info.items()) + { + combined_payload[key] = value; + } + // json metrics_json = {{"waiting_time", "100ms"}}; + json request_json = { + {"operation", "get_environment"}, + {"payload", combined_payload} // Include metrics in the payload + }; + ws_.write(asio::buffer(request_json.dump())); + + // // Wait for 1 second before next communication + // std::this_thread::sleep_for(std::chrono::seconds(2)); active_ = true; } } - catch (const std::exception &e) { + catch (const std::exception &e) + { std::cerr << "Error in periodic communication: " << e.what() << std::endl; } } void WebSocketClient::run(std::function metrics_convert, std::function additional_msg, - std::function env_convert) { - try { + std::function env_convert) +{ + try + { // Wait until the operation has started and acknowledgment is received - while (!operation_started_) { + while (!operation_started_) + { std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Wait briefly before checking again } @@ -124,21 +145,25 @@ void WebSocketClient::run(std::function metrics_convert, std::function Date: Wed, 19 Mar 2025 19:51:25 +0100 Subject: [PATCH 12/16] Fix bug in percentage --- Sinks/CSVSink/CSVSink.cpp | 2 +- xdbc/xclient.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Sinks/CSVSink/CSVSink.cpp b/Sinks/CSVSink/CSVSink.cpp index 9488094..5b4b198 100644 --- a/Sinks/CSVSink/CSVSink.cpp +++ b/Sinks/CSVSink/CSVSink.cpp @@ -387,7 +387,7 @@ void CsvSink::write(int thr) runtimeEnv->freeBufferIds->push(bufferId); buffersWritten++; - runtimeEnv->tf_paras.bufProcessed.at(thr) = buffersWritten; + runtimeEnv->tf_paras.bufProcessed.at(thr)++; } outputFile.close(); diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 8be3ae3..a044ab3 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -94,7 +94,7 @@ namespace xdbc _xdbcenv->freeBufferIds->push(i); } - _xdbcenv->tf_paras.bufProcessed.resize(_xdbcenv->max_threads); + _xdbcenv->tf_paras.bufProcessed.resize(_xdbcenv->max_threads, 0); spdlog::get("XDBC.CLIENT")->info("Initialized queues, " "freeBuffersQ: {0}, " From a857ec135b9f9fecbc06c60c27527c4f6c808036 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Mon, 24 Mar 2025 07:29:51 +0100 Subject: [PATCH 13/16] Calculate latest metrics without copying --- Sinks/main.cpp | 5 +- xdbc/customQueue.h | 53 ++++++++------- xdbc/metrics_calculator.h | 134 ++++++++++++++++++++++++++------------ xdbc/xclient.cpp | 7 +- 4 files changed, 128 insertions(+), 71 deletions(-) diff --git a/Sinks/main.cpp b/Sinks/main.cpp index 9cdc2c2..b4501ad 100644 --- a/Sinks/main.cpp +++ b/Sinks/main.cpp @@ -146,9 +146,8 @@ nlohmann::json metrics_convert(xdbc::RuntimeEnv &env) nlohmann::json metrics_json = nlohmann::json::object(); // Use a JSON object if ((env.pts) && (env.enable_updation == 1)) { - std::vector env_pts; - env_pts = env.pts->copy_newElements(); - auto component_metrics_ = calculate_metrics(env_pts, env.buffer_size); + auto &env_pts = *(env.pts); + auto component_metrics_ = calculate_metrics(env_pts, env.buffer_size, true); for (const auto &pair : component_metrics_) { nlohmann::json metric_object = nlohmann::json::object(); diff --git a/xdbc/customQueue.h b/xdbc/customQueue.h index 462058d..15d04a4 100644 --- a/xdbc/customQueue.h +++ b/xdbc/customQueue.h @@ -2,8 +2,9 @@ #include #include -template -class customQueue { +template +class customQueue +{ private: std::mutex d_mutex; std::condition_variable d_condition; @@ -15,30 +16,36 @@ class customQueue { public: explicit customQueue(size_t max_capacity = 0) : capacity(max_capacity) {} - void push(T const &value) { + void push(T const &value) + { { std::unique_lock lock(this->d_mutex); - this->d_space_available.wait(lock, [=] { return capacity == 0 || d_queue.size() < capacity; }); + this->d_space_available.wait(lock, [=] + { return capacity == 0 || d_queue.size() < capacity; }); d_queue.push_front(value); } this->d_condition.notify_all(); } - T pop() { + T pop() + { std::unique_lock lock(this->d_mutex); - this->d_condition.wait(lock, [=] { return !this->d_queue.empty(); }); + this->d_condition.wait(lock, [=] + { return !this->d_queue.empty(); }); T rc(std::move(this->d_queue.back())); this->d_queue.pop_back(); this->d_space_available.notify_all(); // Notify threads waiting for space return rc; } - [[nodiscard]] size_t size() { + [[nodiscard]] size_t size() + { std::unique_lock lock(this->d_mutex); return d_queue.size(); } - void setCapacity(size_t new_capacity) { + void setCapacity(size_t new_capacity) + { { std::unique_lock lock(this->d_mutex); capacity = new_capacity; @@ -47,23 +54,23 @@ class customQueue { } // Get the current capacity - [[nodiscard]] size_t getCapacity() const { + [[nodiscard]] size_t getCapacity() const + { return capacity; } - std::vector copy_newElements() { - static size_t lastCopiedIndex = 0; // Tracks the last copied position - std::vector new_elements; // To store new elements - auto current_index = d_queue.size(); - { - // std::unique_lock lock(this->d_mutex); // Lock for thread safety - if (lastCopiedIndex < - current_index) { // Check if there are new elements - new_elements.assign(d_queue.rbegin(), d_queue.rbegin() + (d_queue.size() - - lastCopiedIndex)); // Reverse copy the new elements - lastCopiedIndex = current_index; // Update the index for the next call - } - } - return new_elements; // Return new elements in reverse order + auto begin() + { + return d_queue.rbegin(); + } + + auto beginFrom(size_t offset) + { + return d_queue.rbegin() + offset; + } + + auto end() + { + return d_queue.rend(); } }; diff --git a/xdbc/metrics_calculator.h b/xdbc/metrics_calculator.h index ee31f97..e1880ec 100644 --- a/xdbc/metrics_calculator.h +++ b/xdbc/metrics_calculator.h @@ -10,7 +10,8 @@ #include // Define the Metrics struct -struct Metrics { +struct Metrics +{ double waiting_time_ms; double processing_time_ms; double overall_time_ms; @@ -24,9 +25,11 @@ struct Metrics { }; // Helper function to calculate standard deviation -double calculate_stddev(const std::vector &values, double mean) { +double calculate_stddev(const std::vector &values, double mean) +{ double sum = 0.0; - for (const auto &value: values) { + for (const auto &value : values) + { sum += (value - mean) * (value - mean); } return std::sqrt(sum / values.size()); @@ -34,23 +37,42 @@ double calculate_stddev(const std::vector &values, double mean) { // Function to calculate metrics per component and then aggregate them std::unordered_map -calculate_metrics(const std::vector ×tamps, size_t buffer_size_kb) { +calculate_metrics(customQueue ×tamps, size_t buffer_size_kb, bool is_latest = false) +{ size_t buffer_size_bytes = buffer_size_kb * 1024; // Convert buffer size to bytes std::unordered_map>> events_per_component_thread; + static size_t lastCopiedIndex = 0; // Tracks the last copied position + if (is_latest == true) + { + // Create a map to keep track of the count of timestamps per component and thread + std::map, int> timestamp_counts; - // Group timestamps by component and thread - for (const auto &ts: timestamps) { - events_per_component_thread[ts.component][ts.thread].push_back(ts); + for (auto it = timestamps.beginFrom(lastCopiedIndex); it != timestamps.end(); ++it) + { + const auto &ts = *it; + events_per_component_thread[ts.component][ts.thread].push_back(ts); + } + lastCopiedIndex = timestamps.size(); + } + else + { + for (auto it = timestamps.begin(); it != timestamps.end(); ++it) + { + const auto &ts = *it; + events_per_component_thread[ts.component][ts.thread].push_back(ts); + } } std::unordered_map component_metrics; // Calculate metrics per component - for (const auto &[component, events_per_thread]: events_per_component_thread) { + for (const auto &[component, events_per_thread] : events_per_component_thread) + { std::vector thread_metrics; size_t total_buffers_processed = 0; - for (const auto &[thread_id, events]: events_per_thread) { + for (const auto &[thread_id, events] : events_per_thread) + { Metrics metrics = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; std::chrono::high_resolution_clock::time_point start_time, push_time, pop_time, end_time; @@ -65,29 +87,42 @@ calculate_metrics(const std::vector ×tamps, size const auto &first_element = events.front(); // Store the first event in the loop const auto &last_element = events.back(); // Store the last event in the loop - for (const auto &event: events) { - if (event.event == "start") { + for (const auto &event : events) + { + if (event.event == "start") + { start_time = event.timestamp; has_start_time = true; - } else if (event.event == "pop") { + } + else if (event.event == "pop") + { pop_time = event.timestamp; - if (has_push_time) { + if (has_push_time) + { waiting_time += pop_time - push_time; // Waiting time is pop_time - previous push_time - } else if (has_start_time) { + } + else if (has_start_time) + { waiting_time += pop_time - start_time; // Initial waiting time is pop_time - start_time } has_pop_time = true; - } else if (event.event == "push") { + } + else if (event.event == "push") + { push_time = event.timestamp; has_push_time = true; - if (has_pop_time) { + if (has_pop_time) + { processing_time += push_time - pop_time; // Processing time is push_time - pop_time thread_buffers_processed++; } - } else if (event.event == "end") { + } + else if (event.event == "end") + { end_time = event.timestamp; has_end_time = true; - if (has_pop_time) { + if (has_push_time) + { processing_time += end_time - push_time; // Finalize the processing time } } @@ -95,28 +130,37 @@ calculate_metrics(const std::vector ×tamps, size metrics.waiting_time_ms = std::chrono::duration_cast(waiting_time).count(); metrics.processing_time_ms = std::chrono::duration_cast(processing_time).count(); - if (has_end_time && has_start_time) { + if (has_end_time && has_start_time) + { metrics.overall_time_ms = std::chrono::duration_cast( - std::chrono::duration(end_time - start_time)) - .count(); - } else { + std::chrono::duration(end_time - start_time)) + .count(); + } + else + { metrics.overall_time_ms = std::chrono::duration_cast( - std::chrono::duration(last_element.timestamp - first_element.timestamp)) - .count(); + std::chrono::duration(last_element.timestamp - first_element.timestamp)) + .count(); + } + if (metrics.processing_time_ms == 0) + { + metrics.processing_time_ms = 1; } total_buffers_processed += thread_buffers_processed; // Calculate the total throughput in bytes per second for this thread - if (metrics.overall_time_ms > 0) { + if (metrics.overall_time_ms > 0) + { metrics.total_throughput = - (thread_buffers_processed * buffer_size_bytes) / (metrics.overall_time_ms / 1000.0); + (thread_buffers_processed * buffer_size_bytes) / (metrics.overall_time_ms / 1000.0); } // Calculate the per buffer throughput in bytes per second for this thread - if (metrics.processing_time_ms > 0) { + if (metrics.processing_time_ms > 0) + { metrics.per_buffer_throughput = - (thread_buffers_processed * buffer_size_bytes) / (metrics.processing_time_ms / 1000.0); + (thread_buffers_processed * buffer_size_bytes) / (metrics.processing_time_ms / 1000.0); } // Convert throughput to MB/s @@ -130,33 +174,39 @@ calculate_metrics(const std::vector ×tamps, size Metrics aggregated_metrics; size_t num_threads = thread_metrics.size(); aggregated_metrics.waiting_time_ms = std::accumulate(thread_metrics.begin(), thread_metrics.end(), 0.0, - [](const auto &sum, const auto &m) { + [](const auto &sum, const auto &m) + { return sum + m.waiting_time_ms; }) / num_threads; aggregated_metrics.processing_time_ms = std::accumulate(thread_metrics.begin(), thread_metrics.end(), 0.0, - [](const auto &sum, const auto &m) { + [](const auto &sum, const auto &m) + { return sum + m.processing_time_ms; }) / num_threads; aggregated_metrics.overall_time_ms = std::accumulate(thread_metrics.begin(), thread_metrics.end(), 0.0, - [](const auto &sum, const auto &m) { + [](const auto &sum, const auto &m) + { return sum + m.overall_time_ms; }) / num_threads; aggregated_metrics.total_throughput = std::accumulate(thread_metrics.begin(), thread_metrics.end(), 0.0, - [](const auto &sum, const auto &m) { + [](const auto &sum, const auto &m) + { return sum + m.total_throughput; }); aggregated_metrics.per_buffer_throughput = std::accumulate(thread_metrics.begin(), thread_metrics.end(), 0.0, - [](const auto &sum, const auto &m) { + [](const auto &sum, const auto &m) + { return sum + m.per_buffer_throughput; }) / num_threads; // Calculate standard deviations std::vector waiting_times, processing_times, overall_times, total_throughputs, per_buffer_throughputs; - for (const auto &m: thread_metrics) { + for (const auto &m : thread_metrics) + { waiting_times.push_back(m.waiting_time_ms); processing_times.push_back(m.processing_time_ms); overall_times.push_back(m.overall_time_ms); @@ -179,7 +229,8 @@ calculate_metrics(const std::vector ×tamps, size return component_metrics; } -std::tuple printAndReturnAverageLoad(xdbc::RuntimeEnv &_xdbcenv) { +std::tuple printAndReturnAverageLoad(xdbc::RuntimeEnv &_xdbcenv) +{ long long totalTimestamps = 0; size_t totalFreeBufferIdsSize = 0; size_t totalCompressedBufferIdsSize = 0; @@ -189,7 +240,8 @@ std::tuple printAndReturnAverageLoad(xdbc::Runti auto ret = std::tuple(0, 0, 0, 0); - for (const auto &record: _xdbcenv.queueSizes) { + for (const auto &record : _xdbcenv.queueSizes) + { totalTimestamps += std::get<0>(record); totalFreeBufferIdsSize += std::get<1>(record); totalCompressedBufferIdsSize += std::get<2>(record); @@ -197,7 +249,8 @@ std::tuple printAndReturnAverageLoad(xdbc::Runti totalSerializedBufferIdsSize += std::get<4>(record); } - if (recordCount > 0) { + if (recordCount > 0) + { double avgFreeBufferIdsSize = static_cast(totalFreeBufferIdsSize) / recordCount; double avgCompressedBufferIdsSize = static_cast(totalCompressedBufferIdsSize) / recordCount; double avgDecompressedBufferIdsSize = static_cast(totalDecompressedBufferIdsSize) / recordCount; @@ -206,9 +259,10 @@ std::tuple printAndReturnAverageLoad(xdbc::Runti ret = std::tuple(avgFreeBufferIdsSize, avgCompressedBufferIdsSize, avgDecompressedBufferIdsSize, avgSerializedBufferIdsSize); spdlog::get("XDBC.SINK")->info("Average Load of Queues: Free, Decompressor, Serializer, Writer"); - spdlog::get("XDBC.SINK")->info("{0}\t{1}\t{2}\t{3}", avgFreeBufferIdsSize, avgCompressedBufferIdsSize, - avgDecompressedBufferIdsSize, avgSerializedBufferIdsSize); - } else { + spdlog::get("XDBC.SINK")->info("{0}\t{1}\t{2}\t{3}", avgFreeBufferIdsSize, avgCompressedBufferIdsSize, avgDecompressedBufferIdsSize, avgSerializedBufferIdsSize); + } + else + { spdlog::get("XDBC.SINK")->info("No records available to calculate averages."); } diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index a044ab3..0dc77e6 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -126,11 +126,8 @@ namespace xdbc _xdbcenv->tf_paras.elapsed_time = static_cast(total_time); spdlog::get("XDBC.CLIENT")->info("Total elapsed time: {0} ms", total_time); - auto pts = std::vector(_xdbcenv->pts->size()); - while (_xdbcenv->pts->size() != 0) - pts.push_back(_xdbcenv->pts->pop()); - - auto component_metrics = calculate_metrics(pts, _xdbcenv->buffer_size); + auto &env_pts = *(_xdbcenv->pts); + auto component_metrics = calculate_metrics(env_pts, _xdbcenv->buffer_size); std::ostringstream totalTimes; std::ostringstream procTimes; std::ostringstream waitingTimes; From d9ea2cdea929f26c95f2e13cda07a8a2027007a7 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Mon, 31 Mar 2025 14:50:34 +0200 Subject: [PATCH 14/16] Convert load to percent --- xdbc/xclient.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index 0dc77e6..d5b400e 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -74,6 +74,7 @@ namespace xdbc // Unified receive queue _xdbcenv->freeBufferIds = std::make_shared>(); + _xdbcenv->freeBufferIds->setCapacity(_xdbcenv->buffers_in_bufferpool); // Unified decompression queue _xdbcenv->compressedBufferIds = std::make_shared>(); _xdbcenv->compressedBufferIds->setCapacity(queueCapacityPerComp); @@ -278,26 +279,22 @@ namespace xdbc size_t decompressedBufferTotalSize = _xdbcenv->decompressedBufferIds->size(); size_t serializedBufferTotalSize = _xdbcenv->serializedBufferIds->size(); - // size_t freeBufferTotalSize = 0; - // for (auto &queue_ptr: _xdbcenv->freeBufferIds) { - // freeBufferTotalSize += queue_ptr->size(); - // } + float freeBufferLoadFloat = (freeBufferTotalSize * 100.0f) / _xdbcenv->freeBufferIds->getCapacity(); + float compressedBufferLoadFloat = (compressedBufferTotalSize * 100.0f) / _xdbcenv->compressedBufferIds->getCapacity(); + float decompressedBufferLoadFloat = (decompressedBufferTotalSize * 100.0f) / _xdbcenv->decompressedBufferIds->getCapacity(); + float serializedBufferLoadFloat = (serializedBufferTotalSize * 100.0f) / _xdbcenv->serializedBufferIds->getCapacity(); - // size_t compressedBufferTotalSize = 0; - // for (auto &queue_ptr: _xdbcenv->compressedBufferIds) { - // compressedBufferTotalSize += queue_ptr->size(); - // } - - // size_t decompressedBufferTotalSize = 0; - // for (auto &queue_ptr: _xdbcenv->decompressedBufferIds) { - // decompressedBufferTotalSize += queue_ptr->size(); - // } + size_t freeBufferLoad = static_cast(freeBufferLoadFloat); + size_t compressedBufferLoad = static_cast(compressedBufferLoadFloat); + size_t decompressedBufferLoad = static_cast(decompressedBufferLoadFloat); + size_t serializedBufferLoad = static_cast(serializedBufferLoadFloat); // Store the measurement as a tuple _xdbcenv->queueSizes.emplace_back(curTimeInterval, freeBufferTotalSize, compressedBufferTotalSize, decompressedBufferTotalSize, serializedBufferTotalSize); - _xdbcenv->tf_paras.latest_queueSizes = std::make_tuple(freeBufferTotalSize, compressedBufferTotalSize, decompressedBufferTotalSize, serializedBufferTotalSize); + _xdbcenv->tf_paras.latest_queueSizes = std::make_tuple(freeBufferLoad, compressedBufferLoad, + decompressedBufferLoad, serializedBufferLoad); std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); curTimeInterval += interval_ms / 1000; From 26df4b814eecd51189661d9b25b32d4f3c864fb0 Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Wed, 2 Apr 2025 10:47:21 +0200 Subject: [PATCH 15/16] Disable docker-xdbc.yml --- docker-xdbc.yml | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docker-xdbc.yml b/docker-xdbc.yml index 7e0e4f3..43bde76 100644 --- a/docker-xdbc.yml +++ b/docker-xdbc.yml @@ -1,23 +1,23 @@ services: - # xdbc-server: - # image: xdbc-server:latest - # container_name: xdbcserver - # restart: always - # volumes: - # - /dev/shm:/dev/shm - # ports: - # - 1234:1234 - # - 1235:1235 - # - 1236:1236 - # - 1237:1237 - # - 1238:1238 - # shm_size: '16gb' - # cap_add: - # - NET_ADMIN - # networks: - # - xdbc-net - # labels: - # com.docker-tc.enabled: 1 + xdbc-server: + image: xdbc-server:latest + container_name: xdbcserver + restart: always + volumes: + - /dev/shm:/dev/shm + ports: + - 1234:1234 + - 1235:1235 + - 1236:1236 + - 1237:1237 + - 1238:1238 + shm_size: '16gb' + cap_add: + - NET_ADMIN + networks: + - xdbc-net + labels: + com.docker-tc.enabled: 1 xdbc-client: image: xdbc-client:latest From 2329e7bf8f0116d38d0c9f00e830ca643b68a72c Mon Sep 17 00:00:00 2001 From: midhun_kv Date: Wed, 2 Apr 2025 20:55:11 +0200 Subject: [PATCH 16/16] Resolve conflict in xclient --- xdbc/xclient.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/xdbc/xclient.cpp b/xdbc/xclient.cpp index abd02d7..d5b400e 100644 --- a/xdbc/xclient.cpp +++ b/xdbc/xclient.cpp @@ -95,11 +95,7 @@ namespace xdbc _xdbcenv->freeBufferIds->push(i); } -<<<<<<< HEAD _xdbcenv->tf_paras.bufProcessed.resize(_xdbcenv->max_threads, 0); -======= - _xdbcenv->tf_paras.bufProcessed.resize(_xdbcenv->max_threads); ->>>>>>> main spdlog::get("XDBC.CLIENT")->info("Initialized queues, " "freeBuffersQ: {0}, "