From 1d383ef4a78ceba6d87751f25f6ab3ccdb6ecf67 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Sat, 9 Aug 2025 17:07:44 +0530 Subject: [PATCH 01/12] IR2Vec.cpp refactor --- src/IR2Vec.cpp | 207 ++++++++++++++++++++++++++++--------------------- 1 file changed, 120 insertions(+), 87 deletions(-) diff --git a/src/IR2Vec.cpp b/src/IR2Vec.cpp index 6fe0b3a98..f48f150d6 100644 --- a/src/IR2Vec.cpp +++ b/src/IR2Vec.cpp @@ -73,9 +73,102 @@ void printVersion(raw_ostream &ostream) { cl::PrintVersionMessage(); } -int main(int argc, char **argv) { - cl::SetVersionPrinter(printVersion); - cl::HideUnrelatedOptions(category); +struct SymOutputs { + std::ofstream out; +}; + +struct FAOutputs : SymOutputs { + std::ofstream miss; + std::ofstream cyclic; +}; + +inline SymOutputs openSymOutputs(const std::string &baseName) { + SymOutputs f; + f.out.open(baseName, std::ios_base::app); + return f; +} + +inline FAOutputs openFAOutputs(const std::string &baseName) { + FAOutputs f; + f.out.open(baseName, std::ios_base::app); + f.miss.open("missCount_" + baseName, std::ios_base::app); + f.cyclic.open("cyclicCount_" + baseName, std::ios_base::app); + return f; +} + +template +inline void runMaybeTimed(bool shouldTime, const char *timingMsgFmt, F &&job) { + if (shouldTime) { + const clock_t start = clock(); + std::forward(job)(); + const clock_t end = clock(); + const double elapsed = static_cast(end - start) / CLOCKS_PER_SEC; + std::printf(timingMsgFmt, elapsed); + } else { + std::forward(job)(); + } +} + +template +inline void executeEncoder(const char *timingMsgFmt, bool shouldTime, + OutputsFactory &&makeOutputs, Body &&body) { + auto M = getLLVMIR(); + auto vocabulary = VocabularyFactory::createVocabulary(DIM)->getVocabulary(); + Encoder encoder(*M, vocabulary); + auto files = std::forward(makeOutputs)(oname); + + auto job = [&] { std::forward(body)(encoder, files); }; + runMaybeTimed(shouldTime, timingMsgFmt, job); +} + +void generateFAEncodingsFunction(std::string funcName) { + executeEncoder( + "Time taken by on-demand generation of flow-aware encodings is: %.6f " + "seconds.\n", + printTime, openFAOutputs, [&, funcName](IR2Vec_FA &FA, FAOutputs &files) { + FA.generateFlowAwareEncodingsForFunction(&files.out, funcName, + &files.miss, &files.cyclic); + }); +} + +void generateFAEncodings() { + executeEncoder( + "Time taken by normal generation of flow-aware encodings is: %.6f " + "seconds.\n", + printTime, openFAOutputs, [&](IR2Vec_FA &FA, FAOutputs &files) { + FA.generateFlowAwareEncodings(&files.out, &files.miss, &files.cyclic); + }); +} + +void generateSymEncodingsFunction(std::string funcName) { + executeEncoder( + "Time taken by on-demand generation of symbolic encodings is: %.6f " + "seconds.\n", + printTime, openSymOutputs, + [&, funcName](IR2Vec_Symbolic &SYM, SymOutputs &files) { + SYM.generateSymbolicEncodingsForFunction(&files.out, funcName); + }); +} + +void generateSYMEncodings() { + executeEncoder( + "Time taken by normal generation of symbolic encodings is: %.6f " + "seconds.\n", + printTime, openSymOutputs, [&](IR2Vec_Symbolic &SYM, SymOutputs &files) { + SYM.generateSymbolicEncodings(&files.out); + }); +} + +void collectIRfunc() { + auto M = getLLVMIR(); + CollectIR cir(M); + std::ofstream o; + o.open(oname, std::ios_base::app); + cir.generateTriplets(o); + o.close(); +} + +void setGlobalVars(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv); fa = cl_fa; @@ -92,113 +185,53 @@ int main(int argc, char **argv) { WT = cl_WT; debug = cl_debug; printTime = cl_printTime; +} +void checkFailureConditions() { bool failed = false; - if (!((sym ^ fa) ^ collectIR)) { - errs() << "Either of sym, fa or collectIR should be specified\n"; + + if (!(sym || fa || collectIR)) { + errs() << "Either of sym, fa, or collectIR should be specified\n"; failed = true; } + if (failed) + exit(1); + if (sym || fa) { if (level != 'p' && level != 'f') { errs() << "Invalid level specified: Use either p or f\n"; failed = true; } } else { - if (!collectIR) { - errs() << "Either of sym, fa or collectIR should be specified\n"; - failed = true; - } else if (level) + assert(collectIR == true); + + if (collectIR && level) { errs() << "[WARNING] level would not be used in collectIR mode\n"; + } } if (failed) exit(1); +} - auto M = getLLVMIR(); - auto vocabulary = VocabularyFactory::createVocabulary(DIM)->getVocabulary(); +int main(int argc, char **argv) { + cl::SetVersionPrinter(printVersion); + cl::HideUnrelatedOptions(category); + setGlobalVars(argc, argv); + checkFailureConditions(); - // newly added if (sym && !(funcName.empty())) { - IR2Vec_Symbolic SYM(*M, vocabulary); - std::ofstream o; - o.open(oname, std::ios_base::app); - if (printTime) { - clock_t start = clock(); - SYM.generateSymbolicEncodingsForFunction(&o, funcName); - clock_t end = clock(); - double elapsed = double(end - start) / CLOCKS_PER_SEC; - printf("Time taken by on-demand generation of symbolic encodings " - "is: %.6f " - "seconds.\n", - elapsed); - } else { - SYM.generateSymbolicEncodingsForFunction(&o, funcName); - } - o.close(); + generateSymEncodingsFunction(funcName); } else if (fa && !(funcName.empty())) { - IR2Vec_FA FA(*M, vocabulary); - std::ofstream o, missCount, cyclicCount; - o.open(oname, std::ios_base::app); - missCount.open("missCount_" + oname, std::ios_base::app); - cyclicCount.open("cyclicCount_" + oname, std::ios_base::app); - if (printTime) { - clock_t start = clock(); - FA.generateFlowAwareEncodingsForFunction(&o, funcName, &missCount, - &cyclicCount); - clock_t end = clock(); - double elapsed = double(end - start) / CLOCKS_PER_SEC; - printf("Time taken by on-demand generation of flow-aware encodings " - "is: %.6f " - "seconds.\n", - elapsed); - } else { - FA.generateFlowAwareEncodingsForFunction(&o, funcName, &missCount, - &cyclicCount); - } - o.close(); + generateFAEncodingsFunction(funcName); } else if (fa) { - IR2Vec_FA FA(*M, vocabulary); - std::ofstream o, missCount, cyclicCount; - o.open(oname, std::ios_base::app); - missCount.open("missCount_" + oname, std::ios_base::app); - cyclicCount.open("cyclicCount_" + oname, std::ios_base::app); - if (printTime) { - clock_t start = clock(); - FA.generateFlowAwareEncodings(&o, &missCount, &cyclicCount); - clock_t end = clock(); - double elapsed = double(end - start) / CLOCKS_PER_SEC; - printf("Time taken by normal generation of flow-aware encodings " - "is: %.6f " - "seconds.\n", - elapsed); - } else { - FA.generateFlowAwareEncodings(&o, &missCount, &cyclicCount); - } - o.close(); + generateFAEncodings(); } else if (sym) { - IR2Vec_Symbolic SYM(*M, vocabulary); - std::ofstream o; - o.open(oname, std::ios_base::app); - if (printTime) { - clock_t start = clock(); - SYM.generateSymbolicEncodings(&o); - clock_t end = clock(); - double elapsed = double(end - start) / CLOCKS_PER_SEC; - printf("Time taken by normal generation of symbolic encodings is: " - "%.6f " - "seconds.\n", - elapsed); - } else { - SYM.generateSymbolicEncodings(&o); - } - o.close(); + generateSYMEncodings(); } else if (collectIR) { - CollectIR cir(M); - std::ofstream o; - o.open(oname, std::ios_base::app); - cir.generateTriplets(o); - o.close(); + collectIRfunc(); } + return 0; } From 27d0c01da78229dcacacd2aaef3347d73dca6552 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Sat, 9 Aug 2025 17:20:24 +0530 Subject: [PATCH 02/12] upload-artifact and download-artifact version upgrade --- .github/workflows/publish.yml | 2 +- .github/workflows/upload-pypi.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3962e63ee..bd1e2967f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -41,7 +41,7 @@ jobs: run: cd build && cmake -DEigen3_DIR=./eigen-build .. - name: make run: cd build && make -j8 - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: name: ir2vec path: | diff --git a/.github/workflows/upload-pypi.yml b/.github/workflows/upload-pypi.yml index 1448d00cd..5bd0bf1ab 100644 --- a/.github/workflows/upload-pypi.yml +++ b/.github/workflows/upload-pypi.yml @@ -33,7 +33,7 @@ jobs: - name: Build sdist run: cd Manylinux2014_Compliant_Source/pkg && pipx run build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: path: Manylinux2014_Compliant_Source/pkg/dist/*.tar.gz @@ -44,7 +44,7 @@ jobs: needs: [build_wheels, build_sdist] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: artifact path: dist From 4e208b5dc7c5d6bd0279144ade92d3b8bcc28e40 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Mon, 18 Aug 2025 18:59:02 +0530 Subject: [PATCH 03/12] using vector reference instead of passing vector object --- src/Symbolic.cpp | 135 ++++++++++++----------------------------- src/include/Symbolic.h | 2 +- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/src/Symbolic.cpp b/src/Symbolic.cpp index d383513a3..a9ebe3f61 100644 --- a/src/Symbolic.cpp +++ b/src/Symbolic.cpp @@ -27,13 +27,16 @@ using namespace llvm; using namespace IR2Vec; using abi::__cxa_demangle; -Vector IR2Vec_Symbolic::getValue(std::string key) { +bool IR2Vec_Symbolic::getValue(std::string key, IR2Vec::Vector &out) { Vector vec(DIM, 0); - if (vocabulary.find(key) == vocabulary.end()) - IR2VEC_DEBUG(errs() << "cannot find key in map : " << key << "\n"); - else - vec = vocabulary[key]; - return vec; + if (auto it = vocabulary.find(std::string(key)); it != vocabulary.end()) { + out = it->second; + return true; + } + + out.assign(DIM, 0); + IR2VEC_DEBUG(errs() << "cannot find key in map : " << key << "\n"); + return false; } void IR2Vec_Symbolic::generateSymbolicEncodings(std::ostream *o) { @@ -49,11 +52,10 @@ void IR2Vec_Symbolic::generateSymbolicEncodings(std::ostream *o) { noOfFunc++; } - // else if (level == 'p') { + // assert(level == 'p' && "This block should only be executed when level + // == 'p'"); std::transform(pgmVector.begin(), pgmVector.end(), tmp.begin(), pgmVector.begin(), std::plus()); - - // } } } @@ -138,118 +140,59 @@ Vector IR2Vec_Symbolic::bb2Vec(BasicBlock &B, Vector bbVector(DIM, 0); for (auto &I : B) { - Vector instVector(DIM, 0); - auto vec = getValue(I.getOpcodeName()); - // if (isa(I)) { - // auto ci = dyn_cast(&I); - // // ci->dump(); - // Function *func = ci->getCalledFunction(); - // if (func) { - // // if(!func->isDeclaration()) - // // if(func != I.getParent()->getParent()) - // // errs() << func->getName() << "\t" << - // // I.getParent()->getParent()->getName() << "\n"; - // if (!func->isDeclaration() && - // std::find(funcStack.begin(), funcStack.end(), func) == - // funcStack.end()) { - // auto funcVec = func2Vec(*func, funcStack); - - // std::transform(vec.begin(), vec.end(), funcVec.begin(), - // vec.begin(), - // std::plus()); - // } - // } else { - // IR2VEC_DEBUG(I.dump()); - // IR2VEC_DEBUG(errs() << "==========================Function - // definition - // " - // "not found==================\n"); - // } - // } - scaleVector(vec, WO); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + Vector instVector(DIM, 0), opcode_vec; + getValue(I.getOpcodeName(), opcode_vec); + scaleVector(opcode_vec, WO); + std::transform(instVector.begin(), instVector.end(), opcode_vec.begin(), instVector.begin(), std::plus()); - auto type = I.getType(); + Vector type_vec; + auto type = I.getType(); if (type->isVoidTy()) { - vec = getValue("voidTy"); + getValue("voidTy", type_vec); } else if (type->isFloatingPointTy()) { - vec = getValue("floatTy"); + getValue("floatTy", type_vec); } else if (type->isIntegerTy()) { - vec = getValue("integerTy"); + getValue("integerTy", type_vec); } else if (type->isFunctionTy()) { - vec = getValue("functionTy"); + getValue("functionTy", type_vec); } else if (type->isStructTy()) { - vec = getValue("structTy"); + getValue("structTy", type_vec); } else if (type->isArrayTy()) { - vec = getValue("arrayTy"); + getValue("arrayTy", type_vec); } else if (type->isPointerTy()) { - vec = getValue("pointerTy"); + getValue("pointerTy", type_vec); } else if (type->isVectorTy()) { - vec = getValue("vectorTy"); + getValue("vectorTy", type_vec); } else if (type->isEmptyTy()) { - vec = getValue("emptyTy"); + getValue("emptyTy", type_vec); } else if (type->isLabelTy()) { - vec = getValue("labelTy"); + getValue("labelTy", type_vec); } else if (type->isTokenTy()) { - vec = getValue("tokenTy"); + getValue("tokenTy", type_vec); } else if (type->isMetadataTy()) { - vec = getValue("metadataTy"); + getValue("metadataTy", type_vec); } else { - vec = getValue("unknownTy"); + getValue("unknownTy", type_vec); } - /*switch (I.getType()->getTypeID()) { - case 0: - vec = getValue("voidTy"); - break; - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - vec = getValue("floatTy"); - break; - case 11: - vec = getValue("integerTy"); - break; - case 12: - vec = getValue("functionTy"); - break; - case 13: - vec = getValue("structTy"); - break; - case 14: - vec = getValue("arrayTy"); - break; - case 15: - vec = getValue("pointerTy"); - break; - case 16: - vec = getValue("vectorTy"); - break; - default: - vec = getValue("unknownTy"); - }*/ - - scaleVector(vec, WT); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + scaleVector(type_vec, WT); + std::transform(instVector.begin(), instVector.end(), type_vec.begin(), instVector.begin(), std::plus()); for (unsigned i = 0; i < I.getNumOperands(); i++) { - Vector vec; + Vector operand_vec; if (isa(I.getOperand(i))) { - vec = getValue("function"); + getValue("function", operand_vec); } else if (isa(I.getOperand(i)->getType())) { - vec = getValue("pointer"); + getValue("pointer", operand_vec); } else if (isa(I.getOperand(i))) { - vec = getValue("constant"); + getValue("constant", operand_vec); } else { - vec = getValue("variable"); + getValue("variable", operand_vec); } - scaleVector(vec, WA); + scaleVector(operand_vec, WA); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + std::transform(instVector.begin(), instVector.end(), operand_vec.begin(), instVector.begin(), std::plus()); instVecMap[&I] = instVector; } diff --git a/src/include/Symbolic.h b/src/include/Symbolic.h index 93b3aa896..d06dcde46 100644 --- a/src/include/Symbolic.h +++ b/src/include/Symbolic.h @@ -27,7 +27,7 @@ class IR2Vec_Symbolic { IR2Vec::VocabTy &vocabulary; IR2Vec::Vector pgmVector; - IR2Vec::Vector getValue(std::string key); + bool getValue(std::string key, IR2Vec::Vector &out); IR2Vec::Vector bb2Vec(llvm::BasicBlock &B, llvm::SmallVector &funcStack); IR2Vec::Vector func2Vec(llvm::Function &F, From 01b3e41c0f18765bf98fbd2a331f2503424a58e6 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Mon, 18 Aug 2025 19:16:01 +0530 Subject: [PATCH 04/12] using unordered map for vocab, pass vector pointer to updatedRes --- src/include/utils.h | 4 ++-- src/utils.cpp | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/include/utils.h b/src/include/utils.h index 1541f1c64..27d4c905a 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -30,7 +30,7 @@ namespace IR2Vec { }) using Vector = std::vector; -using VocabTy = std::map; +using VocabTy = std::unordered_map; using abi::__cxa_demangle; extern bool fa; @@ -52,7 +52,7 @@ void scaleVector(Vector &vec, float factor); // newly added std::string getDemagledName(const llvm::Function *function); char *getActualName(llvm::Function *function); -std::string updatedRes(IR2Vec::Vector tmp, llvm::Function *f, llvm::Module *M); +std::string updatedRes(IR2Vec::Vector &tmp, llvm::Function *f, llvm::Module *M); } // namespace IR2Vec #endif diff --git a/src/utils.cpp b/src/utils.cpp index b9fc0e580..551b5f6c7 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -42,9 +42,7 @@ std::unique_ptr IR2Vec::getLLVMIR() { } void IR2Vec::scaleVector(Vector &vec, float factor) { - for (unsigned i = 0; i < vec.size(); i++) { - vec[i] = vec[i] * factor; - } + std::for_each(vec.begin(), vec.end(), [factor](auto &x) { x *= factor; }); } // Function to get demangled function name @@ -78,7 +76,7 @@ char *IR2Vec::getActualName(llvm::Function *function) { } // Function to return updated res -std::string IR2Vec::updatedRes(IR2Vec::Vector tmp, llvm::Function *f, +std::string IR2Vec::updatedRes(IR2Vec::Vector &tmp, llvm::Function *f, llvm::Module *M) { std::string res = ""; auto demangledName = getDemagledName(f); From af98964f74f73d2943bd883de954dc6e949acfab Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Mon, 18 Aug 2025 19:34:00 +0530 Subject: [PATCH 05/12] replacing float check with std::abs check --- src/Symbolic.cpp | 5 ++--- src/utils.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Symbolic.cpp b/src/Symbolic.cpp index a9ebe3f61..8d8cd9d75 100644 --- a/src/Symbolic.cpp +++ b/src/Symbolic.cpp @@ -66,9 +66,8 @@ void IR2Vec_Symbolic::generateSymbolicEncodings(std::ostream *o) { res += std::to_string(cls) + "\t"; for (auto i : pgmVector) { - if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { - i = 0; - } + if (std::abs(i) <= 1e-4f) + i = 0.0f; res += std::to_string(i) + "\t"; } res += "\n"; diff --git a/src/utils.cpp b/src/utils.cpp index 551b5f6c7..611a013d5 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -85,9 +85,8 @@ std::string IR2Vec::updatedRes(IR2Vec::Vector &tmp, llvm::Function *f, res += "=\t"; for (auto i : tmp) { - if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { - i = 0; - } + if (std::abs(i) <= 1e-4f) + i = 0.0f; res += std::to_string(i) + "\t"; } From 4ae23652cb2ea3ff70bb122ab9b309be36ddf306 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Mon, 18 Aug 2025 19:39:49 +0530 Subject: [PATCH 06/12] codebase should use std::unordered_map instead of std::map for faster lookup --- src/generate_vocabulary.py | 14 ++++++-------- src/include/IR2Vec.h | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/generate_vocabulary.py b/src/generate_vocabulary.py index 80068717e..ad117d600 100644 --- a/src/generate_vocabulary.py +++ b/src/generate_vocabulary.py @@ -41,7 +41,7 @@ def generate_base_vocabulary_header(): f"{HEADER_CLANG_FORMAT_OFF}\n" "#ifndef __VOCABULARY_H__\n" "#define __VOCABULARY_H__\n\n" - "#include \n" + "#include \n" "#include \n" "#include \n" "#include \n" @@ -51,7 +51,7 @@ def generate_base_vocabulary_header(): "class VocabularyBase {\n" "public:\n" " virtual ~VocabularyBase() {}\n" - " virtual const std::map& getVocabulary() const = 0;\n" + " virtual const std::unordered_map& getVocabulary() const = 0;\n" "};\n\n" "class VocabularyFactory {\n" "public:\n" @@ -69,24 +69,22 @@ def generate_vocabulary_class(vocab_file, class_name): f"{HEADER_CLANG_FORMAT_OFF}\n" f"#ifndef __{class_name.upper()}__\n" f"#define __{class_name.upper()}__\n\n" - f"#include \n" + f"#include \n" f"#include \n" f"#include \n" f'#include "Vocabulary.h" // Include the base class\n\n' f"namespace IR2Vec {{\n\n" f"class {class_name} : public VocabularyBase {{\n" f"public:\n" - f" const std::map& getVocabulary() const override{{\n" + f" const std::unordered_map& getVocabulary() const override{{\n" f" return vocabulary;\n" f" }}\n" f"private:\n" - f" static const std::map vocabulary;\n" + f" static const std::unordered_map vocabulary;\n" f"}};\n" ) - opening = ( - f"\nconst std::map {class_name}::vocabulary = {{\n" - ) + opening = f"\nconst std::unordered_map {class_name}::vocabulary = {{\n" closing = """\ }; } // namespace IR2Vec diff --git a/src/include/IR2Vec.h b/src/include/IR2Vec.h index f8d13c0d8..e0b39503e 100644 --- a/src/include/IR2Vec.h +++ b/src/include/IR2Vec.h @@ -28,7 +28,7 @@ class Embeddings { llvm::SmallMapVector bbVecMap; llvm::SmallMapVector funcVecMap; Vector pgmVector; - std::map vocabulary; + std::unordered_map vocabulary; public: Embeddings() = default; From 56d5a1bd1caf0f4664d9871565a95f458de80733 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Mon, 18 Aug 2025 20:06:41 +0530 Subject: [PATCH 07/12] Similar optimizations and refactoring for Flowaware --- src/FlowAware.cpp | 149 +++++++++++++++++++++------------------- src/Symbolic.cpp | 3 +- src/include/FlowAware.h | 2 +- src/utils.cpp | 2 +- 4 files changed, 81 insertions(+), 75 deletions(-) diff --git a/src/FlowAware.cpp b/src/FlowAware.cpp index 4a1ec6b26..43629ec45 100644 --- a/src/FlowAware.cpp +++ b/src/FlowAware.cpp @@ -96,14 +96,16 @@ void IR2Vec_FA::collectWriteDefsMap(Module &M) { } } -Vector IR2Vec_FA::getValue(std::string key) { - Vector vec(DIM, 0); - if (vocabulary.find(key) == vocabulary.end()) { - IR2VEC_DEBUG(errs() << "cannot find key in map : " << key << "\n"); - dataMissCounter++; - } else - vec = vocabulary[key]; - return vec; +bool IR2Vec_FA::getValue(std::string key, IR2Vec::Vector &out) { + if (auto it = vocabulary.find(std::string(key)); it != vocabulary.end()) { + out = it->second; + return true; + } + + out.assign(DIM, 0); + dataMissCounter++; + IR2VEC_DEBUG(errs() << "cannot find key in map : " << key << "\n"); + return false; } // Function to update funcVecMap of function with vectors of it's callee list @@ -169,9 +171,8 @@ void IR2Vec_FA::generateFlowAwareEncodings(std::ostream *o, res += std::to_string(cls) + "\t"; for (auto i : pgmVector) { - if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { - i = 0; - } + if (std::abs(i) <= 1e-4f) + i = 0.0f; res += std::to_string(i) + "\t"; } res += "\n"; @@ -537,11 +538,11 @@ Vector IR2Vec_FA::func2Vec(Function &F, } bbVecMap[b] = bbVector; IR2VEC_DEBUG(outs() << "-------------------------------------------\n"); - for (auto i : bbVector) { - if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { - i = 0; - } - } + + std::for_each(bbVector.begin(), bbVector.end(), [](double &x) { + if (std::abs(x) <= 1e-4f) + x = 0; + }); std::transform(funcVector.begin(), funcVector.end(), bbVector.begin(), funcVector.begin(), std::plus()); @@ -864,11 +865,11 @@ void IR2Vec_FA::getPartialVec( return; } - Vector instVector(DIM, 0); + Vector instVector(DIM, 0), opcode_vec; StringRef opcodeName = I.getOpcodeName(); - auto vec = getValue(opcodeName.str()); + getValue(opcodeName.str(), opcode_vec); IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + std::transform(instVector.begin(), instVector.end(), opcode_vec.begin(), instVector.begin(), std::plus()); partialInstValMap[&I] = instVector; @@ -878,38 +879,39 @@ void IR2Vec_FA::getPartialVec( i.first->print(outs()); outs() << "\n"; }); - auto type = I.getType(); + auto type = I.getType(); + Vector type_vec; if (type->isVoidTy()) { - vec = getValue("voidTy"); + getValue("voidTy", type_vec); } else if (type->isFloatingPointTy()) { - vec = getValue("floatTy"); + getValue("floatTy", type_vec); } else if (type->isIntegerTy()) { - vec = getValue("integerTy"); + getValue("integerTy", type_vec); } else if (type->isFunctionTy()) { - vec = getValue("functionTy"); + getValue("functionTy", type_vec); } else if (type->isStructTy()) { - vec = getValue("structTy"); + getValue("structTy", type_vec); } else if (type->isArrayTy()) { - vec = getValue("arrayTy"); + getValue("arrayTy", type_vec); } else if (type->isPointerTy()) { - vec = getValue("pointerTy"); + getValue("pointerTy", type_vec); } else if (type->isVectorTy()) { - vec = getValue("vectorTy"); + getValue("vectorTy", type_vec); } else if (type->isEmptyTy()) { - vec = getValue("emptyTy"); + getValue("emptyTy", type_vec); } else if (type->isLabelTy()) { - vec = getValue("labelTy"); + getValue("labelTy", type_vec); } else if (type->isTokenTy()) { - vec = getValue("tokenTy"); + getValue("tokenTy", type_vec); } else if (type->isMetadataTy()) { - vec = getValue("metadataTy"); + getValue("metadataTy", type_vec); } else { - vec = getValue("unknownTy"); + getValue("unknownTy", type_vec); } - scaleVector(vec, WT); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + scaleVector(type_vec, WT); + std::transform(instVector.begin(), instVector.end(), type_vec.begin(), instVector.begin(), std::plus()); partialInstValMap[&I] = instVector; @@ -940,7 +942,8 @@ void IR2Vec_FA::solveInsts( B.push_back(tmp); for (unsigned i = 0; i < inst->getNumOperands(); i++) { if (isa(inst->getOperand(i))) { - auto f = getValue("function"); + Vector f; + getValue("function", f); if (isa(inst)) { auto ci = dyn_cast(inst); Function *func = ci->getCalledFunction(); @@ -965,7 +968,8 @@ void IR2Vec_FA::solveInsts( B.push_back(vec); } else if (isa(inst->getOperand(i)) && !isa(inst->getOperand(i)->getType())) { - auto c = getValue("constant"); + Vector c; + getValue("constant", c); auto svtmp = c; scaleVector(svtmp, WA); std::vector vtmp(svtmp.begin(), svtmp.end()); @@ -978,7 +982,8 @@ void IR2Vec_FA::solveInsts( IR2VEC_DEBUG(outs() << vec.back() << "\n"); B.push_back(vec); } else if (isa(inst->getOperand(i))) { - auto l = getValue("label"); + Vector l; + getValue("label", l); auto svtmp = l; scaleVector(svtmp, WA); std::vector vtmp(svtmp.begin(), svtmp.end()); @@ -1022,7 +1027,8 @@ void IR2Vec_FA::solveInsts( } } } else if (isa(inst->getOperand(i)->getType())) { - auto l = getValue("pointer"); + Vector l; + getValue("pointer", l); auto svtmp = l; scaleVector(svtmp, WA); std::vector vtmp(svtmp.begin(), svtmp.end()); @@ -1035,7 +1041,8 @@ void IR2Vec_FA::solveInsts( IR2VEC_DEBUG(outs() << vec.back() << "\n"); B.push_back(vec); } else { - auto l = getValue("variable"); + Vector l; + getValue("variable", l); auto svtmp = l; scaleVector(svtmp, WA); std::vector vtmp(svtmp.begin(), svtmp.end()); @@ -1137,9 +1144,9 @@ void IR2Vec_FA::solveSingleComponent( RDList.clear(); for (unsigned i = 0; i < I.getNumOperands() /*&& !isCyclic*/; i++) { - Vector vecOp(DIM, 0); + Vector vecOp; if (isa(I.getOperand(i))) { - vecOp = getValue("function"); + getValue("function", vecOp); if (isa(I)) { auto ci = dyn_cast(&I); Function *func = ci->getCalledFunction(); @@ -1156,17 +1163,17 @@ void IR2Vec_FA::solveSingleComponent( // non-numeric/alphabetic constants are also caught as pointer types else if (isa(I.getOperand(i)) && !isa(I.getOperand(i)->getType())) { - vecOp = getValue("constant"); + getValue("constant", vecOp); } else if (isa(I.getOperand(i))) { - vecOp = getValue("label"); + getValue("label", vecOp); } else { if (isa(I.getOperand(i))) { auto RD = getReachingDefs(&I, i); RDList.insert(RDList.end(), RD.begin(), RD.end()); } else if (isa(I.getOperand(i)->getType())) { - vecOp = getValue("pointer"); + getValue("pointer", vecOp); } else - vecOp = getValue("variable"); + getValue("variable", vecOp); } std::transform(VecArgs.begin(), VecArgs.end(), vecOp.begin(), @@ -1237,11 +1244,11 @@ void IR2Vec_FA::inst2Vec( return; } - Vector instVector(DIM, 0); + Vector instVector(DIM, 0), opcode_vec; StringRef opcodeName = I.getOpcodeName(); - auto vec = getValue(opcodeName.str()); + getValue(opcodeName.str(), opcode_vec); IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + std::transform(instVector.begin(), instVector.end(), opcode_vec.begin(), instVector.begin(), std::plus()); partialInstValMap[&I] = instVector; @@ -1253,36 +1260,36 @@ void IR2Vec_FA::inst2Vec( }); auto type = I.getType(); - + Vector type_vec; if (type->isVoidTy()) { - vec = getValue("voidTy"); + getValue("voidTy", type_vec); } else if (type->isFloatingPointTy()) { - vec = getValue("floatTy"); + getValue("floatTy", type_vec); } else if (type->isIntegerTy()) { - vec = getValue("integerTy"); + getValue("integerTy", type_vec); } else if (type->isFunctionTy()) { - vec = getValue("functionTy"); + getValue("functionTy", type_vec); } else if (type->isStructTy()) { - vec = getValue("structTy"); + getValue("structTy", type_vec); } else if (type->isArrayTy()) { - vec = getValue("arrayTy"); + getValue("arrayTy", type_vec); } else if (type->isPointerTy()) { - vec = getValue("pointerTy"); + getValue("pointerTy", type_vec); } else if (type->isVectorTy()) { - vec = getValue("vectorTy"); + getValue("vectorTy", type_vec); } else if (type->isEmptyTy()) { - vec = getValue("emptyTy"); + getValue("emptyTy", type_vec); } else if (type->isLabelTy()) { - vec = getValue("labelTy"); + getValue("labelTy", type_vec); } else if (type->isTokenTy()) { - vec = getValue("tokenTy"); + getValue("tokenTy", type_vec); } else if (type->isMetadataTy()) { - vec = getValue("metadataTy"); + getValue("metadataTy", type_vec); } else { - vec = getValue("unknownTy"); + getValue("unknownTy", type_vec); } - scaleVector(vec, WT); - std::transform(instVector.begin(), instVector.end(), vec.begin(), + scaleVector(type_vec, WT); + std::transform(instVector.begin(), instVector.end(), type_vec.begin(), instVector.begin(), std::plus()); partialInstValMap[&I] = instVector; @@ -1295,9 +1302,9 @@ void IR2Vec_FA::inst2Vec( RDList.clear(); for (unsigned i = 0; i < I.getNumOperands() /*&& !isCyclic*/; i++) { - Vector vecOp(DIM, 0); + Vector vecOp; if (isa(I.getOperand(i))) { - vecOp = getValue("function"); + getValue("function", vecOp); if (isa(I)) { auto ci = dyn_cast(&I); Function *func = ci->getCalledFunction(); @@ -1314,17 +1321,17 @@ void IR2Vec_FA::inst2Vec( // non-numeric/alphabetic constants are also caught as pointer types else if (isa(I.getOperand(i)) && !isa(I.getOperand(i)->getType())) { - vecOp = getValue("constant"); + getValue("constant", vecOp); } else if (isa(I.getOperand(i))) { - vecOp = getValue("label"); + getValue("label", vecOp); } else { if (isa(I.getOperand(i))) { auto RD = getReachingDefs(&I, i); RDList.insert(RDList.end(), RD.begin(), RD.end()); } else if (isa(I.getOperand(i)->getType())) - vecOp = getValue("pointer"); + getValue("pointer", vecOp); else - vecOp = getValue("variable"); + getValue("variable", vecOp); } std::transform(VecArgs.begin(), VecArgs.end(), vecOp.begin(), diff --git a/src/Symbolic.cpp b/src/Symbolic.cpp index 8d8cd9d75..bc7ef6e6a 100644 --- a/src/Symbolic.cpp +++ b/src/Symbolic.cpp @@ -28,7 +28,6 @@ using namespace IR2Vec; using abi::__cxa_demangle; bool IR2Vec_Symbolic::getValue(std::string key, IR2Vec::Vector &out) { - Vector vec(DIM, 0); if (auto it = vocabulary.find(std::string(key)); it != vocabulary.end()) { out = it->second; return true; @@ -67,7 +66,7 @@ void IR2Vec_Symbolic::generateSymbolicEncodings(std::ostream *o) { for (auto i : pgmVector) { if (std::abs(i) <= 1e-4f) - i = 0.0f; + i = 0; res += std::to_string(i) + "\t"; } res += "\n"; diff --git a/src/include/FlowAware.h b/src/include/FlowAware.h index 43f20ca06..2594924a4 100644 --- a/src/include/FlowAware.h +++ b/src/include/FlowAware.h @@ -72,7 +72,7 @@ class IR2Vec_FA { void getAllSCC(); - IR2Vec::Vector getValue(std::string key); + bool getValue(std::string key, IR2Vec::Vector &out); void collectWriteDefsMap(llvm::Module &M); void getTransitiveUse( const llvm::Instruction *root, const llvm::Instruction *def, diff --git a/src/utils.cpp b/src/utils.cpp index 611a013d5..cf88b10b6 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -86,7 +86,7 @@ std::string IR2Vec::updatedRes(IR2Vec::Vector &tmp, llvm::Function *f, res += "=\t"; for (auto i : tmp) { if (std::abs(i) <= 1e-4f) - i = 0.0f; + i = 0; res += std::to_string(i) + "\t"; } From 58b9a4bdbb0dc6b2e93b88bafe8f6cdc8cb5283e Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Wed, 20 Aug 2025 16:44:14 +0530 Subject: [PATCH 08/12] Changes for a basic pybind11 skeleton code setup --- CMakeLists.txt | 15 +++++++++++++-- bindings/CMakeLists.txt | 42 +++++++++++++++++++++++++++++++++++++++++ bindings/py_module.cpp | 33 ++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 bindings/CMakeLists.txt create mode 100644 bindings/py_module.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index bc46bbf68..7aeb771f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,8 +18,19 @@ endif() set(CMAKE_CXX_STANDARD 17 CACHE STRING "") # LLVM is normally built without RTTI. Be consistent with that. -if(NOT LLVM_ENABLE_RTTI) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +if (NOT LLVM_ENABLE_RTTI) + if (TARGET IR2Vec) + target_compile_options(IR2Vec PRIVATE -fno-rtti) + endif() + if (TARGET IR2Vec_Static) + target_compile_options(IR2Vec_Static PRIVATE -fno-rtti) + endif() endif() add_subdirectory(src) + +# ---- NEW: toggle Python bindings build ---- +option(BUILD_PYTHON "Build the pybind11 Python extension" ON) +if (BUILD_PYTHON) + add_subdirectory(bindings) +endif() diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt new file mode 100644 index 000000000..f61e8f08b --- /dev/null +++ b/bindings/CMakeLists.txt @@ -0,0 +1,42 @@ +# bindings/CMakeLists.txt +cmake_minimum_required(VERSION 3.13) + +find_package(pybind11 CONFIG REQUIRED) +pybind11_add_module(_core py_module.cpp) + +# Headers from src tree (public ones) +target_include_directories(_core + PRIVATE + ${CMAKE_SOURCE_DIR}/src/include + ${CMAKE_BINARY_DIR}/src/include # where generated headers usually land +) + +# Link to your IR2Vec lib target if present, and inherit its usage reqs +set(_ir2vec_tgt "") +if (TARGET ${IR2VEC_LIB}) + set(_ir2vec_tgt ${IR2VEC_LIB}) +elseif (TARGET ${IR2VEC_LIB_STATIC}) + set(_ir2vec_tgt ${IR2VEC_LIB_STATIC}) +endif() + +if (_ir2vec_tgt) + target_link_libraries(_core PRIVATE ${_ir2vec_tgt}) + + # Ensure generation runs before compiling _core + add_dependencies(_core ${_ir2vec_tgt}) + + # Inherit the library target's include dirs & compile defs (covers generated dirs) + target_include_directories(_core PRIVATE + $) + target_compile_definitions(_core PRIVATE + $) +else() + message(WARNING "IR2Vec library target not found; _core may miss generated headers.") +endif() + +if(UNIX AND NOT APPLE) + target_compile_options(_core PRIVATE -fvisibility=hidden) +endif() + +# Install into a Python package dir +install(TARGETS _core LIBRARY DESTINATION ir2vec) diff --git a/bindings/py_module.cpp b/bindings/py_module.cpp new file mode 100644 index 000000000..160e1d19c --- /dev/null +++ b/bindings/py_module.cpp @@ -0,0 +1,33 @@ +// bindings/py_module.cpp +#include +#include + +// Pull in your IR2Vec public headers: +#include "IR2Vec.h" +#include "utils.h" + +namespace py = pybind11; + +PYBIND11_MODULE(_core, m) { + m.doc() = "IR2Vec Python bindings (pybind11 skeleton)"; + + // Simple canary to prove the module loads + m.def("ping", [] { return std::string("ir2vec bindings alive"); }); + + // Example: expose a version string if you have one + // (Uncomment if you have a configured header like version.h) + // m.def("version", [] { return std::string(IR2VEC_VERSION_STRING); }); + + // TODO: add your real API. For example (adjust signatures to your API): + // + // m.def("get_program_vector", + // [](const std::string& ir_path) { + // py::gil_scoped_release release; + // return getProgramVector(ir_path); // declared in IR2Vec.h + // }, + // py::arg("ir_path"), + // "Compute program vector for an LLVM IR file"); + // + // m.def("get_instruction_vectors", + // &getInstructionVectors, py::arg("ir_path")); +} From fc8849346393f40c5e25fecb9d1a3e6973565cee Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Thu, 21 Aug 2025 11:27:26 +0530 Subject: [PATCH 09/12] debug commit - wrapper c++ code for ir2vec api, pybind module --- bindings/py_module.cpp | 122 ++++++++++++++++++++++++++++++++++++++--- src/include/IR2Vec.h | 14 +++-- src/libIR2Vec.cpp | 24 ++++++-- 3 files changed, 141 insertions(+), 19 deletions(-) diff --git a/bindings/py_module.cpp b/bindings/py_module.cpp index 160e1d19c..f9020c610 100644 --- a/bindings/py_module.cpp +++ b/bindings/py_module.cpp @@ -5,21 +5,127 @@ // Pull in your IR2Vec public headers: #include "IR2Vec.h" #include "utils.h" +#include "version.h" +#include +#include +#include namespace py = pybind11; +class IR2VecHandler { +private: + std::string fileName; + std::string outputFile; + std::string mode; + std::string level; + IR2Vec::Embeddings *emb = nullptr; + unsigned dim; + +public: + IR2VecHandler(std::string fileName, std::string outputFile, std::string mode, + std::string level, unsigned dim) + : fileName(std::move(fileName)), outputFile(std::move(outputFile)), + mode(std::move(mode)), level(std::move(level)), + emb(new IR2Vec::Embeddings()), dim(dim) {} + + ~IR2VecHandler() { delete emb; } + + std::string getFile() { return fileName; } + std::string getOutputFile() { return outputFile; } + std::string getMode() { return mode; } + std::string getLevel() { return level; } + + IR2Vec::Vector getProgramVector() const { return emb->getProgramVector(); } + + llvm::SmallMapVector & + getInstVecMap() { + return emb->getInstVecMap(); + } + + llvm::SmallMapVector & + getFunctionVecMap() { + return emb->getFunctionVecMap(); + } + + bool fileNotValid(std::string filename) { + std::ifstream temp; + temp.open(filename, std::ios_base::in); + if (temp.peek() == std::ifstream::traits_type::eof() || + temp.bad() == true || temp.fail() == true) { + return true; + } + temp.close(); + return false; + } + + void initEncodings(std::string function_name = "") { + IR2Vec::iname = fileName; + IR2Vec::IR2VecMode ir2vecMode = + (mode == std::string("sym") ? IR2Vec::Symbolic : IR2Vec::FlowAware); + + std::unique_ptr Module = IR2Vec::getLLVMIR(); + + emb = std::make_unique(*Module, ir2vecMode, level.at(0), + outputFile, dim, function_name) + .get(); + + if (!emb) { + throw std::runtime_error("Failed to create embeddings"); + } + } +}; + +IR2VecHandler *createIR2VecObject(std::string filename, std::string output_file, + std::string mode, std::string level, + unsigned dim) { + IR2VecHandler *ir2vecObj = + new IR2VecHandler(filename, output_file, mode, level, dim); + if (!ir2vecObj) { + throw std::runtime_error("Failed to Create embeddings"); + } + return ir2vecObj; +} + +IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", + std::string level = "", + std::string output_file = "", + std::string function_name = "", + unsigned dim = 300) { + + if (fileNotValid(filename)) + throw std::runtime_error("Invalid File Path"); + + if (not output_file.empty()) + if (fileNotValid(output_file)) + throw std::runtime_error("Invalid Output File Path"); + + if (not(mode == std::string("sym") or mode == std::string("fa"))) + throw std::runtime_error( + "Eroneous mode entered . Either of sym, fa should be " + "specified"); + + if (not(level.at(0) == 'p' or level.at(0) == 'f')) + throw std::runtime_error("Invalid level specified: Use either p or f"); + + IR2VecHandler *ir2vecObj = + createIR2VecObject(filename, output_file, mode, level, dim); + + ir2vecObj->initEncodings(function_name); + + return ir2vecObj; +} + PYBIND11_MODULE(_core, m) { - m.doc() = "IR2Vec Python bindings (pybind11 skeleton)"; + m.doc() = R"pbdoc( + IR2Vec Python bindings. - // Simple canary to prove the module loads - m.def("ping", [] { return std::string("ir2vec bindings alive"); }); + Exposes selected APIs and utilities from the IR2Vec C++ library. + )pbdoc"; - // Example: expose a version string if you have one - // (Uncomment if you have a configured header like version.h) - // m.def("version", [] { return std::string(IR2VEC_VERSION_STRING); }); + m.def( + "getVersion", [] { return std::string(IR2VEC_VERSION); }, + "Get IR2Vec Version"); - // TODO: add your real API. For example (adjust signatures to your API): - // // m.def("get_program_vector", // [](const std::string& ir_path) { // py::gil_scoped_release release; diff --git a/src/include/IR2Vec.h b/src/include/IR2Vec.h index e0b39503e..b9926fad5 100644 --- a/src/include/IR2Vec.h +++ b/src/include/IR2Vec.h @@ -21,7 +21,7 @@ enum IR2VecMode { FlowAware, Symbolic }; class Embeddings { int generateEncodings(llvm::Module &M, IR2VecMode mode, char level = '\0', std::string funcName = "", unsigned dim = 300, - std::ostream *o = nullptr, int cls = -1, float WO = 1, + std::string outputFile = "", int cls = -1, float WO = 1, float WA = 0.2, float WT = 0.5); llvm::SmallMapVector instVecMap; @@ -36,17 +36,19 @@ class Embeddings { std::string funcName = "", float WO = 1, float WA = 0.2, float WT = 0.5) { vocabulary = VocabularyFactory::createVocabulary(dim)->getVocabulary(); - generateEncodings(M, mode, '\0', funcName, dim, nullptr, -1, WO, WA, WT); + generateEncodings(M, mode, '\0', funcName, dim, "", -1, WO, WA, WT); } // Use this constructor if the representations ought to be written to a // file. Analogous to the command line options that are being used in IR2Vec // binary. - Embeddings(llvm::Module &M, IR2VecMode mode, char level, std::ostream *o, - unsigned dim = 300, std::string funcName = "", float WO = 1, - float WA = 0.2, float WT = 0.5) { + Embeddings(llvm::Module &M, IR2VecMode mode, char level, + std::string outputFile = "", unsigned dim = 300, + std::string funcName = "", float WO = 1, float WA = 0.2, + float WT = 0.5) { vocabulary = VocabularyFactory::createVocabulary(dim)->getVocabulary(); - generateEncodings(M, mode, level, funcName, dim, o, -1, WO, WA, WT); + generateEncodings(M, mode, level, funcName, dim, outputFile, -1, WO, WA, + WT); } // Returns a map containing instructions and the corresponding vector diff --git a/src/libIR2Vec.cpp b/src/libIR2Vec.cpp index e62346f33..8e00c583c 100644 --- a/src/libIR2Vec.cpp +++ b/src/libIR2Vec.cpp @@ -18,8 +18,8 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, IR2Vec::IR2VecMode mode, char level, std::string funcName, unsigned dim, - std::ostream *o, int cls, float WO, - float WA, float WT) { + std::string outputFile, int cls, + float WO, float WA, float WT) { IR2Vec::level = level; IR2Vec::cls = cls; @@ -29,15 +29,29 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, IR2Vec::funcName = funcName; IR2Vec::DIM = dim; + std::optional outStream; + std::ostream *os = [&]() -> std::ostream * { + if (outputFile.empty()) { + outStream.reset(); + return nullptr; + } + + outStream.emplace(outputFile, std::ios_base::app); + if (!outStream->is_open()) + throw std::runtime_error("Failed to open " + outputFile); + + return std::addressof(outStream.value()); + }(); + if (mode == IR2Vec::IR2VecMode::FlowAware && !funcName.empty()) { IR2Vec_FA FA(M, vocabulary); - FA.generateFlowAwareEncodingsForFunction(o, funcName); + FA.generateFlowAwareEncodingsForFunction(os, funcName); instVecMap = FA.getInstVecMap(); funcVecMap = FA.getFuncVecMap(); bbVecMap = FA.getBBVecMap(); } else if (mode == IR2Vec::IR2VecMode::FlowAware) { IR2Vec_FA FA(M, vocabulary); - FA.generateFlowAwareEncodings(o); + FA.generateFlowAwareEncodings(os); instVecMap = FA.getInstVecMap(); funcVecMap = FA.getFuncVecMap(); bbVecMap = FA.getBBVecMap(); @@ -50,7 +64,7 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, bbVecMap = SYM.getBBVecMap(); } else if (mode == IR2Vec::IR2VecMode::Symbolic) { IR2Vec_Symbolic SYM(M, vocabulary); - SYM.generateSymbolicEncodings(o); + SYM.generateSymbolicEncodings(os); instVecMap = SYM.getInstVecMap(); funcVecMap = SYM.getFuncVecMap(); bbVecMap = SYM.getBBVecMap(); From b0547bb3fceca09f16f32d49afe4eb814278c2f6 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Thu, 21 Aug 2025 12:19:54 +0530 Subject: [PATCH 10/12] First Draft - pybind bindings for IR2Vec --- bindings/py_module.cpp | 139 ++++++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/bindings/py_module.cpp b/bindings/py_module.cpp index f9020c610..6f6265df4 100644 --- a/bindings/py_module.cpp +++ b/bindings/py_module.cpp @@ -12,23 +12,38 @@ namespace py = pybind11; +bool fileNotValid(std::string filename) { + std::ifstream temp; + temp.open(filename, std::ios_base::in); + if (temp.peek() == std::ifstream::traits_type::eof() || temp.bad() == true || + temp.fail() == true) { + return true; + } + temp.close(); + return false; +} + class IR2VecHandler { private: std::string fileName; std::string outputFile; std::string mode; std::string level; - IR2Vec::Embeddings *emb = nullptr; - unsigned dim; + std::unique_ptr emb; + unsigned dim = 300; public: IR2VecHandler(std::string fileName, std::string outputFile, std::string mode, std::string level, unsigned dim) : fileName(std::move(fileName)), outputFile(std::move(outputFile)), mode(std::move(mode)), level(std::move(level)), - emb(new IR2Vec::Embeddings()), dim(dim) {} + emb(std::make_unique()), dim(dim) {} - ~IR2VecHandler() { delete emb; } + ~IR2VecHandler() = default; + IR2VecHandler(const IR2VecHandler &) = delete; + IR2VecHandler &operator=(const IR2VecHandler &) = delete; + IR2VecHandler(IR2VecHandler &&) noexcept = default; + IR2VecHandler &operator=(IR2VecHandler &&) noexcept = default; std::string getFile() { return fileName; } std::string getOutputFile() { return outputFile; } @@ -47,18 +62,7 @@ class IR2VecHandler { return emb->getFunctionVecMap(); } - bool fileNotValid(std::string filename) { - std::ifstream temp; - temp.open(filename, std::ios_base::in); - if (temp.peek() == std::ifstream::traits_type::eof() || - temp.bad() == true || temp.fail() == true) { - return true; - } - temp.close(); - return false; - } - - void initEncodings(std::string function_name = "") { + void generateEmbeddings(std::string function_name = "") { IR2Vec::iname = fileName; IR2Vec::IR2VecMode ir2vecMode = (mode == std::string("sym") ? IR2Vec::Symbolic : IR2Vec::FlowAware); @@ -66,26 +70,13 @@ class IR2VecHandler { std::unique_ptr Module = IR2Vec::getLLVMIR(); emb = std::make_unique(*Module, ir2vecMode, level.at(0), - outputFile, dim, function_name) - .get(); - + outputFile, dim, function_name); if (!emb) { throw std::runtime_error("Failed to create embeddings"); } } }; -IR2VecHandler *createIR2VecObject(std::string filename, std::string output_file, - std::string mode, std::string level, - unsigned dim) { - IR2VecHandler *ir2vecObj = - new IR2VecHandler(filename, output_file, mode, level, dim); - if (!ir2vecObj) { - throw std::runtime_error("Failed to Create embeddings"); - } - return ir2vecObj; -} - IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", std::string level = "", std::string output_file = "", @@ -108,9 +99,12 @@ IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", throw std::runtime_error("Invalid level specified: Use either p or f"); IR2VecHandler *ir2vecObj = - createIR2VecObject(filename, output_file, mode, level, dim); + new IR2VecHandler(filename, output_file, mode, level, dim); + if (!ir2vecObj) { + throw std::runtime_error("Failed to Create embeddings"); + } - ir2vecObj->initEncodings(function_name); + ir2vecObj->generateEmbeddings(function_name); return ir2vecObj; } @@ -122,18 +116,79 @@ PYBIND11_MODULE(_core, m) { Exposes selected APIs and utilities from the IR2Vec C++ library. )pbdoc"; + m.attr("__version__") = IR2VEC_VERSION; m.def( "getVersion", [] { return std::string(IR2VEC_VERSION); }, "Get IR2Vec Version"); - // m.def("get_program_vector", - // [](const std::string& ir_path) { - // py::gil_scoped_release release; - // return getProgramVector(ir_path); // declared in IR2Vec.h - // }, - // py::arg("ir_path"), - // "Compute program vector for an LLVM IR file"); - // - // m.def("get_instruction_vectors", - // &getInstructionVectors, py::arg("ir_path")); + m.def( + "initEmbeddings", + [](const std::string &filename, const std::string &mode, + const std::string &level, const std::string &output_file = "", + const std::string &function_name = "", unsigned dim = 300) { + py::gil_scoped_release release; + IR2VecHandler *ptr = initEmbedding(filename, mode, level, output_file, + function_name, dim); + return ptr; + }, + py::arg("filename"), py::arg("mode"), py::arg("level"), + py::arg("output_file") = "", py::arg("function_name") = "", + py::arg("dim") = 300, py::return_value_policy::take_ownership, + R"pbdoc( + Create an IR2VecHandler by invoking the C++ initEmbedding() factory. + Runs validation and generates embeddings before returning the object. + )pbdoc"); + + py::class_(m, "IR2VecHandler") + // (constructor binding optional, since users will usually call + // initEmbeddings) + .def(py::init(), + py::arg("filename"), py::arg("output_file"), py::arg("mode"), + py::arg("level"), py::arg("dim")) + + .def("generateEmbeddings", &IR2VecHandler::generateEmbeddings, + py::arg("function_name") = std::string{}, + py::call_guard()) + + .def("getProgVector", &IR2VecHandler::getProgramVector, + py::call_guard(), + R"pbdoc(Return the program vector as a list of floats.)pbdoc") + + // getFuncVectorMap() -> dict[str, list[float]] + .def( + "getFuncVectorMap", + [](IR2VecHandler &self) { + auto &map = self.getFunctionVecMap(); + py::dict out; + for (const auto &kv : map) { + const llvm::Function *F = kv.first; + if (!F) + continue; + out[py::str((F->getName()).data())] = + kv.second; // std::vector -> list[float] + // TODO :: check if F->getName() is sufficient + // or some other demangled name methods are needed + } + return out; + }, + py::call_guard(), + R"pbdoc(Return {function_name: vector} as a dict.)pbdoc") + + .def( + "getInstVectorMap", + [](IR2VecHandler &self) { + auto &map = self.getInstVecMap(); + py::dict out; + for (const auto &kv : map) { + const llvm::Instruction *I = kv.first; + if (!I) + continue; + out[py::str(I->getOpcodeName())] = + kv.second; // tweak key if you want + } + return out; + }, + py::call_guard(), + R"pbdoc(Return {instruction_name: vector} as a dict.)pbdoc"); } From bbd0e8f106682992954d4f649c4930c9ae83f134 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Thu, 21 Aug 2025 20:19:00 +0530 Subject: [PATCH 11/12] Debug commit - failing build --- bindings/CMakeLists.txt | 20 ++++++++++---------- bindings/py_module.cpp | 2 +- src/libIR2Vec.cpp | 3 +++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index f61e8f08b..015cbe5a8 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -2,10 +2,10 @@ cmake_minimum_required(VERSION 3.13) find_package(pybind11 CONFIG REQUIRED) -pybind11_add_module(_core py_module.cpp) +pybind11_add_module(pr2vec py_module.cpp) # Headers from src tree (public ones) -target_include_directories(_core +target_include_directories(pr2vec PRIVATE ${CMAKE_SOURCE_DIR}/src/include ${CMAKE_BINARY_DIR}/src/include # where generated headers usually land @@ -20,23 +20,23 @@ elseif (TARGET ${IR2VEC_LIB_STATIC}) endif() if (_ir2vec_tgt) - target_link_libraries(_core PRIVATE ${_ir2vec_tgt}) + target_link_libraries(pr2vec PRIVATE ${_ir2vec_tgt}) - # Ensure generation runs before compiling _core - add_dependencies(_core ${_ir2vec_tgt}) + # Ensure generation runs before compiling pr2vec + add_dependencies(pr2vec ${_ir2vec_tgt}) # Inherit the library target's include dirs & compile defs (covers generated dirs) - target_include_directories(_core PRIVATE + target_include_directories(pr2vec PRIVATE $) - target_compile_definitions(_core PRIVATE + target_compile_definitions(pr2vec PRIVATE $) else() - message(WARNING "IR2Vec library target not found; _core may miss generated headers.") + message(WARNING "IR2Vec library target not found; pr2vec may miss generated headers.") endif() if(UNIX AND NOT APPLE) - target_compile_options(_core PRIVATE -fvisibility=hidden) + target_compile_options(pr2vec PRIVATE -fvisibility=hidden) endif() # Install into a Python package dir -install(TARGETS _core LIBRARY DESTINATION ir2vec) +install(TARGETS pr2vec LIBRARY DESTINATION ir2vec) diff --git a/bindings/py_module.cpp b/bindings/py_module.cpp index 6f6265df4..5c3912441 100644 --- a/bindings/py_module.cpp +++ b/bindings/py_module.cpp @@ -109,7 +109,7 @@ IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", return ir2vecObj; } -PYBIND11_MODULE(_core, m) { +PYBIND11_MODULE(pr2vec, m) { m.doc() = R"pbdoc( IR2Vec Python bindings. diff --git a/src/libIR2Vec.cpp b/src/libIR2Vec.cpp index 8e00c583c..47b80d347 100644 --- a/src/libIR2Vec.cpp +++ b/src/libIR2Vec.cpp @@ -14,6 +14,9 @@ #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" +#include +#include +#include int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, IR2Vec::IR2VecMode mode, char level, From a08fddafd5e45d896c21cf22de3896fb4978eee7 Mon Sep 17 00:00:00 2001 From: nishant_sachdeva Date: Fri, 22 Aug 2025 20:14:53 +0530 Subject: [PATCH 12/12] Debug STA commit - compilation issues solved, segfault issue with FA, and SYM-maps --- CMakeLists.txt | 7 +--- bindings/CMakeLists.txt | 42 --------------------- src/CMakeLists.txt | 8 +++- src/FlowAware.cpp | 13 +++++++ src/bindings/CMakeLists.txt | 48 ++++++++++++++++++++++++ {bindings => src/bindings}/py_module.cpp | 43 +++++++++++++++------ src/include/IR2Vec.h | 8 +++- src/libIR2Vec.cpp | 18 +++++++++ 8 files changed, 123 insertions(+), 64 deletions(-) delete mode 100644 bindings/CMakeLists.txt create mode 100644 src/bindings/CMakeLists.txt rename {bindings => src/bindings}/py_module.cpp (86%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7aeb771f1..480d2ea51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ endif() set(CMAKE_CXX_STANDARD 17 CACHE STRING "") + # LLVM is normally built without RTTI. Be consistent with that. if (NOT LLVM_ENABLE_RTTI) if (TARGET IR2Vec) @@ -28,9 +29,3 @@ if (NOT LLVM_ENABLE_RTTI) endif() add_subdirectory(src) - -# ---- NEW: toggle Python bindings build ---- -option(BUILD_PYTHON "Build the pybind11 Python extension" ON) -if (BUILD_PYTHON) - add_subdirectory(bindings) -endif() diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt deleted file mode 100644 index 015cbe5a8..000000000 --- a/bindings/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# bindings/CMakeLists.txt -cmake_minimum_required(VERSION 3.13) - -find_package(pybind11 CONFIG REQUIRED) -pybind11_add_module(pr2vec py_module.cpp) - -# Headers from src tree (public ones) -target_include_directories(pr2vec - PRIVATE - ${CMAKE_SOURCE_DIR}/src/include - ${CMAKE_BINARY_DIR}/src/include # where generated headers usually land -) - -# Link to your IR2Vec lib target if present, and inherit its usage reqs -set(_ir2vec_tgt "") -if (TARGET ${IR2VEC_LIB}) - set(_ir2vec_tgt ${IR2VEC_LIB}) -elseif (TARGET ${IR2VEC_LIB_STATIC}) - set(_ir2vec_tgt ${IR2VEC_LIB_STATIC}) -endif() - -if (_ir2vec_tgt) - target_link_libraries(pr2vec PRIVATE ${_ir2vec_tgt}) - - # Ensure generation runs before compiling pr2vec - add_dependencies(pr2vec ${_ir2vec_tgt}) - - # Inherit the library target's include dirs & compile defs (covers generated dirs) - target_include_directories(pr2vec PRIVATE - $) - target_compile_definitions(pr2vec PRIVATE - $) -else() - message(WARNING "IR2Vec library target not found; pr2vec may miss generated headers.") -endif() - -if(UNIX AND NOT APPLE) - target_compile_options(pr2vec PRIVATE -fvisibility=hidden) -endif() - -# Install into a Python package dir -install(TARGETS pr2vec LIBRARY DESTINATION ir2vec) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bdeade059..8afe29c5d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -73,8 +73,6 @@ if(NOT LLVM_IR2VEC) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") - - include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) # llvm_map_components_to_libnames(llvm_libs all) llvm_map_components_to_libnames(llvm_libs support core irreader analysis TransformUtils) @@ -92,6 +90,12 @@ if(NOT LLVM_IR2VEC) add_library(${IR2VEC_LIB} SHARED $) add_library(${IR2VEC_LIB_STATIC} STATIC $) + # Link LLVM to the libs (PUBLIC so dependents inherit) + # target_link_libraries(${IR2VEC_LIB} PUBLIC ${llvm_libs}) + # target_link_libraries(${IR2VEC_LIB_STATIC} PUBLIC ${llvm_libs}) + + add_subdirectory(bindings) + set_target_properties(${IR2VEC_LIB} ${IR2VEC_LIB_STATIC} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1 diff --git a/src/FlowAware.cpp b/src/FlowAware.cpp index 43629ec45..d641dd34b 100644 --- a/src/FlowAware.cpp +++ b/src/FlowAware.cpp @@ -134,19 +134,26 @@ void IR2Vec_FA::generateFlowAwareEncodings(std::ostream *o, std::ostream *cyclicCount) { int noOfFunc = 0; + std::cout << "FuncVecMap Function Entered" << std::endl; for (auto &f : M) { + std::cout << "Function F - " << f.getName().str() << std::endl; if (!f.isDeclaration()) { SmallVector funcStack; auto tmp = func2Vec(f, funcStack); + std::cout << "func2Vec call returned " << tmp.size() << std::endl; funcVecMap[&f] = tmp; } } + std::cout << "FuncVecMap init done" << std::endl; + for (auto funcit : funcVecMap) { updateFuncVecMapWithCallee(funcit.first); } + std::cout << "Update Func Vec Map with Callee Done" << std::endl; + for (auto &f : M) { if (!f.isDeclaration()) { Vector tmp; @@ -166,6 +173,8 @@ void IR2Vec_FA::generateFlowAwareEncodings(std::ostream *o, } } + std::cout << "Update Func Vec Map Transform Done" << std::endl; + if (level == 'p') { if (cls != -1) res += std::to_string(cls) + "\t"; @@ -178,9 +187,13 @@ void IR2Vec_FA::generateFlowAwareEncodings(std::ostream *o, res += "\n"; } + std::cout << "Final String Created" << std::endl; + if (o) *o << res; + std::cout << "Written to File" << std::endl; + if (missCount) { std::string missEntry = (M.getSourceFileName() + "\t" + std::to_string(dataMissCounter) + "\n"); diff --git a/src/bindings/CMakeLists.txt b/src/bindings/CMakeLists.txt new file mode 100644 index 000000000..517817e3b --- /dev/null +++ b/src/bindings/CMakeLists.txt @@ -0,0 +1,48 @@ +cmake_minimum_required(VERSION 3.13) + +find_package(pybind11 CONFIG REQUIRED) +# Reuse LLVM already found by src/CMakeLists.txt +# You have llvm_map_components_to_libnames(llvm_libs support core irreader analysis TransformUtils) +# so `llvm_libs` is available here too. + +pybind11_add_module(pr2vec py_module.cpp) + +# Prefer the shared lib if present +set(_ir2vec_tgt "") +if (TARGET ${IR2VEC_LIB}) + set(_ir2vec_tgt ${IR2VEC_LIB}) +elseif (TARGET ${IR2VEC_LIB_STATIC}) + set(_ir2vec_tgt ${IR2VEC_LIB_STATIC}) +endif() + +if (NOT _ir2vec_tgt) + message(FATAL_ERROR "IR2Vec targets not found; ensure add_subdirectory(bindings) runs after libs are defined.") +endif() + +# Link IR2Vec + the same LLVM component libs your exe uses +target_link_libraries(pr2vec PRIVATE ${_ir2vec_tgt} ${llvm_libs}) + +# Ensure generation (objlib) has built before compiling pr2vec (covers generated headers) +add_dependencies(pr2vec ${_ir2vec_tgt}) + +# Inherit includes/defs from IR2Vec (covers ${build}/include and public headers) +target_include_directories(pr2vec PRIVATE + $ + $) + +target_compile_definitions(pr2vec PRIVATE + $ + $) + +# Hide extra symbols in the Python module +if(UNIX AND NOT APPLE) + target_compile_options(pr2vec PRIVATE -fvisibility=hidden) +endif() + +# Optional: help loader during dev runs (adjust to taste) +set_target_properties(pr2vec PROPERTIES + BUILD_RPATH "\$ORIGIN;\$ORIGIN/../lib;${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "\$ORIGIN" +) + +install(TARGETS pr2vec LIBRARY DESTINATION ir2vec) diff --git a/bindings/py_module.cpp b/src/bindings/py_module.cpp similarity index 86% rename from bindings/py_module.cpp rename to src/bindings/py_module.cpp index 5c3912441..6fdfdc91d 100644 --- a/bindings/py_module.cpp +++ b/src/bindings/py_module.cpp @@ -2,10 +2,10 @@ #include #include -// Pull in your IR2Vec public headers: #include "IR2Vec.h" #include "utils.h" #include "version.h" + #include #include #include @@ -64,13 +64,22 @@ class IR2VecHandler { void generateEmbeddings(std::string function_name = "") { IR2Vec::iname = fileName; + std::cout << "Filename settled " << IR2Vec::iname << std::endl; + IR2Vec::IR2VecMode ir2vecMode = (mode == std::string("sym") ? IR2Vec::Symbolic : IR2Vec::FlowAware); + std::cout << "Filename ir2vec mode settled " << ir2vecMode << std::endl; + std::unique_ptr Module = IR2Vec::getLLVMIR(); + std::cout << "Module initialized" << std::endl; + emb = std::make_unique(*Module, ir2vecMode, level.at(0), outputFile, dim, function_name); + + std::cout << "Embedding object created" << std::endl; + if (!emb) { throw std::runtime_error("Failed to create embeddings"); } @@ -83,6 +92,8 @@ IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", std::string function_name = "", unsigned dim = 300) { + std::cout << "TEST function" << std::endl; + if (fileNotValid(filename)) throw std::runtime_error("Invalid File Path"); @@ -98,14 +109,23 @@ IR2VecHandler *initEmbedding(std::string filename = "", std::string mode = "", if (not(level.at(0) == 'p' or level.at(0) == 'f')) throw std::runtime_error("Invalid level specified: Use either p or f"); + std::cout << "Validation Functions done" << std::endl; + IR2VecHandler *ir2vecObj = new IR2VecHandler(filename, output_file, mode, level, dim); + + std::cout << "ir2vec object created" << std::endl; + if (!ir2vecObj) { throw std::runtime_error("Failed to Create embeddings"); } + std::cout << "ir2vec object validated - not nullptr" << std::endl; + ir2vecObj->generateEmbeddings(function_name); + std::cout << "embedding generation done" << std::endl; + return ir2vecObj; } @@ -140,39 +160,39 @@ PYBIND11_MODULE(pr2vec, m) { )pbdoc"); py::class_(m, "IR2VecHandler") - // (constructor binding optional, since users will usually call - // initEmbeddings) .def(py::init(), py::arg("filename"), py::arg("output_file"), py::arg("mode"), py::arg("level"), py::arg("dim")) - .def("generateEmbeddings", &IR2VecHandler::generateEmbeddings, - py::arg("function_name") = std::string{}, - py::call_guard()) - .def("getProgVector", &IR2VecHandler::getProgramVector, py::call_guard(), R"pbdoc(Return the program vector as a list of floats.)pbdoc") - // getFuncVectorMap() -> dict[str, list[float]] .def( "getFuncVectorMap", [](IR2VecHandler &self) { + std::cout << "Entered Function" << std::endl; + auto &map = self.getFunctionVecMap(); + std::cout << "Map fetched" << std::endl; + py::dict out; for (const auto &kv : map) { + std::cout << "Creating elements" << std::endl; const llvm::Function *F = kv.first; if (!F) continue; - out[py::str((F->getName()).data())] = - kv.second; // std::vector -> list[float] + std::cout << F->getName().str() << std::endl; + out[py::str((F->getName()).str())] = py::cast(kv.second); + // std::vector -> list[float] // TODO :: check if F->getName() is sufficient // or some other demangled name methods are needed } + + std::cout << "Dict created" << std::endl; return out; }, - py::call_guard(), R"pbdoc(Return {function_name: vector} as a dict.)pbdoc") .def( @@ -189,6 +209,5 @@ PYBIND11_MODULE(pr2vec, m) { } return out; }, - py::call_guard(), R"pbdoc(Return {instruction_name: vector} as a dict.)pbdoc"); } diff --git a/src/include/IR2Vec.h b/src/include/IR2Vec.h index b9926fad5..b224badeb 100644 --- a/src/include/IR2Vec.h +++ b/src/include/IR2Vec.h @@ -11,6 +11,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/IR/Module.h" +#include #include #include "Vocabulary.h" @@ -46,9 +47,12 @@ class Embeddings { std::string outputFile = "", unsigned dim = 300, std::string funcName = "", float WO = 1, float WA = 0.2, float WT = 0.5) { + std::cout << "Embeddings constructor called " << std::endl; vocabulary = VocabularyFactory::createVocabulary(dim)->getVocabulary(); - generateEncodings(M, mode, level, funcName, dim, outputFile, -1, WO, WA, - WT); + std::cout << "Vocabulary constructor called " << std::endl; + int ret_val = generateEncodings(M, mode, level, funcName, dim, outputFile, + -1, WO, WA, WT); + std::cout << "Encoding generation done " << ret_val << std::endl; } // Returns a map containing instructions and the corresponding vector diff --git a/src/libIR2Vec.cpp b/src/libIR2Vec.cpp index 47b80d347..5207f3146 100644 --- a/src/libIR2Vec.cpp +++ b/src/libIR2Vec.cpp @@ -32,6 +32,8 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, IR2Vec::funcName = funcName; IR2Vec::DIM = dim; + std::cout << "Generate Encoding Function entered" << std::endl; + std::optional outStream; std::ostream *os = [&]() -> std::ostream * { if (outputFile.empty()) { @@ -46,6 +48,8 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, return std::addressof(outStream.value()); }(); + std::cout << "Outfile stream created" << std::endl; + if (mode == IR2Vec::IR2VecMode::FlowAware && !funcName.empty()) { IR2Vec_FA FA(M, vocabulary); FA.generateFlowAwareEncodingsForFunction(os, funcName); @@ -53,12 +57,19 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, funcVecMap = FA.getFuncVecMap(); bbVecMap = FA.getBBVecMap(); } else if (mode == IR2Vec::IR2VecMode::FlowAware) { + std::cout << "Creating FA Embedding" << std::endl; IR2Vec_FA FA(M, vocabulary); + std::cout << "Init - Vocab added" << std::endl; + FA.generateFlowAwareEncodings(os); + std::cout << "Embedding Generation Done" << std::endl; + instVecMap = FA.getInstVecMap(); funcVecMap = FA.getFuncVecMap(); bbVecMap = FA.getBBVecMap(); pgmVector = FA.getProgramVector(); + std::cout << "Vector maps assigned. Function Done" << std::endl; + } else if (mode == IR2Vec::IR2VecMode::Symbolic && !funcName.empty()) { IR2Vec_Symbolic SYM(M, vocabulary); SYM.generateSymbolicEncodingsForFunction(0, funcName); @@ -66,12 +77,19 @@ int IR2Vec::Embeddings::generateEncodings(llvm::Module &M, funcVecMap = SYM.getFuncVecMap(); bbVecMap = SYM.getBBVecMap(); } else if (mode == IR2Vec::IR2VecMode::Symbolic) { + std::cout << "Creating Sym Embedding" << std::endl; IR2Vec_Symbolic SYM(M, vocabulary); + + std::cout << "Init - Vocab added" << std::endl; SYM.generateSymbolicEncodings(os); + + std::cout << "Embedding Generation Done" << std::endl; + instVecMap = SYM.getInstVecMap(); funcVecMap = SYM.getFuncVecMap(); bbVecMap = SYM.getBBVecMap(); pgmVector = SYM.getProgramVector(); + std::cout << "Vector maps assigned. Function Done" << std::endl; } return 0;