diff --git a/include/Analysis/Clones/CloneFinder.h b/include/Analysis/Clones/CloneFinder.h
new file mode 100644
index 0000000..7e399c0
--- /dev/null
+++ b/include/Analysis/Clones/CloneFinder.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "Analysis/Clones/HashDeepWalker.h"
+
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace OPS {
+
+namespace Clones {
+
+// NOTE(review): the template arguments in this file were missing from the
+// patch as received and are reconstructed from usage -- confirm them.
+
+/// Groups structurally similar subtrees ("clones") of a translation unit,
+/// based on the per-node hashes and sizes collected by HashDeepWalker.
+class CloneFinder {
+    size_t MassThreshold;       // passed to HashDeepWalker::getBuckets()
+    double SimilarityThreshold; // a pair is a clone when similarity > this
+
+public:
+    /// One group of subtrees that were found similar to each other.
+    struct Clone {
+        int size = 0; // subTreeSize of the first subtree added to the group
+        set<shared_ptr<HashDeepWalker::SubTreeInfo>> refs;
+        Clone() {}
+        Clone(int s) :size(s), refs() {}
+    };
+private:
+
+    // Groups found by the latest getClones() call, keyed by subtree hash.
+    map<size_t, Clone> clones;
+
+    // Number of matching nodes when t1 and t2 are compared position by position.
+    int basicTreeCompare(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2);
+    // Same count, but the children of t1 and t2 are paired one-to-one first.
+    int isSeqSimilar(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2);
+    // True when the similarity of t1 and t2 exceeds SimilarityThreshold.
+    bool isSimilar(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2);
+    void cloneGeneralizing();
+    void eraseByHash(shared_ptr<HashDeepWalker::SubTreeInfo> n);
+    void eraseChildClones(shared_ptr<HashDeepWalker::SubTreeInfo> root);
+    void eraseSubClones();
+    void addClonePair(shared_ptr<HashDeepWalker::SubTreeInfo> s1, shared_ptr<HashDeepWalker::SubTreeInfo> s2);
+
+public:
+    CloneFinder(size_t mt, double st) : MassThreshold(mt), SimilarityThreshold(st) {}
+
+    /// Hashes `unit`, compares all subtrees that share a hash bucket and
+    /// returns the resulting clone groups. With removeSubClones, subtrees
+    /// nested inside another reported clone are dropped from the result.
+    vector<Clone> getClones(TranslationUnit& unit, bool removeSubClones=false);
+};
+
+}
+}
diff --git a/include/Analysis/Clones/HashDeepWalker.h b/include/Analysis/Clones/HashDeepWalker.h
new file mode 100644
index 0000000..7c2535b
--- /dev/null
+++ b/include/Analysis/Clones/HashDeepWalker.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include "Reprise/Reprise.h"
+#include "Frontend/Frontend.h"
+
+#include <cstddef>
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <typeinfo>
+#include <vector>
+
+// NOTE(review): using-directives at header scope leak into every file that
+// includes this header. They are kept because CloneFinder.h and both .cpp
+// files rely on the unqualified names.
+using namespace std;
+using namespace OPS;
+using namespace OPS::Reprise;
+
+// NOTE(review): the template arguments in this file were missing from the
+// patch as received and are reconstructed from usage -- confirm them.
+
+/// DeepWalker that records a SubTreeInfo (hash and subtree size) for every
+/// node it visits.
+class HashDeepWalker : public Service::DeepWalker
+{
+public:
+    /// Per-node record built by processNode().
+    struct SubTreeInfo {
+        RepriseBase* node;
+        size_t hashCode = 0;    // hash accumulated over the node and its subtree
+        size_t subTreeSize=0;   // number of nodes below this one (node itself excluded)
+        // NOTE(review): parent and children own each other through shared_ptr,
+        // so these records form reference cycles and are never freed.
+        // A weak_ptr parent would break the cycle -- confirm lifetime needs.
+        shared_ptr<SubTreeInfo> parent;
+        vector<shared_ptr<SubTreeInfo>> children;
+        SubTreeInfo(RepriseBase* n):node(n) {}
+        SubTreeInfo(RepriseBase* n, int h, int s) : node(n), hashCode(h), subTreeSize(s),children() {}
+    };
+
+protected:
+    size_t h = 0;   // running hash of the subtree currently being walked
+    int size = 0;   // running node count of the subtree currently being walked
+
+    shared_ptr<SubTreeInfo> currentNode;
+
+    vector<shared_ptr<SubTreeInfo>> nodes; // every record, in the order visits complete
+
+    /// Runs f (which adds the node's own hash terms and walks its children)
+    /// with fresh counters, stores the result as the record of n, then folds
+    /// that result into the counters of the enclosing node.
+    template <typename Func>
+    void processNode(Func f, RepriseBase* n)
+    {
+        size_t th = h;
+        int ts = size;
+        h = 0;
+        size = 0;
+
+        shared_ptr<SubTreeInfo> tp = currentNode;
+        shared_ptr<SubTreeInfo> sti = make_shared<SubTreeInfo>(n);
+        currentNode = sti;
+
+        f();
+
+        sti->hashCode = h;
+        sti->subTreeSize = size;
+
+        nodes.push_back(sti);
+
+        if (tp != nullptr)
+        {
+            tp->children.push_back(sti);
+        }
+
+        // The enclosing node gains this subtree plus the node itself, and
+        // this subtree's hash shifted left by one bit.
+        size = ts + size;
+        size++;
+        size_t h2 = h << 1;
+        h = th + h2;
+        sti->parent = tp;
+        currentNode = tp;
+    }
+
+public:
+    void visit(ProgramUnit&);
+    void visit(TranslationUnit&);
+
+    //void visit(ProgramFragment&);
+
+    void visit(Declarations&);
+    void visit(VariableDeclaration&);
+    void visit(TypeDeclaration&);
+    void visit(SubroutineDeclaration&);
+
+    void visit(BlockStatement&);
+    void visit(ForStatement&);
+    void visit(WhileStatement&);
+    void visit(IfStatement&);
+    void visit(PlainCaseLabel&);
+    void visit(PlainSwitchStatement&);
+    void visit(GotoStatement&);
+    void visit(ReturnStatement&);
+    void visit(ExpressionStatement&);
+    void visit(ASMStatement&);
+    void visit(EmptyStatement&);
+
+    void visit(BasicType&);
+    void visit(PtrType&);
+    void visit(TypedefType&);
+    void visit(ArrayType&);
+    void visit(StructMemberDescriptor& structMember);
+    void visit(StructType&);
+    void visit(EnumMemberDescriptor&);
+    void visit(EnumType&);
+    void visit(ParameterDescriptor&);
+    void visit(SubroutineType&);
+    void visit(DeclaredType&);
+    void visit(VectorType&);
+
+    void visit(BasicLiteralExpression&);
+    void visit(StrictLiteralExpression&);
+    void visit(CompoundLiteralExpression&);
+    void visit(ReferenceExpression&);
+    void visit(SubroutineReferenceExpression&);
+    void visit(StructAccessExpression&);
+    void visit(EnumAccessExpression&);
+    void visit(TypeCastExpression&);
+    void visit(BasicCallExpression&);
+    void visit(SubroutineCallExpression&);
+    void visit(EmptyExpression&);
+
+    /// Groups the recorded nodes by hash, keeping only nodes whose
+    /// subTreeSize is greater than MassThreshold.
+    map<size_t, vector<shared_ptr<SubTreeInfo>>> getBuckets(size_t MassThreshold);
+
+    int getSize() { return size; }
+    size_t getHash() { return h; }
+};
diff --git a/source/Analysis/CMakeLists.txt b/source/Analysis/CMakeLists.txt
index 2572e15..18d420c 100644
--- a/source/Analysis/CMakeLists.txt
+++ b/source/Analysis/CMakeLists.txt
@@ -4,6 +4,8 @@ ops_filesin(INTERFACE "../../include/Analysis/"
     CalculationGraph/CalculationGraphBase.h
     CalculationGraph/CalculationGraphBuilder.h
     CalculationGraph/ExpressionGraph.h
+    Clones/HashDeepWalker.h
+    Clones/CloneFinder.h
     CallGraph.h
     ComplexOccurrenceAnalysis/GrouppedOccurrences.h
     ConsistCheck/Conditions.h
@@ -86,6 +88,8 @@ ops_project(
     CalculationGraph/CalculationGraph.cpp
     CalculationGraph/CalculationGraphBase.cpp
     CalculationGraph/CalculationGraphBuilder.cpp
+    Clones/HashDeepWalker.cpp
+    Clones/CloneFinder.cpp
     CallGraph/CallGraph.cpp
     ComplexOccurrenceAnalysis/GrouppedOccurrences.cpp
     ConsistCheck/Conditions.cpp
diff --git a/source/Analysis/Clones/CloneFinder.cpp b/source/Analysis/Clones/CloneFinder.cpp
new file mode 100644
index 0000000..23db151
--- /dev/null
+++ b/source/Analysis/Clones/CloneFinder.cpp
@@ -0,0 +1,232 @@
+#include "Analysis/Clones/HashDeepWalker.h"
+#include "Analysis/Clones/CloneFinder.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
+namespace OPS {
+
+namespace Clones {
+
+// NOTE(review): the template arguments in this file were missing from the
+// patch as received and are reconstructed from usage -- confirm them, in
+// particular is_a<BlockStatement>() below.
+
+// Counts matching nodes of t1 and t2: the roots match when they have the same
+// dynamic type, children are compared pairwise by position. Two block
+// statements are compared through isSeqSimilar() instead.
+int CloneFinder::basicTreeCompare(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2)
+{
+    int shared = 0;
+
+    if (t1->node->is_a<BlockStatement>() && t2->node->is_a<BlockStatement>())
+    {
+        return isSeqSimilar(t1,t2);
+    }
+    else if (typeid(*t1->node) == typeid(*t2->node))
+    {
+        shared++;
+    }
+
+    // children.size() is unsigned, so count with size_t.
+    const size_t minChildren = min(t1->children.size(), t2->children.size());
+    for (size_t i = 0; i < minChildren; i++)
+        shared += this->basicTreeCompare(t1->children[i], t2->children[i]);
+
+    return shared;
+}
+
+// Counts matching nodes between the children of t1 and t2. Every child is
+// paired with at most one child of the other tree: first children with equal
+// hashes, then the leftover children in order.
+int CloneFinder::isSeqSimilar(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2)
+{
+    const auto& left = t1->children;
+    const auto& right = t2->children;
+    vector<char> leftUsed(left.size(), 0);
+    vector<char> rightUsed(right.size(), 0);
+
+    int sharedNodes = 0;
+
+    for (size_t i = 0; i < left.size(); i++)
+    {
+        for (size_t j = 0; j < right.size(); j++)
+        {
+            // A child of t2 that is already paired must not be counted twice.
+            if (rightUsed[j] || left[i]->hashCode != right[j]->hashCode)
+            {
+                continue;
+            }
+
+            sharedNodes += basicTreeCompare(left[i], right[j]);
+            leftUsed[i] = 1;
+            rightUsed[j] = 1;
+            break;
+        }
+    }
+
+    size_t next = 0;
+    for (size_t i = 0; i < left.size(); i++)
+    {
+        if (leftUsed[i])
+        {
+            continue;
+        }
+
+        while (next < right.size() && rightUsed[next])
+        {
+            next++;
+        }
+        if (next == right.size())
+        {
+            break;
+        }
+
+        sharedNodes += basicTreeCompare(left[i], right[next]);
+        // Consume the partner so the next leftover child gets another one.
+        next++;
+    }
+
+    return sharedNodes;
+}
+
+// Similarity is 2*S / (2*S + (size1 - S) + (size2 - S)) with S the number of
+// shared nodes; the pair is similar when that exceeds SimilarityThreshold.
+bool CloneFinder::isSimilar(shared_ptr<HashDeepWalker::SubTreeInfo> t1, shared_ptr<HashDeepWalker::SubTreeInfo> t2)
+{
+    double sharedNodes = static_cast<double>(basicTreeCompare(t1, t2));
+    double similarity = (2 * sharedNodes) / (2 * sharedNodes + (t1->subTreeSize - sharedNodes) + (t2->subTreeSize - sharedNodes));
+    return similarity > SimilarityThreshold;
+}
+
+// For every group, handles the pairs of members whose parents are similar.
+// NOTE(review): groups built by getClones() only hold members with the
+// group's own hash, so re-inserting a pair under i->hashCode and erasing it
+// afterwards simply drops it; registering the parents instead may have been
+// intended -- confirm. This function is not called by getClones().
+void CloneFinder::cloneGeneralizing()
+{
+    for (auto& clone : clones)
+    {
+        auto& refs = clone.second.refs;
+
+        // Collect the pairs first: erasing from refs inside the loops that
+        // iterate over it would invalidate their iterators.
+        vector<pair<shared_ptr<HashDeepWalker::SubTreeInfo>, shared_ptr<HashDeepWalker::SubTreeInfo>>> pending;
+        for (const auto& i : refs)
+        {
+            for (const auto& j : refs)
+            {
+                if (i != j && i->parent && j->parent && isSimilar(i->parent, j->parent))
+                {
+                    pending.push_back(make_pair(i, j));
+                }
+            }
+        }
+
+        for (const auto& match : pending)
+        {
+            addClonePair(match.first, match.second);
+            refs.erase(match.first);
+            refs.erase(match.second);
+        }
+    }
+}
+
+// Removes n from its group (a group left with one member is emptied) and
+// then does the same for all descendants of n.
+void CloneFinder::eraseByHash(shared_ptr<HashDeepWalker::SubTreeInfo> n)
+{
+    auto found = clones.find(n->hashCode);
+    if (found != clones.end())
+    {
+        auto& refs = found->second.refs;
+        refs.erase(n);
+        if (refs.size() == 1)
+            refs.clear();
+    }
+    eraseChildClones(n);
+}
+
+// Removes every descendant of root from the clone groups.
+void CloneFinder::eraseChildClones(shared_ptr<HashDeepWalker::SubTreeInfo> root)
+{
+    for (const auto& child : root->children)
+        eraseByHash(child);
+}
+
+// Removes from the groups every subtree nested inside another group member.
+void CloneFinder::eraseSubClones()
+{
+    // Work on a snapshot: eraseByHash() mutates the refs sets, which must not
+    // happen while one of them is being iterated.
+    vector<shared_ptr<HashDeepWalker::SubTreeInfo>> roots;
+    for (const auto& clone : clones)
+    {
+        roots.insert(roots.end(), clone.second.refs.begin(), clone.second.refs.end());
+    }
+
+    for (const auto& root : roots)
+    {
+        eraseChildClones(root);
+    }
+}
+
+// Adds s1 and s2 to the group keyed by the hash of s1, creating it if needed.
+void CloneFinder::addClonePair(shared_ptr<HashDeepWalker::SubTreeInfo> s1, shared_ptr<HashDeepWalker::SubTreeInfo> s2)
+{
+    auto found = clones.find(s1->hashCode);
+    if (found == clones.end())
+    {
+        found = clones.insert(make_pair(s1->hashCode, Clone(static_cast<int>(s1->subTreeSize)))).first;
+    }
+    found->second.refs.insert(s1);
+    found->second.refs.insert(s2);
+}
+
+// Hashes unit, compares the subtrees within each hash bucket pairwise and
+// returns the non-empty clone groups.
+vector<CloneFinder::Clone> CloneFinder::getClones(TranslationUnit& unit, bool removeSubClones)
+{
+    // Start from scratch so groups of an earlier call are not reported again.
+    clones.clear();
+
+    HashDeepWalker hdw;
+    hdw.visit(unit);
+    auto buckets = hdw.getBuckets(MassThreshold);
+
+    for (auto& bucket : buckets)
+    {
+        const auto& candidates = bucket.second;
+        for (size_t i = 0; i < candidates.size(); i++)
+        {
+            for (size_t j = i + 1; j < candidates.size(); j++)
+            {
+                if (isSimilar(candidates[i], candidates[j]))
+                {
+                    addClonePair(candidates[i], candidates[j]);
+                }
+            }
+        }
+    }
+
+    if (removeSubClones)
+        eraseSubClones();
+
+    vector<Clone> cloneLst;
+
+    for (const auto& c : clones)
+    {
+        // Groups emptied by eraseSubClones() are no longer clones.
+        if (!c.second.refs.empty())
+            cloneLst.push_back(c.second);
+    }
+
+    return cloneLst;
+}
+
+}
+}
diff --git a/source/Analysis/Clones/HashDeepWalker.cpp b/source/Analysis/Clones/HashDeepWalker.cpp
new file mode 100644
index 0000000..7025917
--- /dev/null
+++ b/source/Analysis/Clones/HashDeepWalker.cpp
@@ -0,0 +1,382 @@
+#include "Analysis/Clones/HashDeepWalker.h"
+
+#include <functional>
+#include <string>
+#include <typeinfo>
+
+// NOTE(review): the template arguments of hash<> were missing from the patch
+// as received; hash<int> and hash<string> are reconstructed from the kind of
+// argument -- confirm them.
+//
+// Every visit() wraps the default DeepWalker traversal in processNode(); most
+// of them first add node-specific terms to the running hash h.
+
+void HashDeepWalker::visit(ProgramUnit& pu)
+{
+    processNode([&]() {
+        DeepWalker::visit(pu);
+    }, &pu);
+}
+
+void HashDeepWalker::visit(TranslationUnit& tu)
+{
+    processNode([&]() {
+        DeepWalker::visit(tu);
+    }, &tu);
+}
+
+//void visit(ProgramFragment& pf)
+//{
+//    //processNode([&]() {
+//    //    cout << string(" ") << "It is subroutinedecl" << endl;
+//    //    DeepWalker::visit(pf);
+//    //    }, &pf);
+//}
+
+void HashDeepWalker::visit(Declarations& dec)
+{
+    processNode([&]() {
+        h += hash<int>{}(dec.getChildCount()) << 1;
+        DeepWalker::visit(dec);
+    }, &dec);
+}
+
+void HashDeepWalker::visit(VariableDeclaration& vd)
+{
+    processNode([&]() {
+        h += hash<int>{}(vd.getKind()) << 1;
+        // TODO: vd.getName() and vd.getType() are not hashed yet
+        // (a hasher for TypeBase is still to be implemented).
+        DeepWalker::visit(vd);
+    }, &vd);
+}
+
+void HashDeepWalker::visit(TypeDeclaration& td)
+{
+    processNode([&]() {
+        h += hash<int>{}(td.getKind()) << 1;
+        DeepWalker::visit(td);
+    }, &td);
+}
+
+void HashDeepWalker::visit(SubroutineDeclaration& sd)
+{
+    processNode([&]() {
+        // TODO: the return type (needs a hasher for TypeBase) and
+        // sd.getName() are not hashed yet.
+        DeepWalker::visit(sd);
+    }, &sd);
+}
+
+//Statements
+void HashDeepWalker::visit(BlockStatement& bs)
+{
+    processNode([&]() {
+        h += hash<int>{}(bs.getChildCount()) << 1;
+        DeepWalker::visit(bs);
+    }, &bs);
+}
+
+void HashDeepWalker::visit(ForStatement& fs)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(fs).name()) << 1;
+        DeepWalker::visit(fs);
+    }, &fs);
+}
+
+void HashDeepWalker::visit(WhileStatement& ws)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(ws).name()) << 1;
+        DeepWalker::visit(ws);
+    }, &ws);
+}
+
+void HashDeepWalker::visit(IfStatement& ifs)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(ifs).name()) << 1;
+        DeepWalker::visit(ifs);
+    }, &ifs);
+}
+
+void HashDeepWalker::visit(PlainCaseLabel& pcl)
+{
+    processNode([&]() {
+        h += hash<int>{}(pcl.getValue()) << 1;
+        DeepWalker::visit(pcl);
+    }, &pcl);
+}
+
+void HashDeepWalker::visit(PlainSwitchStatement& pss)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(pss).name()) << 1;
+        DeepWalker::visit(pss);
+    }, &pss);
+}
+
+void HashDeepWalker::visit(GotoStatement& gs)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(gs).name()) << 1;
+        DeepWalker::visit(gs);
+    }, &gs);
+}
+
+void HashDeepWalker::visit(ReturnStatement& rs)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(rs).name()) << 1;
+        DeepWalker::visit(rs);
+    }, &rs);
+}
+
+void HashDeepWalker::visit(ExpressionStatement& es)
+{
+    processNode([&]() {
+        h += hash<int>{}(es.getChildCount()) << 1;
+        DeepWalker::visit(es);
+    }, &es);
+}
+
+void HashDeepWalker::visit(ASMStatement& as)
+{
+    processNode([&]() {
+        h += hash<string>{}(as.getASMString()) << 1;
+        DeepWalker::visit(as);
+    }, &as);
+}
+
+void HashDeepWalker::visit(EmptyStatement& es)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(es).name()) << 1;
+        DeepWalker::visit(es);
+    }, &es);
+}
+
+//Types
+void HashDeepWalker::visit(BasicType& bt)
+{
+    processNode([&]() {
+        h += hash<int>{}(bt.getKind()) << 1;
+        h += hash<int>{}(bt.getSizeOf()) << 1;
+        DeepWalker::visit(bt);
+    }, &bt);
+}
+
+void HashDeepWalker::visit(PtrType& pt)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(pt).name()) << 1;
+        DeepWalker::visit(pt);
+    }, &pt);
+}
+
+void HashDeepWalker::visit(TypedefType& tdt)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(tdt).name()) << 1;
+        DeepWalker::visit(tdt);
+    }, &tdt);
+}
+
+void HashDeepWalker::visit(ArrayType& at)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(at).name()) << 1;
+        h += hash<int>{}(at.getElementCount()) << 1;
+        DeepWalker::visit(at);
+    }, &at);
+}
+
+void HashDeepWalker::visit(StructMemberDescriptor& structMember)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(structMember).name()) << 1;
+        h += hash<int>{}(structMember.getBitsLimit()) << 1;
+        // structMember.getName() is currently not hashed.
+        DeepWalker::visit(structMember);
+    }, &structMember);
+}
+
+void HashDeepWalker::visit(StructType& st)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(st).name()) << 1;
+        h += hash<int>{}(st.getMemberCount()) << 1;
+        DeepWalker::visit(st);
+    }, &st);
+}
+
+void HashDeepWalker::visit(EnumMemberDescriptor& emdt)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(emdt).name()) << 1;
+        h += hash<int>{}(emdt.getValue()) << 1;
+        DeepWalker::visit(emdt);
+    }, &emdt);
+}
+
+void HashDeepWalker::visit(EnumType& et)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(et).name()) << 1;
+        h += hash<int>{}(et.getMemberCount()) << 1;
+        DeepWalker::visit(et);
+    }, &et);
+}
+
+void HashDeepWalker::visit(ParameterDescriptor& pd)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(pd).name()) << 1;
+        h += hash<int>{}(pd.getTransitKind()) << 1;
+        DeepWalker::visit(pd);
+    }, &pd);
+}
+
+void HashDeepWalker::visit(SubroutineType& st)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(st).name()) << 1;
+        h += hash<int>{}(st.getCallingKind()) << 1;
+        h += hash<int>{}(st.getParameterCount()) << 1;
+        DeepWalker::visit(st);
+    }, &st);
+}
+
+void HashDeepWalker::visit(DeclaredType& dt)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(dt).name()) << 1;
+        DeepWalker::visit(dt);
+    }, &dt);
+}
+
+void HashDeepWalker::visit(VectorType& vt)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(vt).name()) << 1;
+        h += hash<int>{}(vt.getElementCount()) << 1;
+        DeepWalker::visit(vt);
+    }, &vt);
+}
+
+//Expressions
+void HashDeepWalker::visit(BasicLiteralExpression& ble)
+{
+    processNode([&]() {
+        h += hash<int>{}(ble.getLiteralType()) << 1;
+        h += hash<int>{}(ble.getChildCount()) << 1;
+        DeepWalker::visit(ble);
+    }, &ble);
+}
+
+void HashDeepWalker::visit(StrictLiteralExpression& sle)
+{
+    processNode([&]() {
+        h += hash<int>{}(sle.getLiteralType()) << 1;
+        h += hash<int>{}(sle.getChildCount()) << 1;
+        DeepWalker::visit(sle);
+    }, &sle);
+}
+
+void HashDeepWalker::visit(CompoundLiteralExpression& cle)
+{
+    processNode([&]() {
+        h += hash<int>{}(cle.getChildCount()) << 1;
+        DeepWalker::visit(cle);
+    }, &cle);
+}
+
+void HashDeepWalker::visit(ReferenceExpression& re)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(re).name()) << 1;
+        h += hash<int>{}(re.getChildCount()) << 1;
+        DeepWalker::visit(re);
+    }, &re);
+}
+
+void HashDeepWalker::visit(SubroutineReferenceExpression& sre)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(sre).name()) << 1;
+        h += hash<int>{}(sre.getChildCount()) << 1;
+        DeepWalker::visit(sre);
+    }, &sre);
+}
+
+void HashDeepWalker::visit(StructAccessExpression& sae)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(sae).name()) << 1;
+        h += hash<int>{}(sae.getChildCount()) << 1;
+        DeepWalker::visit(sae);
+    }, &sae);
+}
+
+void HashDeepWalker::visit(EnumAccessExpression& eae)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(eae).name()) << 1;
+        h += hash<int>{}(eae.getChildCount()) << 1;
+        DeepWalker::visit(eae);
+    }, &eae);
+}
+
+void HashDeepWalker::visit(TypeCastExpression& tce)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(tce).name()) << 1;
+        h += hash<int>{}(tce.getChildCount()) << 1;
+        DeepWalker::visit(tce);
+    }, &tce);
+}
+
+void HashDeepWalker::visit(BasicCallExpression& bce)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(bce).name()) << 1;
+        h += hash<int>{}(bce.getKind()) << 1;
+        h += hash<int>{}(bce.getChildCount()) << 1;
+        DeepWalker::visit(bce);
+    }, &bce);
+}
+
+void HashDeepWalker::visit(SubroutineCallExpression& sce)
+{
+    processNode([&]() {
+        h += hash<string>{}(typeid(sce).name()) << 1;
+        h += hash<int>{}(sce.getArgumentCount()) << 1;
+        DeepWalker::visit(sce);
+    }, &sce);
+}
+
+void HashDeepWalker::visit(EmptyExpression& ee)
+{
+    processNode([&]() {
+        DeepWalker::visit(ee);
+    }, &ee);
+}
+
+// Groups the recorded nodes by hash; only nodes whose subTreeSize is greater
+// than MassThreshold are kept.
+map<size_t, vector<shared_ptr<HashDeepWalker::SubTreeInfo>>> HashDeepWalker::getBuckets(size_t MassThreshold)
+{
+    map<size_t, vector<shared_ptr<SubTreeInfo>>> buckets;
+
+    for (const auto& info : nodes)
+    {
+        if (info->subTreeSize > MassThreshold)
+        {
+            // operator[] creates the bucket on first use.
+            buckets[info->hashCode].push_back(info);
+        }
+    }
+
+    return buckets;
+}
diff --git a/source/Analysis/Clones/testmods/CMakeLists.txt b/source/Analysis/Clones/testmods/CMakeLists.txt
new file mode 100644
index 0000000..45ff977
--- /dev/null
+++ b/source/Analysis/Clones/testmods/CMakeLists.txt
@@ -0,0 +1,14 @@
+project(Clones_testmod)
+
+ops_project(
+    Clones_testmod APP
+    SOURCES
+        main.cpp
+    LIBRARIES
+        Reprise
+        Transforms
+        Frontend
+        Analysis
+        Shared
+        Backends
+    )
diff --git a/source/Analysis/Clones/testmods/input/Program.c b/source/Analysis/Clones/testmods/input/Program.c
new file mode 100644
index 0000000..4befaef
--- /dev/null
+++ b/source/Analysis/Clones/testmods/input/Program.c
@@ -0,0 +1,13 @@
+int main()
+{
+    double A[10],b;
+    int i;
+    for (i = 0; i < 10; i++)
+    {
+        if (i>5)
+            A[i] = 10 - A[i-1];
+        else
+            A[i] = 10 + A[i-1];
+    }
+    return 0;
+}
diff --git 
a/source/Analysis/Clones/testmods/main.cpp b/source/Analysis/Clones/testmods/main.cpp
new file mode 100644
index 0000000..fa9fa34
--- /dev/null
+++ b/source/Analysis/Clones/testmods/main.cpp
@@ -0,0 +1,48 @@
+// Test driver: parses input/Program.c and reports the clone groups found by
+// OPS::Clones::CloneFinder.
+
+#include "Analysis/Clones/CloneFinder.h"
+#include "Reprise/Reprise.h"
+#include "Frontend/Frontend.h"
+
+#include <cstddef>
+#include <iostream>
+#include <vector>
+
+using namespace OPS;
+using namespace OPS::Reprise;
+using namespace std;
+
+int main()
+{
+    // Demo settings: the subtree size a candidate must exceed, and the
+    // similarity a pair must exceed to be reported.
+    const size_t massThreshold = 2;
+    const double similarityThreshold = 0.8;
+
+    OPS::Frontend::Frontend frontend;
+    const OPS::Reprise::CompileResult& result = frontend.compileSingleFile("input//Program.c");
+    if (result.errorCount() > 0)
+    {
+        std::cout << result.errorText();
+        std::cout.flush();
+        // Nothing sensible to analyse when the input did not compile.
+        return 1;
+    }
+
+    TranslationUnit& unit = frontend.getProgramUnit().getUnit(0);
+
+    // The pairwise comparison and the grouping are private to CloneFinder;
+    // getClones() is its public entry point.
+    Clones::CloneFinder finder(massThreshold, similarityThreshold);
+    const vector<Clones::CloneFinder::Clone> clones = finder.getClones(unit);
+
+    for (const auto& clone : clones)
+    {
+        cout << "Seems like clone found: " << clone.refs.size()
+             << " subtrees of size " << clone.size << endl;
+    }
+    cout << clones.size() << " clone group(s) found" << endl;
+
+    return 0;
+}