diff --git a/SURF_Documentation.pdf b/SURF_Documentation.pdf new file mode 100644 index 0000000..46939b3 Binary files /dev/null and b/SURF_Documentation.pdf differ diff --git a/WebContent/filter-controls.jspf b/WebContent/filter-controls.jspf index 5407170..f080214 100644 --- a/WebContent/filter-controls.jspf +++ b/WebContent/filter-controls.jspf @@ -1,4 +1,5 @@ <%@ page import="in.edu.ashoka.surf.util.Util" %> +<%@ page import="in.edu.ashoka.surf.*" %> <% // if we already have filter config, initialize filterSpec, gvc, rvc to the corresponding fields in the config, // so we can init the dropdowns to the existing config diff --git a/WebContent/read-dataset.jsp b/WebContent/read-dataset.jsp index 65f926a..6e38cfc 100644 --- a/WebContent/read-dataset.jsp +++ b/WebContent/read-dataset.jsp @@ -1,6 +1,6 @@ <%@ page import="in.edu.ashoka.surf.*" %> -<%@ page import="edu.stanford.muse.util.Util" %> - +<%@ page import="in.edu.ashoka.surf.util.Util" %> +<%-- <%@ page import="edu.stanford.muse.util.Util" %>--%> diff --git a/WebContent/select-op.jsp b/WebContent/select-op.jsp index 258d6f5..c3e62e8 100644 --- a/WebContent/select-op.jsp +++ b/WebContent/select-op.jsp @@ -39,6 +39,8 @@ Edit distance 0 not included - + + + + +<%-- --%> + +
@@ -214,6 +227,19 @@ } else { $('.div-streak-alg-controls').hide(); } + +/* if (alg === 'cosinesimilarity') { + $('.div-cosine-similarity').show(); + } else { + $('.div-cosine-similarity').hide(); + } */ + + if (alg === 'reviewalgo') { + $('.div-review-algo').show(); + } else { + $('.div-review-algo').hide(); + } + } $('#algorithm').change(set_options_for_algorithm); diff --git a/src/edu/tsinghua/dbgroup/EditDistanceClusterer.java b/src/edu/tsinghua/dbgroup/EditDistanceClusterer.java index be03623..0985c80 100644 --- a/src/edu/tsinghua/dbgroup/EditDistanceClusterer.java +++ b/src/edu/tsinghua/dbgroup/EditDistanceClusterer.java @@ -10,6 +10,7 @@ import java.io.Serializable; import java.util.Comparator; import edu.tsinghua.dbgroup.*; + public class EditDistanceClusterer { private final EditDistanceJoiner mJoiner; static class SizeComparator implements Comparator> { diff --git a/src/edu/tsinghua/dbgroup/EditDistanceJoiner.java b/src/edu/tsinghua/dbgroup/EditDistanceJoiner.java index f69a6f7..5a7d016 100644 --- a/src/edu/tsinghua/dbgroup/EditDistanceJoiner.java +++ b/src/edu/tsinghua/dbgroup/EditDistanceJoiner.java @@ -4,6 +4,7 @@ import java.util.concurrent.*; import edu.tsinghua.dbgroup.*; + class EditDistanceJoiner { private List mStrings; private final TreeMap>>> mGlobalIndex; diff --git a/src/in/edu/ashoka/surf/CompatibleNameAlgorithm.java b/src/in/edu/ashoka/surf/CompatibleNameAlgorithm.java index b7f8a2a..393b1ad 100755 --- a/src/in/edu/ashoka/surf/CompatibleNameAlgorithm.java +++ b/src/in/edu/ashoka/surf/CompatibleNameAlgorithm.java @@ -2,6 +2,7 @@ import com.google.common.collect.*; import in.edu.ashoka.surf.util.Pair; +import in.edu.ashoka.surf.util.Timers; import in.edu.ashoka.surf.util.UnionFindSet; import in.edu.ashoka.surf.util.Util; import org.json.JSONArray; @@ -342,9 +343,15 @@ public List> run() throws FileNotFoundException { List filteredRows = filter.isEmpty() ? (List) new ArrayList<>(dataset.getRows()) : dataset.getRows().stream().filter(filter::passes).collect(toList()); + Timers.CompatibleNameTimer.reset(); + Timers.CompatibleNameTimer.start(); // now translate the row#s back to the actual rows classes = new ArrayList<>(); runRecursive (classes, filteredRows, minTokenOverlap, substringAllowed, initialMapping); + + Timers.CompatibleNameTimer.stop(); + + Timers.log.info ("Time for Compatible Name computation: " + Timers.CompatibleNameTimer.toString()); return classes; } diff --git a/src/in/edu/ashoka/surf/Config.java b/src/in/edu/ashoka/surf/Config.java index b2f085c..7fd6639 100755 --- a/src/in/edu/ashoka/surf/Config.java +++ b/src/in/edu/ashoka/surf/Config.java @@ -28,6 +28,7 @@ public class Config { public static String MERGE_FIELD = "Name"; public static final int groupsPerPage = 100; public static final int DEFAULT_EDIT_DISTANCE = 2; +// public static final int DEFAULT_COSINE_ACCURACY = 90; public static final int DEFAULT_MIN_TOKEN_OVERLAP = 2; public static final int DEFAULT_IGNORE_TOKEN_FREQUENCY = 200; public static final int DEFAULT_MIN_SPLITWEIGHT = 10; // a token in a field will be split only if it's constituent parts have appeared independently > 10 times. (However, there is an additional factor of 2x needed if the fields are only of length 3) diff --git a/src/in/edu/ashoka/surf/CosineFunc.java b/src/in/edu/ashoka/surf/CosineFunc.java new file mode 100644 index 0000000..0008f6f --- /dev/null +++ b/src/in/edu/ashoka/surf/CosineFunc.java @@ -0,0 +1,265 @@ +package in.edu.ashoka.surf; + +import static java.util.stream.Collectors.toList; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.lang3.text.WordUtils; + +import in.edu.ashoka.surf.Dataset; +import in.edu.ashoka.surf.Row; + +class Node { + String name1; + String name2; + double cosinesimilarity; + int index; + + public Node(String name1, String name2, double cosinesimilarity, int index) { + this.name1 = name1; + this.name2 = name2; + this.cosinesimilarity = cosinesimilarity; + this.index = index; + } + + public String toString() { + return index + " " + name1 + " " + name2 + " " + cosinesimilarity; + } +} + +class obj { + + HashMap hash; + Set char_set; + double length; + String word; + + public HashMap getHash() { + return hash; + } + + public void setHash(HashMap hash) { + this.hash = hash; + } + + public Set getChar_set() { + return char_set; + } + + public void setChar_set(Set char_set) { + this.char_set = char_set; + } + + public double getLength() { + return length; + } + + public void setLength(int length) { + this.length = length; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public obj(HashMap hash, Set char_set, double length, String word) { + this.hash = hash; + this.char_set = char_set; + this.length = length; + this.word = word; + } + +} + +public class CosineFunc { + + public static HashMap Count(String inputString) { + HashMap charCountMap = new HashMap(); + + char[] strArray = inputString.toCharArray(); + + for (char c : strArray) { + if (charCountMap.containsKey(c)) { + + charCountMap.put(c, charCountMap.get(c) + 1); + } else { + + charCountMap.put(c, 1); + } + } + + return charCountMap; + + } + + public static obj word2vec(String word) { + HashMap count_characters = Count(word); +// System.out.println(count_characters); + Set set_characters = count_characters.keySet(); +// System.out.println(set_characters); + + double length = 0; + int key = 0; + + for (Integer in : count_characters.values()) { + key += (in * in); + } + length = Math.sqrt(key); +// System.out.println(length); + + return new obj(count_characters, set_characters, length, word); + + } + + public static double cosine_similarity(obj vector1, obj vector2) { + Set common_characters = new HashSet(vector1.getChar_set()); // use the copy constructor + common_characters.retainAll(vector2.getChar_set()); +// System.out.println("Intersection = " + common_characters); + + int product_summation = 0; + for (Character ch : common_characters) { + product_summation += vector1.getHash().get(ch) * vector2.getHash().get(ch); + } +// System.out.println("product_summation = " + product_summation); + + double length = vector1.length * vector2.length; +// System.out.println("length = " + length); + + if (length == 0) { + return 0; + } else { + return product_summation / length; + } + + } + + public List> assign_similarity(Collection filteredRows, String fieldName,double val) { + +// HashMap> map = new HashMap<>(); + ArrayList names = new ArrayList<>(); + List> resultx = new ArrayList>(); + filteredRows.forEach(r -> names.add(r.get(fieldName))); +// filteredRows.forEach(r -> map.put(r.get(fieldName), new ArrayList<>())); +// ArrayList similar = new ArrayList<>(); + boolean visited[] = new boolean[names.size()]; + +// System.out.println("Map = " + map); + +// for (int i = 0; i < names.size(); i++) { +// System.out.println(i + " " + names.get(i)); +// } + ArrayList aa = new ArrayList<>(); + for (int i = 0; i < names.size(); i++) { + aa.add(word2vec(names.get(i))); + } + + for (int i = 0; i < names.size(); i++) { + String one = names.get(i); + int task = 0; + Set curr = null; + if (visited[i] == false) { + task = 1; + curr = new LinkedHashSet(); + visited[i] = true; + curr.add(one); + } +// obj v1 = word2vec(one); + for (int j = i + 1; j < names.size(); j++) { + String two = names.get(j); +// obj v2 = word2vec(two); + double cosine_val = cosine_similarity(aa.get(i),aa.get(j)); +// Node nn = new Node(one, two, cosine_similarity(word2vec(one), word2vec(two)), i); +// similar.add(nn); + + if (task == 1) { +// System.out.println("hello"); + if (cosine_val >= val && visited[j] == false) { +// System.out.println("adi"); + curr.add(two); + visited[j] = true; + } + } + } + if (task == 1) { + resultx.add(curr); + } + } +// int l = 0; + + return resultx; + +// System.out.println("gggggggggggggggggggggg"); +// for (int i = 0; i < resultx.size(); i++) { +// l += resultx.get(i).size(); +// System.out.println(resultx.get(i)); +// } + +// System.out.println(l); + +// for (String name : map.keySet()) { +// List list = map.get(name); +// list.add(name); +// map.put(name, list); +// } + +// for (int i = 0; i < similar.size(); i++) { +// Node node = similar.get(i); +// if (node.cosinesimilarity > 1.0) { +// List set1 = map.get(node.name1); +// set1.add(node.name2); +// map.put(node.name1, set1); +// +// List set2 = map.get(node.name2); +// set2.add(node.name1); +// map.put(node.name2, set2); +// } +// } +// System.out.println("Map = " + map); +// +// Collection> result = map.values(); +// +// System.out.println("result = " + result); +// System.out.println(result.size()); +// +// int le = 0; +// for(List aa : result) { +// System.out.println(aa); +// le += aa.size(); +// } +// System.out.println(le); + +// System.out.println(similar); + +// for (int i = 0; i < similar.size(); i++) { +// System.out.println(similar.get(i)); +// } + + } + +// public static void main(String[] args) throws IOException { +// // TODO Auto-generated method stub +//// String s1 = "adity a"; +//// String s2 = "aditya x"; +//// +//// System.out.println(cosine_similarity(word2vec(s1), word2vec(s2))); +// +// Dataset dataset = Dataset.getDataset(path); +// String fieldName = "Candidate"; +// Collection filteredRows = dataset.getRows().stream().collect(toList()); +// +// assign_similarity(filteredRows, fieldName); +// +// } + +} \ No newline at end of file diff --git a/src/in/edu/ashoka/surf/CosineSimilarityAlgo.java b/src/in/edu/ashoka/surf/CosineSimilarityAlgo.java new file mode 100644 index 0000000..461cc21 --- /dev/null +++ b/src/in/edu/ashoka/surf/CosineSimilarityAlgo.java @@ -0,0 +1,80 @@ +package in.edu.ashoka.surf; + +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; +import edu.tsinghua.dbgroup.EditDistanceClusterer; +import in.edu.ashoka.surf.util.Timers; +import java.util.*; +import java.util.stream.Collectors; +import static java.util.stream.Collectors.toList; +import java.io.IOException; + +public class CosineSimilarityAlgo extends MergeAlgorithm { + + private final int inputval; + private final String fieldName; // FieldName on which to Cosine Similarity + private final Filter filter; + + CosineSimilarityAlgo(Dataset dataset, String fieldName, int inputval, Filter filter) { + super (dataset); + this.filter = filter; + this.fieldName = fieldName; + this.inputval = inputval; + } + + @Override + public List> run() { + Collection filteredRows = filter.isEmpty() ? dataset.getRows() : dataset.getRows().stream().filter(filter::passes).collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); + +// filteredRows.forEach(r -> System.out.println(r.get(fieldName))); + double acc = inputval/100.0; + +// System.out.println("--------------------------------------------------------------------" + acc); + CosineFunc func = new CosineFunc(); + + List> clusters; + + Timers.cosineTimer.reset(); + Timers.cosineTimer.start(); + + clusters = func.assign_similarity(filteredRows,fieldName,acc); + + Timers.cosineTimer.stop(); + + System.out.println("--------------------------------------------------------The Time Taken is---------------------------------------------------------------"); + Timers.log.info ("Time for cosine similarity computation: " + Timers.cosineTimer.toString()); + + int key = 0; + classes = new ArrayList<>(); + for (Set cluster : clusters) { +// System.out.println(key++ + " " + cluster); + final Collection rowsForThisCluster = new ArrayList<>(); + // cluster just has strings, convert each string in the cluster to its rows, and + // add it to rowsForThisCluster + cluster.forEach(s -> { + rowsForThisCluster.addAll(fieldValueToRows.get(s)); + }); + classes.add(rowsForThisCluster); + } + +// System.out.println("--------------------------------------------------------------------"); + +// classes.forEach(x -> System.out.println(x.toString())); + + return classes; + + } + + /* debug method */ + void dumpClasses() { + for (Collection rows: classes) { + log.info (rows.iterator().next().get(fieldName)); + } + } + + public String toString() { return "The cosine similarity algorithm works fine with inputval" + inputval; } + +} \ No newline at end of file diff --git a/src/in/edu/ashoka/surf/Dataset.java b/src/in/edu/ashoka/surf/Dataset.java index c255533..b680c90 100755 --- a/src/in/edu/ashoka/surf/Dataset.java +++ b/src/in/edu/ashoka/surf/Dataset.java @@ -193,19 +193,31 @@ public void run() { checkFilesForFailure(filename); this.name = filename; + +// System.out.println("hellllllllllllllloooooooooooo"); Set allRows = new LinkedHashSet<>(); columnsToSave = new ArrayList<>(); int nRows = 0; + boolean isr = false; // Reader in = new FileReader("GE.csv"); // read the names from CSV Iterable records = CSVParser.parse(new File(filename), Charset.forName("UTF-8"), CSVFormat.EXCEL.withHeader()); for (CSVRecord record : records) { + nRows++; Map map = record.toMap(); + + if(map.containsKey("Rid") == false) { + map.put("Rid", "0"); + isr = true; + } + +// System.out.println(map); if (nRows == 1) { for (String col : map.keySet()) { +// System.out.println(col); columnsToSave.add(col); registerColumn(col); } @@ -216,6 +228,11 @@ public void run() { } this.rows = allRows; this.filename = filename; +// System.out.println(columnsToSave); + + if(isr == true) { + save(); + } } private void checkFilesForFailure(String filename) throws IOException{ diff --git a/src/in/edu/ashoka/surf/EditDistanceMergeAlgorithm.java b/src/in/edu/ashoka/surf/EditDistanceMergeAlgorithm.java index 09d670e..550c7b2 100644 --- a/src/in/edu/ashoka/surf/EditDistanceMergeAlgorithm.java +++ b/src/in/edu/ashoka/surf/EditDistanceMergeAlgorithm.java @@ -10,6 +10,8 @@ import static java.util.stream.Collectors.toList; +import java.io.IOException; + /** * Created by hangal on 8/12/17. * New simplified edit distance merge manager. @@ -35,9 +37,11 @@ public List> run() { Collection filteredRows = filter.isEmpty() ? dataset.getRows() : dataset.getRows().stream().filter(filter::passes).collect(toList()); - // create map of fieldValueToRows SetMultimap fieldValueToRows = HashMultimap.create(); filteredRows.forEach (r -> fieldValueToRows.put (r.get(fieldName), r)); + +// System.out.println("--------------------------------------------------------------------"); +// filteredRows.forEach(r -> System.out.println(r.get(fieldName))); // do the clustering based on ed (but only if ed > 0) Timers.editDistanceTimer.reset(); @@ -69,11 +73,13 @@ public List> run() { // compute the result of this algorithm classes = new ArrayList<>(); for (Set cluster : clusters) { +// System.out.println(cluster); final Collection rowsForThisCluster = new ArrayList<>(); // cluster just has strings, convert each string in the cluster to its rows, and add it to rowsForThisCluster cluster.forEach (s -> { rowsForThisCluster.addAll (fieldValueToRows.get(s)); }); classes.add (rowsForThisCluster); } + return classes; } @@ -85,4 +91,13 @@ void dumpClasses() { } public String toString() { return "Edit distance algorithm with maximum edit distance " + maxEditDistance; } + + +// public static void main(String args[]) throws IOException{ +// Dataset d = Dataset.getDataset("/Users/priyamgarrg21/Documents/Aditya/EX/TCPD_GE_Delhi_2020-6-18.csv"); +// Filter f = new Filter(null); +// EditDistanceMergeAlgorithm aa = new EditDistanceMergeAlgorithm(d, "Candidate", 1, f); +// aa.run(); +// } + } \ No newline at end of file diff --git a/src/in/edu/ashoka/surf/MergeManager.java b/src/in/edu/ashoka/surf/MergeManager.java index 3c96ecb..18286a7 100755 --- a/src/in/edu/ashoka/surf/MergeManager.java +++ b/src/in/edu/ashoka/surf/MergeManager.java @@ -111,6 +111,7 @@ public String description() { private final Multimap idToRows = LinkedHashMultimap.create(); private final SetMultimap rowToLabels = HashMultimap.create(); private int nextAvailableId = 0; + private int uniqueval = 0; private final List allCommands = new ArrayList<>(); // compile all the commands, so that they can be replayed some day, if needed @@ -170,6 +171,23 @@ public MergeManager(Dataset dataset, Map params) throws FileNotF algorithm = new EditDistanceMergeAlgorithm(d, "_st_" + Config.MERGE_FIELD, editDistance, filter); // run e.d. on the _st_ version of the field break; + + case "reviewalgo": + algorithm = new NewReviewAlgorithm(d, Config.MERGE_FIELD, filter); + break; + + case "cosinesimilarity": + int accuracy = 90; + try { + accuracy = Integer.parseInt(params.get("cosine-similarity")); + } catch (NumberFormatException e) { + Util.print_exception(e, log); + } +// algorithm = new EditDistanceMergeAlgorithm(d, "_st_" + Config.MERGE_FIELD, 5, filter); // run e.d. on the _st_ version of the field + System.out.println("---------------------------------------------------------" + accuracy + "-------------------------------------------------------------------------"); + algorithm = new CosineSimilarityAlgo(d, "_st_" + Config.MERGE_FIELD, accuracy, filter); + break; + case "allNames": algorithm = new MergeAlgorithm(dataset) { @Override @@ -296,8 +314,19 @@ void updateMergesBasedOnIds() { /* computes idToRows and also updates nextAvailableId */ private void computeIdToRows (Collection rows) { - for (Row r: rows) +// int i = 1; + for (Row r: rows) { idToRows.put (r.get(Config.ID_FIELD), r); +// System.out.println(r.get("Candidate") + "->" + i + "->" + r.get(Config.ID_FIELD)); +// i++; + } + int maxn = 0; + for(Row r: rows) { + if(Integer.parseInt(r.get("Rid")) >= maxn) { + maxn = Integer.parseInt(r.get("Rid")); + } + } + uniqueval = maxn + 1; int maxNumberUsed = 1; for (String id: idToRows.keySet()) { @@ -329,15 +358,20 @@ public void applyUpdatesAndSave(Command[] commands) throws IOException { firstId = id; continue; } +// System.out.println(id); +// System.out.println("aaaaaaaaaaaa"); // update all the rows for this id to firstId // also remember to update the idToRows map log.info("Merging id " + id + " into " + firstId); Collection rowsForThisId = idToRows.get(id); +// System.out.println(rowsForThisId); + if (rowsForThisId.size() == 0) log.warn ("While trying to merge into id " + firstId + ", not found any rows for id: " + id); for (Row row : rowsForThisId) { +// System.out.println(row); row.set(Config.ID_FIELD, firstId); // we wipe out the old id for this row idToRows.get(firstId).add (row); } @@ -373,18 +407,69 @@ public void applyUpdatesAndSave(Command[] commands) throws IOException { } // create unique id's for all rows for (String id : command.ids) { +// System.out.println(id); + emptyRow = new Row(new LinkedHashMap<>(), toBeReviewed.rows.size(), toBeReviewed); Collection rowsForThisId = idToRows.get(id); if (rowsForThisId == null) { log.warn ("rowsForThisID is null for id " + id); continue; } +// System.out.println(rowsForThisId); toBeReviewed.rows.addAll(rowsForThisId); - toBeReviewed.rows.add(emptyRow); // add empty row with no data +// toBeReviewed.rows.add(emptyRow); // add empty row with no data } - - // add 2 empty rows with no data + + emptyRow = new Row(new LinkedHashMap<>(), toBeReviewed.rows.size(), toBeReviewed); +// System.out.println("a"); + toBeReviewed.rows.add(emptyRow); +// System.out.println("b"); toBeReviewed.rows.add(emptyRow); +// System.out.println("c"); tbrNeedsToBeSaved = true; + + System.out.println("--------------Review Algorithm Stats-----------"); + + + int temp = 0; + for(String id: command.ids) { + Collection rowsForThisId = idToRows.get(id); +// if (rowsForThisId == null) { +// System.out.println("No Row for this ID"); +// continue; +// } + for(Row row: rowsForThisId) { + if(Integer.parseInt(row.get("Rid")) >= temp) { + temp = Integer.parseInt(row.get("Rid")); + } + } + } + + System.out.println("If existing rows are changed then temp!=0 otherwise it is 0:- "+ temp); + System.out.println("These rows will be given rid = "+ uniqueval); + if(temp != 0) { + for(String id: command.ids) { + Collection rowsForThisId = idToRows.get(id); + for(Row row: rowsForThisId) { + System.out.println("Candidate Name:- " + row.get("Candidate")); + row.set("Rid", Integer.toString(temp)); + System.out.println("Its Given Rid:- " + row.get("Rid")); + } + } + } + else { + for(String id: command.ids) { + Collection rowsForThisId = idToRows.get(id); + for(Row row: rowsForThisId) { + System.out.println("Candidate Name:- " + row.get("Candidate")); + row.set("Rid", Integer.toString(uniqueval)); + System.out.println("Its Given Rid:- " + row.get("Rid")); + } + } + uniqueval++; + } + + d.save(); + } else if ("add-label".equalsIgnoreCase(command.op)) { String label = command.label; for (String gid : command.ids) { diff --git a/src/in/edu/ashoka/surf/NewReviewAlgorithm.java b/src/in/edu/ashoka/surf/NewReviewAlgorithm.java new file mode 100644 index 0000000..d431663 --- /dev/null +++ b/src/in/edu/ashoka/surf/NewReviewAlgorithm.java @@ -0,0 +1,77 @@ +package in.edu.ashoka.surf; + +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; +import edu.tsinghua.dbgroup.EditDistanceClusterer; +import in.edu.ashoka.surf.util.Timers; +import java.util.*; +import java.util.stream.Collectors; +import static java.util.stream.Collectors.toList; +import java.io.IOException; + +public class NewReviewAlgorithm extends MergeAlgorithm { + + private final String fieldName; // FieldName on which to Cosine Similarity + private final Filter filter; + + NewReviewAlgorithm(Dataset dataset, String fieldName, Filter filter) { + super(dataset); + this.filter = filter; + this.fieldName = fieldName; + } + + @Override + public List> run() { + Collection filteredRows = filter.isEmpty() ? dataset.getRows() + : dataset.getRows().stream().filter(filter::passes).collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); +// filteredRows.forEach(r -> System.out.println(r.get(fieldName))); + + Timers.ReviewTimer.reset(); + Timers.ReviewTimer.start(); + + HashMap> map = new HashMap>(); + + for (Row row : filteredRows) { + int ridval = Integer.parseInt(row.get("Rid")); + if (ridval != 0) { + if (map.containsKey(ridval) == false) { + map.put(ridval, new ArrayList()); + map.get(ridval).add(row); + } else { + map.get(ridval).add(row); + } + } + } + + classes = new ArrayList<>(); + + for (Integer i : map.keySet()) { + final Collection rowsForThisCluster = map.get(i); + classes.add(rowsForThisCluster); + } + + Timers.ReviewTimer.stop(); + Timers.log.info ("TimeTaken by New Review Algo: " + Timers.ReviewTimer.toString()); + +// System.out.println("--------------------------------------------------------------------"); + +// classes.forEach(x -> System.out.println(x.toString())); + + return classes; + } + + /* debug method */ + void dumpClasses() { + for (Collection rows : classes) { + log.info(rows.iterator().next().get(fieldName)); + } + } + + public String toString() { + return "The new review algo works fine"; + } + +} \ No newline at end of file diff --git a/src/in/edu/ashoka/surf/ReviewAlgo.java b/src/in/edu/ashoka/surf/ReviewAlgo.java new file mode 100644 index 0000000..9c6642a --- /dev/null +++ b/src/in/edu/ashoka/surf/ReviewAlgo.java @@ -0,0 +1,88 @@ +package in.edu.ashoka.surf; + +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; +import edu.tsinghua.dbgroup.EditDistanceClusterer; +import in.edu.ashoka.surf.util.Timers; +import java.util.*; +import java.util.stream.Collectors; +import static java.util.stream.Collectors.toList; +import java.io.IOException; + +public class ReviewAlgo extends MergeAlgorithm { + + private final String fieldName; //This Review Algorithm adds data to TBRfile System + private final Filter filter; + + ReviewAlgo(Dataset dataset, String fieldName, Filter filter) { + super (dataset); + this.filter = filter; + this.fieldName = fieldName; + } + + @Override + public List> run() { + Collection filteredRows = filter.isEmpty() ? dataset.getRows() : dataset.getRows().stream().filter(filter::passes).collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); + +// System.out.println("------------------------------------------------------------------------------------"); +// System.out.println(filteredRows.size()); +// filteredRows.forEach(r -> System.out.println(r.alldata())); +// filteredRows.forEach(r -> System.out.println(r.get(fieldName))); +// System.out.println("------------------------------------------------------------------------------------"); + + List> clusters = new ArrayList>(); + + Timers.cosineTimer.reset(); + Timers.cosineTimer.start(); + + ArrayList namex = new ArrayList<>(); + filteredRows.forEach(r -> namex.add(r.get(fieldName))); + + int task = 1; + Set curr = null; + for (int i = 0; i < namex.size(); i++) { + if(task == 1) { + curr = new LinkedHashSet(); + } + if(namex.get(i).length() != 0) { + curr.add(namex.get(i)); + task = 0; + } + else { + task++; + if(i!=0 && task == 1) { + clusters.add(curr); + } + } + } + + classes = new ArrayList<>(); + for (Set cluster : clusters) { + final Collection rowsForThisCluster = new ArrayList<>(); + cluster.forEach(s -> { + rowsForThisCluster.addAll(fieldValueToRows.get(s)); + }); + classes.add(rowsForThisCluster); + } + + + Timers.cosineTimer.stop(); + Timers.log.info ("Time for Review Algo: " + Timers.cosineTimer.toString()); + + return classes; + + } + + /* debug method */ + void dumpClasses() { + for (Collection rows: classes) { + log.info (rows.iterator().next().get(fieldName)); + } + } + + public String toString() { return "The Review Algo Works Fine";} + +} diff --git a/src/in/edu/ashoka/surf/Row.java b/src/in/edu/ashoka/surf/Row.java index 50427aa..007337b 100755 --- a/src/in/edu/ashoka/surf/Row.java +++ b/src/in/edu/ashoka/surf/Row.java @@ -77,6 +77,14 @@ public void set(String col, String val) { public int nFields () { return this.fields.keySet().size(); } + + public String alldata() { + String ss = ""; + for(String str : fields.keySet()) { + ss = ss + (" " + str + ":" + fields.get(str)); + } + return ss; + } public Set getAllFieldNames () { return fields.keySet(); diff --git a/src/in/edu/ashoka/surf/Test2.java b/src/in/edu/ashoka/surf/Test2.java index 6a03305..982ac84 100755 --- a/src/in/edu/ashoka/surf/Test2.java +++ b/src/in/edu/ashoka/surf/Test2.java @@ -11,7 +11,7 @@ class Test2 { public static void main(String[] args) throws IOException { - Dataset d = Dataset.getDataset("/home/sudx/surf.java/surf/GE/candidates/csv/candidates_info.csv"); + Dataset d = Dataset.getDataset("/Users/priyamgarrg21/Documents/Aditya/EX/TCPD_GE_Delhi_2020-6-18.csv"); Collection rows = d.rows; //set ups what toString() of Row needs to print diff --git a/src/in/edu/ashoka/surf/newtest.java b/src/in/edu/ashoka/surf/newtest.java new file mode 100644 index 0000000..ce5dacc --- /dev/null +++ b/src/in/edu/ashoka/surf/newtest.java @@ -0,0 +1,85 @@ +package in.edu.ashoka.surf; + +import in.edu.ashoka.surf.*; +import edu.tsinghua.dbgroup.EditDistanceClusterer; +import in.edu.ashoka.surf.Dataset; +import in.edu.ashoka.surf.Row; +import in.edu.ashoka.surf.util.Timers; +import static java.util.stream.Collectors.toList; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; + +class Test1 { + + private static final String path = "/Users/priyamgarrg21/Documents/Aditya/Internship@Ashoka/TCPD_GE_Delhi_2020-6-18.csv"; + + public static void main(String args[]) throws IOException { + Dataset dataset = Dataset.getDataset(path); + Filter filter = new Filter(null); + String fieldName = "Candidate"; + int maxEditDistance = 1; + List> classes; + +// Set names = d.getRows().stream().map(r -> r.get("Candidate")).collect(Collectors.toSet()); +// +// EditDistanceClusterer edc = new EditDistanceClusterer(5); +// names.forEach(edc::populate); +// List> clusters = (List) edc.getClusters(); +// +// int i = 0; +// for (Set cluster : clusters) { +// System.out.println("Cluster " + i++ + " -------"); +// for (String s : cluster) +// System.out.println(s); +// } + + Collection filteredRows = filter.isEmpty() ? dataset.getRows() + : dataset.getRows().stream().filter(filter::passes).collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); + + // do the clustering based on ed (but only if ed > 0) + + List> clusters; + + if (maxEditDistance >= 1) { + final EditDistanceClusterer edc = new EditDistanceClusterer(maxEditDistance); + filteredRows.forEach(r -> edc.populate(r.get(fieldName))); + clusters = (List) edc.getClusters(); + } else { + // handle the case when edit distance is 0 by creating a list of single-element + // sets with all unique fieldVal's + clusters = new ArrayList<>(); + for (String fieldVal : fieldValueToRows.keySet()) { + // create a set with a single val + Set set = new LinkedHashSet(); + set.add(fieldVal); + clusters.add(set); + } + } + + // compute the result of this algorithm + classes = new ArrayList<>(); + for (Set cluster : clusters) { + System.out.println("----gjkkklkjhgfds------"); + System.out.println(cluster); + final Collection rowsForThisCluster = new ArrayList<>(); + // cluster just has strings, convert each string in the cluster to its rows, and + // add it to rowsForThisCluster + cluster.forEach(s -> { + rowsForThisCluster.addAll(fieldValueToRows.get(s)); + }); + classes.add(rowsForThisCluster); + } + + } +} + diff --git a/src/in/edu/ashoka/surf/test/T2.java b/src/in/edu/ashoka/surf/test/T2.java new file mode 100644 index 0000000..9bd2724 --- /dev/null +++ b/src/in/edu/ashoka/surf/test/T2.java @@ -0,0 +1,86 @@ +package in.edu.ashoka.surf.test; + +import in.edu.ashoka.surf.*; +import edu.tsinghua.dbgroup.EditDistanceClusterer; +import in.edu.ashoka.surf.Dataset; +import in.edu.ashoka.surf.Row; +import in.edu.ashoka.surf.util.Timers; +import static java.util.stream.Collectors.toList; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; + +class T2 { + + private static final String path = "/Users/priyamgarrg21/Documents/Aditya/Internship@Ashoka/TCPD_GE_Delhi_2020-6-18.csv"; + + public static void main(String args[]) throws IOException { + Dataset dataset = Dataset.getDataset(path); + String fieldName = "Candidate"; + int maxEditDistance = 5; + List> classes; + +// Set names = dataset.getRows().stream().map(r -> r.get("Candidate")).collect(Collectors.toSet()); +// +// EditDistanceClusterer edcx = new EditDistanceClusterer(5); +// names.forEach(edcx::populate); +// List> clustersx = (List) edcx.getClusters(); +// +// int ix = 0; +// for (Set cluster : clustersx) { +// System.out.println("Cluster " + ix++ + " -------"); +// for (String s : cluster) +// System.out.println(s); +// } +// System.out.println("--------------------------------------------------------------------"); + + Collection filteredRows = dataset.getRows().stream().collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); + + // do the clustering based on ed (but only if ed > 0) + + filteredRows.forEach(r -> System.out.println(r.get(fieldName))); + + List> clusters; + + if (maxEditDistance >= 1) { + final EditDistanceClusterer edc = new EditDistanceClusterer(maxEditDistance); + filteredRows.forEach(r -> edc.populate(r.get(fieldName))); + clusters = (List) edc.getClusters(); + } else { + // handle the case when edit distance is 0 by creating a list of single-element + // sets with all unique fieldVal's + clusters = new ArrayList<>(); + for (String fieldVal : fieldValueToRows.keySet()) { + // create a set with a single val + Set set = new LinkedHashSet(); + set.add(fieldVal); + clusters.add(set); + } + } + + // compute the result of this algorithm + classes = new ArrayList<>(); + for (Set cluster : clusters) { + System.out.println(cluster); + final Collection rowsForThisCluster = new ArrayList<>(); + // cluster just has strings, convert each string in the cluster to its rows, and + // add it to rowsForThisCluster + cluster.forEach(s -> { + rowsForThisCluster.addAll(fieldValueToRows.get(s)); + }); + classes.add(rowsForThisCluster); + } + + classes.forEach(x -> System.out.println(x.toString())); + + } +} diff --git a/src/in/edu/ashoka/surf/test/T3.java b/src/in/edu/ashoka/surf/test/T3.java new file mode 100644 index 0000000..0bce1c1 --- /dev/null +++ b/src/in/edu/ashoka/surf/test/T3.java @@ -0,0 +1,75 @@ +package in.edu.ashoka.surf.test; + +import static java.util.stream.Collectors.toList; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.HashMultimap; +import com.google.common.collect.SetMultimap; + +import in.edu.ashoka.surf.Dataset; +import in.edu.ashoka.surf.Row; + +class T3 { + + static final String path = "/Users/priyamgarrg21/Documents/Aditya/Internship@Ashoka/TCPD_GE_Delhi_2020-6-18.csv"; + + public static void main(String args[]) throws IOException { + Dataset dataset = Dataset.getDataset(path); + String fieldName = "Candidate"; + int maxEditDistance = 5; + List> classes; + + cos_sample mainfunc = new cos_sample(); + +// System.out.println(mainfunc.cosine_similarity(mainfunc.word2vec("adity a"), mainfunc.word2vec("aditya xs"))); + + Collection filteredRows = dataset.getRows().stream().collect(toList()); + + SetMultimap fieldValueToRows = HashMultimap.create(); + filteredRows.forEach(r -> fieldValueToRows.put(r.get(fieldName), r)); + +// Iterator iterator = filteredRows.iterator(); + +// while(iterator.hasNext()) { +// System.out.println(iterator.next().get(fieldName)); +// } + +// filteredRows.forEach(r -> System.out.println(r.get(fieldName))); + + System.out.println("--------------------------------------------------------------------"); + + List> clusters = mainfunc.assign_similarity(filteredRows, fieldName); + +// for (Row fil : filteredRows) { +//// System.out.println(fil.get(fieldName)); +// Set setx = new LinkedHashSet(); +// setx.add(fil.get(fieldName)); +// clusters.add(setx); +// } + + int key = 0; + classes = new ArrayList<>(); + for (Set cluster : clusters) { + System.out.println(key++ + " " + cluster); + final Collection rowsForThisCluster = new ArrayList<>(); + cluster.forEach(s -> { + rowsForThisCluster.addAll(fieldValueToRows.get(s)); + }); + classes.add(rowsForThisCluster); + } + + System.out.println("--------------------------------------------------------------------"); + + classes.forEach(x -> System.out.println(x.toString())); + + } +} diff --git a/src/in/edu/ashoka/surf/test/Test1.java b/src/in/edu/ashoka/surf/test/Test1.java index 10e0823..78bba66 100644 --- a/src/in/edu/ashoka/surf/test/Test1.java +++ b/src/in/edu/ashoka/surf/test/Test1.java @@ -13,12 +13,12 @@ */ class Test1 { - private static final String path = "/Users/hangal/Gujarat_worksheet.csv"; + private static final String path = "/Users/priyamgarrg21/Documents/Aditya/Internship@Ashoka/TCPD_GE_Delhi_2020-6-18.csv"; public static void main(String args[]) throws IOException { Dataset d = Dataset.getDataset(path); - Set names = d.getRows().stream().map (r -> r.get("Name")).collect (Collectors.toSet()); + Set names = d.getRows().stream().map (r -> r.get("Candidate")).collect (Collectors.toSet()); - EditDistanceClusterer edc = new EditDistanceClusterer(1); + EditDistanceClusterer edc = new EditDistanceClusterer(5); names.forEach (edc::populate); List> clusters = (List) edc.getClusters(); diff --git a/src/in/edu/ashoka/surf/test/cos_sample.java b/src/in/edu/ashoka/surf/test/cos_sample.java new file mode 100644 index 0000000..cee32a5 --- /dev/null +++ b/src/in/edu/ashoka/surf/test/cos_sample.java @@ -0,0 +1,258 @@ +package in.edu.ashoka.surf.test; + +import static java.util.stream.Collectors.toList; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import in.edu.ashoka.surf.Dataset; +import in.edu.ashoka.surf.Row; + +class Node { + String name1; + String name2; + double cosinesimilarity; + int index; + + public Node(String name1, String name2, double cosinesimilarity, int index) { + this.name1 = name1; + this.name2 = name2; + this.cosinesimilarity = cosinesimilarity; + this.index = index; + } + + public String toString() { + return index + " " + name1 + " " + name2 + " " + cosinesimilarity; + } +} + +class obj { + + HashMap hash; + Set char_set; + double length; + String word; + + public HashMap getHash() { + return hash; + } + + public void setHash(HashMap hash) { + this.hash = hash; + } + + public Set getChar_set() { + return char_set; + } + + public void setChar_set(Set char_set) { + this.char_set = char_set; + } + + public double getLength() { + return length; + } + + public void setLength(int length) { + this.length = length; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public obj(HashMap hash, Set char_set, double length, String word) { + this.hash = hash; + this.char_set = char_set; + this.length = length; + this.word = word; + } + +} + +public class cos_sample { + + static final String path = "/Users/priyamgarrg21/Documents/Aditya/Internship@Ashoka/TCPD_GE_Delhi_2020-6-18.csv"; + + public static HashMap Count(String inputString) { + HashMap charCountMap = new HashMap(); + + char[] strArray = inputString.toCharArray(); + + for (char c : strArray) { + if (charCountMap.containsKey(c)) { + + charCountMap.put(c, charCountMap.get(c) + 1); + } else { + + charCountMap.put(c, 1); + } + } + + return charCountMap; + + } + + public static obj word2vec(String word) { + HashMap count_characters = Count(word); +// System.out.println(count_characters); + Set set_characters = count_characters.keySet(); +// System.out.println(set_characters); + + double length = 0; + int key = 0; + + for (Integer in : count_characters.values()) { + key += (in * in); + } + length = Math.sqrt(key); +// System.out.println(length); + + return new obj(count_characters, set_characters, length, word); + + } + + public static double cosine_similarity(obj vector1, obj vector2) { + Set common_characters = new HashSet(vector1.getChar_set()); // use the copy constructor + common_characters.retainAll(vector2.getChar_set()); +// System.out.println("Intersection = " + common_characters); + + int product_summation = 0; + for (Character ch : common_characters) { + product_summation += vector1.getHash().get(ch) * vector2.getHash().get(ch); + } +// System.out.println("product_summation = " + product_summation); + + double length = vector1.length * vector2.length; +// System.out.println("length = " + length); + + if (length == 0) { + return 0; + } else { + return product_summation / length; + } + + } + + public static List> assign_similarity(Collection filteredRows, String fieldName) { + + HashMap> map = new HashMap<>(); + ArrayList names = new ArrayList<>(); + List> resultx = new ArrayList>(); + filteredRows.forEach(r -> names.add(r.get(fieldName))); + filteredRows.forEach(r -> map.put(r.get(fieldName), new ArrayList<>())); + ArrayList similar = new ArrayList<>(); + boolean visited[] = new boolean[names.size()]; + +// System.out.println("Map = " + map); + +// for (int i = 0; i < names.size(); i++) { +// System.out.println(i + " " + names.get(i)); +// } + + for (int i = 0; i < names.size(); i++) { + String one = names.get(i); + int task = 0; + Set curr = null; + if (visited[i] == false) { + task = 1; + curr = new LinkedHashSet(); + visited[i] = true; + curr.add(one); + } + for (int j = i + 1; j < names.size(); j++) { + String two = names.get(j); + Node nn = new Node(one, two, cosine_similarity(word2vec(one), word2vec(two)), i); + similar.add(nn); + + if (task == 1) { +// System.out.println("hello"); + if (nn.cosinesimilarity >= 0.7 && visited[j] == false) { +// System.out.println("adi"); + curr.add(two); + visited[j] = true; + } + } + } + if (task == 1) { + resultx.add(curr); + } + } + int l = 0; + + return resultx; + +// System.out.println("gggggggggggggggggggggg"); +// for (int i = 0; i < resultx.size(); i++) { +// l += resultx.get(i).size(); +// System.out.println(resultx.get(i)); +// } + +// System.out.println(l); + +// for (String name : map.keySet()) { +// List list = map.get(name); +// list.add(name); +// map.put(name, list); +// } + +// for (int i = 0; i < similar.size(); i++) { +// Node node = similar.get(i); +// if (node.cosinesimilarity > 1.0) { +// List set1 = map.get(node.name1); +// set1.add(node.name2); +// map.put(node.name1, set1); +// +// List set2 = map.get(node.name2); +// set2.add(node.name1); +// map.put(node.name2, set2); +// } +// } +// System.out.println("Map = " + map); +// +// Collection> result = map.values(); +// +// System.out.println("result = " + result); +// System.out.println(result.size()); +// +// int le = 0; +// for(List aa : result) { +// System.out.println(aa); +// le += aa.size(); +// } +// System.out.println(le); + +// System.out.println(similar); + +// for (int i = 0; i < similar.size(); i++) { +// System.out.println(similar.get(i)); +// } + + } + + public static void main(String[] args) throws IOException { + // TODO Auto-generated method stub +// String s1 = "adity a"; +// String s2 = "aditya x"; +// +// System.out.println(cosine_similarity(word2vec(s1), word2vec(s2))); + + Dataset dataset = Dataset.getDataset(path); + String fieldName = "Candidate"; + Collection filteredRows = dataset.getRows().stream().collect(toList()); + + assign_similarity(filteredRows, fieldName); + + } + +} diff --git a/src/in/edu/ashoka/surf/util/Timers.java b/src/in/edu/ashoka/surf/util/Timers.java index ae0ab92..7c3b143 100644 --- a/src/in/edu/ashoka/surf/util/Timers.java +++ b/src/in/edu/ashoka/surf/util/Timers.java @@ -14,6 +14,9 @@ public class Timers { public static final StopWatch tokenizationTimer = new StopWatch(); public static final StopWatch editDistanceTimer = new StopWatch(); public static final StopWatch unionFindTimer = new StopWatch(); + public static final StopWatch cosineTimer = new StopWatch(); + public static final StopWatch CompatibleNameTimer = new StopWatch(); + public static final StopWatch ReviewTimer = new StopWatch(); public static void print() { log.info ("Canonicalization: " + canonTimer); @@ -21,4 +24,6 @@ public static void print() { log.info ("Edit distance computation: " + editDistanceTimer); log.info ("Union Find: " + unionFindTimer); } + + }