From 351150167b5c7c6228b1b04baaae46c25eeecf38 Mon Sep 17 00:00:00 2001 From: AhmedYounes94 Date: Sat, 14 Aug 2021 00:16:19 +0100 Subject: [PATCH 1/4] resolve conflict --- pom.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 568e9a4..b826d47 100644 --- a/pom.xml +++ b/pom.xml @@ -9,12 +9,12 @@ jar - - 1.0.6 - uk.ac.susx.tag - tag-dist - ../tag-dist - + + + + + + @@ -80,7 +80,7 @@ uk.ac.susx.tag dependencyparser - 1.17.1 + 1.18.0 From d96219349b6adde3b672ee924622ffd47ea1d664 Mon Sep 17 00:00:00 2001 From: AhmedYounes94 Date: Fri, 12 Nov 2021 04:01:04 +0000 Subject: [PATCH 2/4] add russian stopwords --- .../filtering/TokenFilterRelevanceStopwords.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java b/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java index f72655c..7f3aef7 100644 --- a/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java +++ b/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java @@ -188,6 +188,19 @@ public class TokenFilterRelevanceStopwords extends TokenFilter { "unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen", "welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde", "würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens")); + + put("ru", Sets.newHashSet("c", "а", "алло", "без", "белый", "близко", "более", "больше", "большой", "будем", "будет", "будете", "будешь", "будто", "буду", "будут", "будь", "бы", "бывает", "бывь", "был", "была", "были", "было", "быть", "в", "важная", "важное", "важные", "важный", "вам", "вами", "вас", "ваш", "ваша", "ваше", "ваши", "вверх", "вдали", "вдруг", "ведь", "везде", "вернуться", "весь", "вечер", "взгляд", "взять", "вид", + "видел", "видеть", "вместе", "вне", "вниз", "внизу", "во", "вода", "война", "вокруг", "вон", "вообще", "вопрос", "восемнадцатый", "восемнадцать", "восемь", "восьмой", "вот", "впрочем", "времени", "время", "все", "все еще", "всегда", "всего", "всем", "всеми", "всему", "всех", "всею", "всю", "всюду", "вся", "всё", "второй", "вы", "выйти", "г", "где", "главный", "глаз", "говорил", "говорит", "говорить", "год", "года", "году", "голова", + "голос", "город", "да", "давать", "давно", "даже", "далекий", "далеко", "дальше", "даром", "дать", "два", "двадцатый", "двадцать", "две", "двенадцатый", "двенадцать", "дверь", "двух", "девятнадцатый", "девятнадцать", "девятый", "девять", "действительно", "дел", "делал", "делать", "делаю", "дело", "день", "деньги", "десятый", "десять", "для", "до", "довольно", "долго", "должен", "должно", "должный", "дом", "дорога", "друг", "другая", + "другие", "других", "друго", "другое", "другой", "думать", "душа", "е", "его", "ее", "ей", "ему", "если", "есть", "еще", "ещё", "ею", "её", "ж", "ждать", "же", "жена", "женщина", "жизнь", "жить", "за", "занят", "занята", "занято", "заняты", "затем", "зато", "зачем", "здесь", "земля", "знать", "значит", "значить", "и", "иди", "идти", "из", "или", "им", "имеет", "имел", "именно", "иметь", "ими", "имя", "иногда", "их", "к", "каждая", "каждое", + "каждые", "каждый", "кажется", "казаться", "как", "какая", "какой", "кем", "книга", "когда", "кого", "ком", "комната", "кому", "конец", "конечно", "которая", "которого", "которой", "которые", "который", "которых", "кроме", "кругом", "кто", "куда", "лежать", "лет", "ли", "лицо", "лишь", "лучше", "любить", "люди", "м", "маленький", "мало", "мать", "машина", "между", "меля", "менее", "меньше", "меня", "место", "миллионов", "мимо", "минута", "мир", + "мира", "мне", "много", "многочисленная", "многочисленное", "многочисленные", "многочисленный", "мной", "мною", "мог", "могу", "могут", "мож", "может", "может быть", "можно", "можхо", "мои", "мой", "мор", "москва", "мочь", "моя", "моё", "мы", "на", "наверху", "над", "надо", "назад", "наиболее", "найти", "наконец", "нам", "нами", "народ", "нас", "начала", "начать", "наш", "наша", "наше", "наши", "не", "него", "недавно", "недалеко", "нее", "ней", + "некоторый", "нельзя", "нем", "немного", "нему", "непрерывно", "нередко", "несколько", "нет", "нею", "неё", "ни", "нибудь", "ниже", "низко", "никакой", "никогда", "никто", "никуда", "ним", "ними", "них", "ничего", "ничто", "но", "новый", "нога", "ночь", "ну", "нужно", "нужный", "нх", "о", "об", "оба", "обычно", "один", "одиннадцатый", "одиннадцать", "однажды", "однако", "одного", "одной", "оказаться", "окно", "около", "он", "она", "они", "оно", "опять", + "особенно", "остаться", "от", "ответить", "отец", "откуда", "отовсюду", "отсюда", "очень", "первый", "перед", "писать", "плечо", "по", "под", "подойди", "подумать", "пожалуйста", "позже", "пойти", "пока", "пол", "получить", "помнить", "понимать", "понять", "пор", "пора", "после", "последний", "посмотреть", "посреди", "потом", "потому", "почему", "почти", "правда", "прекрасно", "при", "про", "просто", "против", "процентов", "путь", "пятнадцатый", "пятнадцать", + "пятый", "пять", "работа", "работать", "раз", "разве", "рано", "раньше", "ребенок", "решить", "россия", "рука", "русский", "ряд", "рядом", "с", "с кем", "сам", "сама", "сами", "самим", "самими", "самих", "само", "самого", "самой", "самом", "самому", "саму", "самый", "свет", "свое", "своего", "своей", "свои", "своих", "свой", "свою", "сделать", "сеаой", "себе", "себя", "сегодня", "седьмой", "сейчас", "семнадцатый", "семнадцать", "семь", "сидеть", "сила", "сих", + "сказал", "сказала", "сказать", "сколько", "слишком", "слово", "случай", "смотреть", "сначала", "снова", "со", "собой", "собою", "советский", "совсем", "спасибо", "спросить", "сразу", "стал", "старый", "стать", "стол", "сторона", "стоять", "страна", "суть", "считать", "т", "та", "так", "такая", "также", "таки", "такие", "такое", "такой", "там", "твои", "твой", "твоя", "твоё", "те", "тебе", "тебя", "тем", "теми", "теперь", "тех", "то", "тобой", "тобою", "товарищ", + "тогда", "того", "тоже", "только", "том", "тому", "тот", "тою", "третий", "три", "тринадцатый", "тринадцать", "ту", "туда", "тут", "ты", "тысяч", "у", "увидеть", "уж", "уже", "улица", "уметь", "утро", "хороший", "хорошо", "хотел бы", "хотеть", "хоть", "хотя", "хочешь", "час", "часто", "часть", "чаще", "чего", "человек", "чем", "чему", "через", "четвертый", "четыре", "четырнадцатый", "четырнадцать", "что", "чтоб", "чтобы", "чуть", "шестнадцатый", "шестнадцать", "шестой", + "шесть", "эта", "эти", "этим", "этими", "этих", "это", "этого", "этой", "этом", "этому", "этот", "эту", "я", "являюсь")); }}; public static boolean supportedLanguages(String lang) { From 4defb687b69d151ab73756e84cc084cb856bf0ae Mon Sep 17 00:00:00 2001 From: AhmedYounes94 Date: Fri, 12 Nov 2021 07:17:51 +0000 Subject: [PATCH 3/4] serialize russian background --- .../ClusterFeatureAnalysis.java | 60 +++++++------------ 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java index 237b1f9..4aeb5ba 100644 --- a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java +++ b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java @@ -545,7 +545,7 @@ public static FeatureBasedCounts loadBackgroundCounter(File inputFile) throws IO public static List readCsv(String inpath) { List list = new ArrayList(); try { - Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_16); + Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_8); CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT // change based on the the number of columns in the file, my file had only text column which contains the articles .withHeader("text") @@ -619,17 +619,7 @@ public static void print_instance(Iterable corpus) { public static void main(String[] args) throws IOException, ClassNotFoundException { -// // Deserialised it to see if it contains the lang key -// ObjectInputStream ios = new ObjectInputStream(new FileInputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser")); -// FeatureExtractionPipeline temp; -// try { -// while ((temp = (FeatureExtractionPipeline) ios.readObject()) != null) { -// System.out.println(temp); -// } -// } catch (EOFException e) { -// } finally { -// ios.close(); -// } + List topFeatures = Lists.newArrayList( "visualisation", @@ -644,59 +634,55 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio "algorithms" ); -// build the pipeline - FeatureExtractionPipeline arabic_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline. + // build the pipeline + FeatureExtractionPipeline russian_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline. .add("tokeniser", ImmutableMap.of( - "type", "arabicstanford", - "filter_punctuation", true, - "normalise_urls", true + "type", "basic", + "filter_punctuation", true, + "normalise_urls", true, + "lower_case", true ) ) .add("remove_stopwords", ImmutableMap.of( "use", "true", - "lang", "ar")) + "lang", "ru")) .add("filter_regex", "[\\-(())【\\[\\]】]") .add("unigrams", true) ); -// Read the csv file that contains the article and save it in articles.ser - List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/source_background/ar_background_10.csv"); -// to test the correctness of serializing articles load the generated file - Iterable bl_test = load(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser")); -// load one of the files previously generated by other language - Iterable bl_en_test = load(new File("/Users/ay227/Desktop/CASM/git/wikisample-withzh/sample/zh-wiki-articles.ser")); + // Read the csv file that contains the article and save it in articles.ser + List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/M52/Russian_background/russian_background.csv"); + // to test the correctness of serializing articles load the generated file + Iterable bl_test = load(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-articles.ser")); System.out.println("Done loading articles"); -// to print the count of the serialized articles for example for the output should be -// done loading articles -// 36205 the size of the generated Arabic file (approximately) -// 15000 the size of english - System.out.println(count_size(bl_test)); - System.out.println(count_size(bl_en_test)); -// to print the serialized articles + + // to print the serialized articles print_instance(bl_test); + // to print the count of the serialized articles for example for the output should be + System.out.println(count_size(bl_test)); - savepipeline(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"), arabic_pipeline); + savepipeline(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-pipeline.ser"), russian_pipeline); System.out.println("Done saving pipeline.ser"); - FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-count.ser"), 1, bl, arabic_pipeline, 3); + FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-count.ser"), 1, bl, russian_pipeline, 3); System.out.println("Done testing FeatureBasedCounts"); IncrementalFeatureCounter cNew = new IncrementalFeatureCounter(0.1); - cNew.incrementCounts(bl, arabic_pipeline, 10); + cNew.incrementCounts(bl, russian_pipeline, 10); cNew.pruneFeaturesWithCountLessThanN(3); System.out.println("Done testing IncrementalFeatureCounter"); - try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-inc-feat-counts.ser"))){ + try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-inc-feat-counts.ser"))){ out.writeObject(cNew); } - System.out.println("Done saving ar-wiki-inc-feat-counts.ser"); + System.out.println("Done saving wiki-inc-feat-counts.ser"); + } - } public static void buildGermanBackground() { From 2918bdb8fc19b39ff1cae2d55e0cd2a77268422c Mon Sep 17 00:00:00 2001 From: AhmedYounes94 Date: Sun, 14 Nov 2021 00:21:41 +0000 Subject: [PATCH 4/4] use much larger background articles 400000 --- .../clusteranalysis/ClusterFeatureAnalysis.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java index 4aeb5ba..50d95a5 100644 --- a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java +++ b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java @@ -651,9 +651,9 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio ); // Read the csv file that contains the article and save it in articles.ser - List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/M52/Russian_background/russian_background.csv"); + List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-articles-large.ser"), "/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru_backgroud.csv"); // to test the correctness of serializing articles load the generated file - Iterable bl_test = load(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-articles.ser")); + Iterable bl_test = load(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-articles-large.ser")); System.out.println("Done loading articles"); @@ -662,20 +662,20 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio // to print the count of the serialized articles for example for the output should be System.out.println(count_size(bl_test)); - savepipeline(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-pipeline.ser"), russian_pipeline); + savepipeline(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-pipeline-large.ser"), russian_pipeline); System.out.println("Done saving pipeline.ser"); - FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-count.ser"), 1, bl, russian_pipeline, 3); - System.out.println("Done testing FeatureBasedCounts"); +// FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-count-large.ser"), 1, bl, russian_pipeline, 3); +// System.out.println("Done testing FeatureBasedCounts"); IncrementalFeatureCounter cNew = new IncrementalFeatureCounter(0.1); cNew.incrementCounts(bl, russian_pipeline, 10); cNew.pruneFeaturesWithCountLessThanN(3); System.out.println("Done testing IncrementalFeatureCounter"); - try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/Russian_background/ru-wiki-inc-feat-counts.ser"))){ + try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-inc-feat-counts-large.ser"))){ out.writeObject(cNew); } System.out.println("Done saving wiki-inc-feat-counts.ser");