Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ public static FeatureBasedCounts loadBackgroundCounter(File inputFile) throws IO
public static List<Instance> readCsv(String inpath) {
List<Instance> list = new ArrayList<Instance>();
try {
Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_16);
Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_8);
CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT
// change based on the number of columns in the file; my file had only a text column, which contains the articles
.withHeader("text")
Expand Down Expand Up @@ -619,17 +619,7 @@ public static void print_instance(Iterable<Instance> corpus) {

public static void main(String[] args) throws IOException, ClassNotFoundException {

// // Deserialised it to see if it contains the lang key
// ObjectInputStream ios = new ObjectInputStream(new FileInputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"));
// FeatureExtractionPipeline temp;
// try {
// while ((temp = (FeatureExtractionPipeline) ios.readObject()) != null) {
// System.out.println(temp);
// }
// } catch (EOFException e) {
// } finally {
// ios.close();
// }


List<String> topFeatures = Lists.newArrayList(
"visualisation",
Expand All @@ -644,59 +634,55 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
"algorithms"
);

// build the pipeline
FeatureExtractionPipeline arabic_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline.
// build the pipeline
FeatureExtractionPipeline russian_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline.
.add("tokeniser", ImmutableMap.of(
"type", "arabicstanford",
"filter_punctuation", true,
"normalise_urls", true
"type", "basic",
"filter_punctuation", true,
"normalise_urls", true,
"lower_case", true
)
)
.add("remove_stopwords", ImmutableMap.of(
"use", "true",
"lang", "ar"))
"lang", "ru"))
.add("filter_regex", "[\\-(())【\\[\\]】]")
.add("unigrams", true)
);

// Read the csv file that contains the article and save it in articles.ser
List<Instance> bl = savecorpus(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/source_background/ar_background_10.csv");
// to test that the articles were serialized correctly, load the generated file
Iterable<Instance> bl_test = load(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"));
// load one of the files previously generated for another language
Iterable<Instance> bl_en_test = load(new File("/Users/ay227/Desktop/CASM/git/wikisample-withzh/sample/zh-wiki-articles.ser"));
// Read the csv file that contains the article and save it in articles.ser
List<Instance> bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-articles-large.ser"), "/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru_backgroud.csv");
// to test that the articles were serialized correctly, load the generated file
Iterable<Instance> bl_test = load(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-articles-large.ser"));

System.out.println("Done loading articles");
// print the count of the serialized articles; for example, the output should be:
// done loading articles
// 36205 the size of the generated Arabic file (approximately)
// 15000 the size of english
System.out.println(count_size(bl_test));
System.out.println(count_size(bl_en_test));
// to print the serialized articles

// to print the serialized articles
print_instance(bl_test);
// print the count of the serialized articles
System.out.println(count_size(bl_test));

savepipeline(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"), arabic_pipeline);
savepipeline(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-pipeline-large.ser"), russian_pipeline);
System.out.println("Done saving pipeline.ser");



FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-count.ser"), 1, bl, arabic_pipeline, 3);
System.out.println("Done testing FeatureBasedCounts");
// FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-count-large.ser"), 1, bl, russian_pipeline, 3);
// System.out.println("Done testing FeatureBasedCounts");

IncrementalFeatureCounter cNew = new IncrementalFeatureCounter(0.1);
cNew.incrementCounts(bl, arabic_pipeline, 10);
cNew.incrementCounts(bl, russian_pipeline, 10);
cNew.pruneFeaturesWithCountLessThanN(3);
System.out.println("Done testing IncrementalFeatureCounter");

try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-inc-feat-counts.ser"))){
try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/generate_background/Russian_background/ru-wiki-inc-feat-counts-large.ser"))){
out.writeObject(cNew);
}
System.out.println("Done saving ar-wiki-inc-feat-counts.ser");
System.out.println("Done saving wiki-inc-feat-counts.ser");

}


}


public static void buildGermanBackground() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,19 @@ public class TokenFilterRelevanceStopwords extends TokenFilter {
"unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen",
"welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde",
"würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens"));

put("ru", Sets.newHashSet("c", "а", "алло", "без", "белый", "близко", "более", "больше", "большой", "будем", "будет", "будете", "будешь", "будто", "буду", "будут", "будь", "бы", "бывает", "бывь", "был", "была", "были", "было", "быть", "в", "важная", "важное", "важные", "важный", "вам", "вами", "вас", "ваш", "ваша", "ваше", "ваши", "вверх", "вдали", "вдруг", "ведь", "везде", "вернуться", "весь", "вечер", "взгляд", "взять", "вид",
"видел", "видеть", "вместе", "вне", "вниз", "внизу", "во", "вода", "война", "вокруг", "вон", "вообще", "вопрос", "восемнадцатый", "восемнадцать", "восемь", "восьмой", "вот", "впрочем", "времени", "время", "все", "все еще", "всегда", "всего", "всем", "всеми", "всему", "всех", "всею", "всю", "всюду", "вся", "всё", "второй", "вы", "выйти", "г", "где", "главный", "глаз", "говорил", "говорит", "говорить", "год", "года", "году", "голова",
"голос", "город", "да", "давать", "давно", "даже", "далекий", "далеко", "дальше", "даром", "дать", "два", "двадцатый", "двадцать", "две", "двенадцатый", "двенадцать", "дверь", "двух", "девятнадцатый", "девятнадцать", "девятый", "девять", "действительно", "дел", "делал", "делать", "делаю", "дело", "день", "деньги", "десятый", "десять", "для", "до", "довольно", "долго", "должен", "должно", "должный", "дом", "дорога", "друг", "другая",
"другие", "других", "друго", "другое", "другой", "думать", "душа", "е", "его", "ее", "ей", "ему", "если", "есть", "еще", "ещё", "ею", "её", "ж", "ждать", "же", "жена", "женщина", "жизнь", "жить", "за", "занят", "занята", "занято", "заняты", "затем", "зато", "зачем", "здесь", "земля", "знать", "значит", "значить", "и", "иди", "идти", "из", "или", "им", "имеет", "имел", "именно", "иметь", "ими", "имя", "иногда", "их", "к", "каждая", "каждое",
"каждые", "каждый", "кажется", "казаться", "как", "какая", "какой", "кем", "книга", "когда", "кого", "ком", "комната", "кому", "конец", "конечно", "которая", "которого", "которой", "которые", "который", "которых", "кроме", "кругом", "кто", "куда", "лежать", "лет", "ли", "лицо", "лишь", "лучше", "любить", "люди", "м", "маленький", "мало", "мать", "машина", "между", "меля", "менее", "меньше", "меня", "место", "миллионов", "мимо", "минута", "мир",
"мира", "мне", "много", "многочисленная", "многочисленное", "многочисленные", "многочисленный", "мной", "мною", "мог", "могу", "могут", "мож", "может", "может быть", "можно", "можхо", "мои", "мой", "мор", "москва", "мочь", "моя", "моё", "мы", "на", "наверху", "над", "надо", "назад", "наиболее", "найти", "наконец", "нам", "нами", "народ", "нас", "начала", "начать", "наш", "наша", "наше", "наши", "не", "него", "недавно", "недалеко", "нее", "ней",
"некоторый", "нельзя", "нем", "немного", "нему", "непрерывно", "нередко", "несколько", "нет", "нею", "неё", "ни", "нибудь", "ниже", "низко", "никакой", "никогда", "никто", "никуда", "ним", "ними", "них", "ничего", "ничто", "но", "новый", "нога", "ночь", "ну", "нужно", "нужный", "нх", "о", "об", "оба", "обычно", "один", "одиннадцатый", "одиннадцать", "однажды", "однако", "одного", "одной", "оказаться", "окно", "около", "он", "она", "они", "оно", "опять",
"особенно", "остаться", "от", "ответить", "отец", "откуда", "отовсюду", "отсюда", "очень", "первый", "перед", "писать", "плечо", "по", "под", "подойди", "подумать", "пожалуйста", "позже", "пойти", "пока", "пол", "получить", "помнить", "понимать", "понять", "пор", "пора", "после", "последний", "посмотреть", "посреди", "потом", "потому", "почему", "почти", "правда", "прекрасно", "при", "про", "просто", "против", "процентов", "путь", "пятнадцатый", "пятнадцать",
"пятый", "пять", "работа", "работать", "раз", "разве", "рано", "раньше", "ребенок", "решить", "россия", "рука", "русский", "ряд", "рядом", "с", "с кем", "сам", "сама", "сами", "самим", "самими", "самих", "само", "самого", "самой", "самом", "самому", "саму", "самый", "свет", "свое", "своего", "своей", "свои", "своих", "свой", "свою", "сделать", "сеаой", "себе", "себя", "сегодня", "седьмой", "сейчас", "семнадцатый", "семнадцать", "семь", "сидеть", "сила", "сих",
"сказал", "сказала", "сказать", "сколько", "слишком", "слово", "случай", "смотреть", "сначала", "снова", "со", "собой", "собою", "советский", "совсем", "спасибо", "спросить", "сразу", "стал", "старый", "стать", "стол", "сторона", "стоять", "страна", "суть", "считать", "т", "та", "так", "такая", "также", "таки", "такие", "такое", "такой", "там", "твои", "твой", "твоя", "твоё", "те", "тебе", "тебя", "тем", "теми", "теперь", "тех", "то", "тобой", "тобою", "товарищ",
"тогда", "того", "тоже", "только", "том", "тому", "тот", "тою", "третий", "три", "тринадцатый", "тринадцать", "ту", "туда", "тут", "ты", "тысяч", "у", "увидеть", "уж", "уже", "улица", "уметь", "утро", "хороший", "хорошо", "хотел бы", "хотеть", "хоть", "хотя", "хочешь", "час", "часто", "часть", "чаще", "чего", "человек", "чем", "чему", "через", "четвертый", "четыре", "четырнадцатый", "четырнадцать", "что", "чтоб", "чтобы", "чуть", "шестнадцатый", "шестнадцать", "шестой",
"шесть", "эта", "эти", "этим", "этими", "этих", "это", "этого", "этой", "этом", "этому", "этот", "эту", "я", "являюсь"));
}};

public static boolean supportedLanguages(String lang) {
Expand Down