Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -545,15 +545,15 @@ public static FeatureBasedCounts loadBackgroundCounter(File inputFile) throws IO
public static List<Instance> readCsv(String inpath) {
List<Instance> list = new ArrayList<Instance>();
try {
Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_16);
Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_8);
CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT
// Change the header to match the columns in your file; my file had only a "text" column, which contains the articles.
.withHeader("text")
.withIgnoreHeaderCase()
.withTrim());

for (CSVRecord csvRecord: csvParser) {
String text = csvRecord.get("text").replaceAll("[^\u0000-\u200f]", "");
String text = csvRecord.get("text");
// Added by Qiwiei, I guess to detect errors in the file such as missing values. I kept it, but change the expected size to the number of columns in your file (only one column in my case).
if (csvRecord.size()!=1) {
System.out.println("!!!!!!! corrupted instance !!!!!");
Expand Down Expand Up @@ -619,17 +619,6 @@ public static void print_instance(Iterable<Instance> corpus) {

public static void main(String[] args) throws IOException, ClassNotFoundException {

// // Deserialised it to see if it contains the lang key
// ObjectInputStream ios = new ObjectInputStream(new FileInputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"));
// FeatureExtractionPipeline temp;
// try {
// while ((temp = (FeatureExtractionPipeline) ios.readObject()) != null) {
// System.out.println(temp);
// }
// } catch (EOFException e) {
// } finally {
// ios.close();
// }

List<String> topFeatures = Lists.newArrayList(
"visualisation",
Expand All @@ -645,56 +634,49 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio
);

// build the pipeline
FeatureExtractionPipeline arabic_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline.
FeatureExtractionPipeline french_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline.
.add("tokeniser", ImmutableMap.of(
"type", "arabicstanford",
"type", "basic",
"filter_punctuation", true,
"normalise_urls", true
"normalise_urls", true,
"lower_case", true
)
)
.add("remove_stopwords", ImmutableMap.of(
"use", "true",
"lang", "ar"))
"lang", "fr"))
.add("filter_regex", "[\\-(())【\\[\\]】]")
.add("unigrams", true)
);

// Read the csv file that contains the article and save it in articles.ser
List<Instance> bl = savecorpus(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/source_background/ar_background_10.csv");
// to test the correctness of serializing articles load the generated file
Iterable<Instance> bl_test = load(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"));
// load one of the files previously generated by other language
Iterable<Instance> bl_en_test = load(new File("/Users/ay227/Desktop/CASM/git/wikisample-withzh/sample/zh-wiki-articles.ser"));
// Read the csv file that contains the article and save it in articles.ser
List<Instance> bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr_backgroud.csv");
// to test the correctness of serializing articles load the generated file

System.out.println("Done loading articles");
// to print the count of the serialized articles for example for the output should be
// done loading articles
// 36205 the size of the generated Arabic file (approximately)
// 15000 the size of english
System.out.println(count_size(bl_test));
System.out.println(count_size(bl_en_test));
// to print the serialized articles
print_instance(bl_test);

savepipeline(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"), arabic_pipeline);

// to print the serialized articles
print_instance(bl);
// to print the count of the serialized articles for example for the output should be
System.out.println(count_size(bl));

savepipeline(new File("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-pipeline.ser"), french_pipeline);
System.out.println("Done saving pipeline.ser");



FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-count.ser"), 1, bl, arabic_pipeline, 3);
System.out.println("Done testing FeatureBasedCounts");
// FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/French_background/fr-wiki-count.ser"), 1, bl, french_pipeline, 3);
// System.out.println("Done testing FeatureBasedCounts");

IncrementalFeatureCounter cNew = new IncrementalFeatureCounter(0.1);
cNew.incrementCounts(bl, arabic_pipeline, 10);
cNew.incrementCounts(bl, french_pipeline, 10);
cNew.pruneFeaturesWithCountLessThanN(3);
System.out.println("Done testing IncrementalFeatureCounter");

try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-inc-feat-counts.ser"))){
try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-inc-feat-counts.ser"))){
out.writeObject(cNew);
}
System.out.println("Done saving ar-wiki-inc-feat-counts.ser");


System.out.println("Done saving wiki-inc-feat-counts.ser");

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ public class TokenFilterRelevanceStopwords extends TokenFilter {
"unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen",
"welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde",
"würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens"));

// French ("fr") stopword set. Every entry must be its own string literal: a missing
// separator fuses neighbouring words into one token that never matches real text.
put("fr", Sets.newHashSet("a", "abord", "absolument", "afin", "ah", "ai", "aie", "aient", "aies", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", "aucuns", "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez",
"aurions", "aurons", "auront", "aussi", "autant", "autre", "autrefois", "autrement", "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avoir", "avons", "ayant", "ayez", "ayons", "b", "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "bon", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là",
"celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "comparable",
"comparables", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "devrait", "different", "differentes", "différent", "différente", "différentes", "différents", "dire", "directe",
"directement", "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", "dont", "dos", "douze", "douzième", "dring", "droite", "du", "duquel", "durant", "dès", "début", "désormais", "e", "effet", "egale", "egalement", "egales", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "essai", "est", "et",
"etant", "etc", "etre", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "eûmes", "eût", "eûtes", "f", "fais", "faisaient", "faisant", "fait", "faites", "façon", "feront", "fi", "flac", "floc", "fois", "font", "force", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes",
"g", "gens", "h", "ha", "haut", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "ici", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m",
"ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mine", "minimale", "moi", "moi-meme", "moi-même", "moindres", "moins", "mon", "mot", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "nommés",
"non", "nos", "notamment", "notre", "nous", "nous-mêmes", "nouveau", "nouveaux", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "où", "p", "paf", "pan", "par", "parce", "parfois", "parle", "parlent", "parler", "parmi", "parole", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas",
"passé", "pendant", "pense", "permet", "personne", "personnes", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "pièce", "plein", "plouf", "plupart", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", "procedant", "proche", "près",
"psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelquun", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent",
"restrictif", "retour", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", "sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", "semble", "semblent", "sent", "sept", "septième", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "seul", "seule", "seulement", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même",
"soient", "sois", "soit", "soixante", "sommes", "son", "sont", "sous", "souvent", "soyez", "soyons", "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", "suivant", "suivante", "suivantes", "suivants", "suivre", "sujet", "superpose", "sur", "surtout", "t", "ta", "tac", "tandis", "tant", "tardive", "te", "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir",
"tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", "trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", "uniques", "uns", "v", "va", "vais", "valeur", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan",
"voici", "voie", "voient", "voilà", "voire", "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", "étais", "était", "étant", "état", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"));
}};

public static boolean supportedLanguages(String lang) {
Expand Down