diff --git a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java index 237b1f9..b7a339c 100644 --- a/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java +++ b/src/main/java/uk/ac/susx/tag/classificationframework/clusters/clusteranalysis/ClusterFeatureAnalysis.java @@ -545,7 +545,7 @@ public static FeatureBasedCounts loadBackgroundCounter(File inputFile) throws IO public static List readCsv(String inpath) { List list = new ArrayList(); try { - Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_16); + Reader reader = Files.newBufferedReader(Paths.get(inpath) , StandardCharsets.UTF_8); CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT // change based on the the number of columns in the file, my file had only text column which contains the articles .withHeader("text") @@ -553,7 +553,7 @@ public static List readCsv(String inpath) { .withTrim()); for (CSVRecord csvRecord: csvParser) { - String text = csvRecord.get("text").replaceAll("[^\u0000-\u200f]", ""); + String text = csvRecord.get("text"); // This was added by Qiwiei to check if there is some errors in the file i guess for missing values i kept it but change the size to the number of columns your file have in my case only one column if (csvRecord.size()!=1) { System.out.println("!!!!!!! 
corrupted instance !!!!!"); @@ -619,17 +619,6 @@ public static void print_instance(Iterable corpus) { public static void main(String[] args) throws IOException, ClassNotFoundException { -// // Deserialised it to see if it contains the lang key -// ObjectInputStream ios = new ObjectInputStream(new FileInputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser")); -// FeatureExtractionPipeline temp; -// try { -// while ((temp = (FeatureExtractionPipeline) ios.readObject()) != null) { -// System.out.println(temp); -// } -// } catch (EOFException e) { -// } finally { -// ios.close(); -// } List topFeatures = Lists.newArrayList( "visualisation", @@ -645,56 +634,49 @@ public static void main(String[] args) throws IOException, ClassNotFoundExceptio ); // build the pipeline - FeatureExtractionPipeline arabic_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline. + FeatureExtractionPipeline french_pipeline = new PipelineBuilder().build(new PipelineBuilder.OptionList() // Instantiate the pipeline. 
.add("tokeniser", ImmutableMap.of( - "type", "arabicstanford", + "type", "basic", "filter_punctuation", true, - "normalise_urls", true + "normalise_urls", true, + "lower_case", true ) ) .add("remove_stopwords", ImmutableMap.of( "use", "true", - "lang", "ar")) + "lang", "fr")) .add("filter_regex", "[\\-(())【\\[\\]】]") .add("unigrams", true) ); -// Read the csv file that contains the article and save it in articles.ser - List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/source_background/ar_background_10.csv"); -// to test the correctness of serializing articles load the generated file - Iterable bl_test = load(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-articles.ser")); -// load one of the files previously generated by other language - Iterable bl_en_test = load(new File("/Users/ay227/Desktop/CASM/git/wikisample-withzh/sample/zh-wiki-articles.ser")); + // Read the csv file that contains the article and save it in articles.ser + List bl = savecorpus(new File("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-articles.ser"), "/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr_backgroud.csv"); + // to test the correctness of serializing articles load the generated file System.out.println("Done loading articles"); -// to print the count of the serialized articles for example for the output should be -// done loading articles -// 36205 the size of the generated Arabic file (approximately) -// 15000 the size of english - System.out.println(count_size(bl_test)); - System.out.println(count_size(bl_en_test)); -// to print the serialized articles - print_instance(bl_test); - - savepipeline(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-pipeline.ser"), arabic_pipeline); + + // to print the serialized articles + print_instance(bl); + // to print the count of the serialized articles for example for the output should be + 
System.out.println(count_size(bl)); + + savepipeline(new File("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-pipeline.ser"), french_pipeline); System.out.println("Done saving pipeline.ser"); - FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-count.ser"), 1, bl, arabic_pipeline, 3); - System.out.println("Done testing FeatureBasedCounts"); +// FeatureBasedCounts counter1 = saveNewBackgroundCounter(new File("/Users/ay227/Desktop/CASM/M52/French_background/fr-wiki-count.ser"), 1, bl, french_pipeline, 3); +// System.out.println("Done testing FeatureBasedCounts"); IncrementalFeatureCounter cNew = new IncrementalFeatureCounter(0.1); - cNew.incrementCounts(bl, arabic_pipeline, 10); + cNew.incrementCounts(bl, french_pipeline, 10); cNew.pruneFeaturesWithCountLessThanN(3); System.out.println("Done testing IncrementalFeatureCounter"); - try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/Arabic_background/ar-wiki-inc-feat-counts.ser"))){ + try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/Users/ay227/Desktop/CASM/M52/generate_background/French_background/fr-wiki-inc-feat-counts.ser"))){ out.writeObject(cNew); } - System.out.println("Done saving ar-wiki-inc-feat-counts.ser"); - - + System.out.println("Done saving wiki-inc-feat-counts.ser"); } diff --git a/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java b/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java index f72655c..56afc5f 100644 --- a/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java +++ b/src/main/java/uk/ac/susx/tag/classificationframework/featureextraction/filtering/TokenFilterRelevanceStopwords.java @@ -188,6 +188,22 @@ public class 
TokenFilterRelevanceStopwords extends TokenFilter { "unsen","unser","unsere","unserer","unses","unter","v","vergangenen","viel","viele","vielem","vielen","vielleicht","vier","vierte","vierten","vierter","viertes","vom","von","vor","w","wahr","wann","war","waren","warst","wart","warum","was","weg","wegen","weil","weit","weiter","weitere","weiteren","weiteres","welche","welchem","welchen", "welcher","welches","wem","wen","wenig","wenige","weniger","weniges","wenigstens","wenn","wer","werde","werden","werdet","weshalb","wessen","wie","wieder","wieso","will","willst","wir","wird","wirklich","wirst","wissen","wo","woher","wohin","wohl","wollen","wollt","wollte","wollten","worden","wurde","wurden","während","währenddem","währenddessen","wäre","würde", "würden","x","y","z","z.b","zehn","zehnte","zehnten","zehnter","zehntes","zeit","zu","zuerst","zugleich","zum","zunächst","zur","zurück","zusammen","zwanzig","zwar","zwei","zweite","zweiten","zweiter","zweites","zwischen","zwölf","über","überhaupt","übrigens")); + // French ("fr") stop words. + put("fr", Sets.newHashSet("a", "abord", "absolument", "afin", "ah", "ai", "aie", "aient", "aies", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", "aucuns", "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", + "aurions", "aurons", "auront", "aussi", "autant", "autre", "autrefois", "autrement", "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avoir", "avons", "ayant", "ayez", "ayons", "b", "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "bon", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", + "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cependant", "certain", "certaine", 
"certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "comparable", + "comparables", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "devrait", "different", "differentes", "différent", "différente", "différentes", "différents", "dire", "directe", + "directement", "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", "dont", "dos", "douze", "douzième", "dring", "droite", "du", "duquel", "durant", "dès", "début", "désormais", "e", "effet", "egale", "egalement", "egales", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "essai", "est", "et", + "etant", "etc", "etre", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "eûmes", "eût", "eûtes", "f", "fais", "faisaient", "faisant", "fait", "faites", "façon", "feront", "fi", "flac", "floc", "fois", "font", "force", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", + "g", "gens", "h", "ha", "haut", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "ici", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", 
"leur", "leurs", "longtemps", "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m", + "ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mine", "minimale", "moi", "moi-meme", "moi-même", "moindres", "moins", "mon", "mot", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "nommés", + "non", "nos", "notamment", "notre", "nous", "nous-mêmes", "nouveau", "nouveaux", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "où", "p", "paf", "pan", "par", "parce", "parfois", "parle", "parlent", "parler", "parmi", "parole", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas", + "passé", "pendant", "pense", "permet", "personne", "personnes", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "pièce", "plein", "plouf", "plupart", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", "procedant", "proche", "près", + "psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelquun", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent", + "restrictif", "retour", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", 
"sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", "semble", "semblent", "sent", "sept", "septième", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "seul", "seule", "seulement", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", + "soient", "sois", "soit", "soixante", "sommes", "son", "sont", "sous", "souvent", "soyez", "soyons", "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", "suivant", "suivante", "suivantes", "suivants", "suivre", "sujet", "superpose", "sur", "surtout", "t", "ta", "tac", "tandis", "tant", "tardive", "te", "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir", + "tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", "trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", "uniques", "uns", "v", "va", "vais", "valeur", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", + "voici", "voie", "voient", "voilà", "voire", "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", "étais", "était", "étant", "état", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô")); }}; public static boolean supportedLanguages(String lang) {