From e3827be6c3b1b8888ebdf8ad3c5c759858c62b13 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Fri, 26 Apr 2019 13:26:29 +0200 Subject: [PATCH 1/8] progress --- .gitignore | 5 +- data/{eval => }/random.csv | 1112 ++++++-------------- src/check/isstub.py | 84 +- src/classify/decision_tree.py | 94 +- src/cluster/hierarchical.py | 109 ++ src/data/__init__.py | 18 +- src/data/eval/random_sampling.py | 21 +- src/data/explore/explore_seed_infoboxes.py | 6 +- src/data/explore/explore_seed_lists.py | 8 + src/data/explore/explore_seed_nouns.py | 11 +- src/features/cop_firstsentence.py | 1 + src/features/cop_semgrex.py | 37 +- src/features/lists_of.py | 4 +- src/features/summary_lemma.py | 2 +- src/features/summary_words.py | 2 +- src/mine/dbpedia.py | 1 + src/mine/mine_listlinks.py | 27 + src/mine/pipeline.py | 15 +- src/mine/wiki.py | 14 +- src/stanford/custom_stanford_api.py | 8 +- 20 files changed, 663 insertions(+), 916 deletions(-) rename data/{eval => }/random.csv (57%) create mode 100644 src/cluster/hierarchical.py create mode 100644 src/data/explore/explore_seed_lists.py create mode 100644 src/mine/mine_listlinks.py diff --git a/.gitignore b/.gitignore index e6f0d06..2965b03 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ *.dat-jrnl *.lock *.log - +.DS_Store .metadata bin/ tmp/ @@ -19,6 +19,9 @@ local.properties .settings/ .loadpath .recommenders +data/mined/ +data/temp/ +src/env/ # External tool builders .externalToolBuilders/ diff --git a/data/eval/random.csv b/data/random.csv similarity index 57% rename from data/eval/random.csv rename to data/random.csv index 7bcf61f..3dd2bc4 100644 --- a/data/eval/random.csv +++ b/data/random.csv @@ -1,79 +1,46 @@ -|List_of_years_in_Swedish_television|,|0| |Garnir_relations|,|0| |Behind_Enemy_Lines_(band)|,|0| -|Thorbjørn_Egners_lesebøker|,|0| -|Free_monoid|,|0| |International_Ideographs_Core|,|0| |Kajona|,|0| -|List_of_Prime_Ministers_of_Canada_by_approval_rating|,|0| |List_of_largest_poker_tournaments_in_history_(by_prize_pool)|,|0| -|Project_404|,|0| |Nehari_manifold|,|0| |Jucys–Murphy_element|,|0| -|Directed_individual_study|,|0| |Rafael_Irizarry_(scientist)|,|0| |Discriminator|,|0| |Central_African_Republic|,|0| -|Herman_of_Alaska|,|0| |Cyathea_acanthophora|,|0| -|Verrückt_(water_slide)|,|0| |MaxCliqueDyn_maximum_clique_algorithm|,|0| -|Nick_O'Bannon|,|0| |Dol_Ammad|,|0| |Pungent_Stench|,|0| |Kalhora|,|0| |High_entropy_alloys|,|0| |Total_Immersion_Racing|,|0| -|Balankanche|,|0| -|List_of_longest-running_United_States_television_series|,|0| -|Close_to_You_(Maxi_Priest_song)|,|0| |Technology_transfer_in_computer_science|,|0| -|Arctic_Research_Foundation|,|0| -|Mathematical_fallacy|,|0| |Gilbert_model|,|0| -|Contingent_aftereffect|,|0| -|List_of_tallest_hotels|,|0| -|Energy_medicine|,|0| |Nexus_5X|,|0| |Brothers_Hildebrandt|,|0| -|Lovin'_Is_Really_My_Game|,|0| |Officer|,|0| |Computing_with_Memory|,|0| -|Oahspe:_A_New_Bible|,|0| |Cdist|,|0| -|Steigerkopf|,|0| |Isuzu_117_Coupé|,|0| -|Parker_immunity_doctrine|,|0| -|Lesya|,|0| -|Eye_rhyme|,|0| |North_Palisade|,|0| -|Conrad_of_Parzham|,|0| |Raking_light|,|0| -|Accione|,|0| |List_of_games_using_SDL|,|0| -|I_Love_You_(Martina_McBride_song)|,|0| |24_Boötis|,|0| |Dancing_in_the_Streets|,|0| -|Yahya_Efendi|,|0| |AI_Bridging_Cloud_Infrastructure|,|0| -|Road_to_Emmaus_appearance|,|0| |Bratschen|,|0| |Ordered_dithering|,|0| |HD_224693|,|0| |Female_entrepreneurs|,|0| |Regional_Data_Exchange_System|,|0| |Landmark_Worldwide|,|0| -|Digital_orthophoto_quadrangle|,|0| -|Midpoint_(astrology)|,|0| 
-|Athletics_record_progressions|,|0| |Boussingaultite|,|0| |Dream_argument|,|0| |Google_litigation|,|0| |Analog-to-digital_converter|,|0| |Master_of_the_Buckhounds|,|0| -|IPO_model|,|0| |Abell_754|,|0| -|Rule_in_Dumpor's_Case|,|0| |Coxeter_notation|,|0| |Platform-independent_GUI_library|,|0| |Orange_Goblin|,|0| @@ -84,15 +51,9 @@ |Ford_Super_Duty|,|0| |Lira_da_braccio|,|0| |Dantzig–Wolfe_decomposition|,|0| -|Renierite|,|0| -|Crisis_communication|,|0| |Pozohondo_(Vino_de_la_Tierra)|,|0| -|Kolakeia|,|0| -|Lady_(Jack_Jones_song)|,|0| -|Deep_operation|,|0| |DT_Virginis|,|0| |Isotonic_regression|,|0| -|Runge's_theorem|,|0| |Jenny_LeClue|,|0| |Jokgu|,|0| |Fiddler's_Green|,|0| @@ -103,33 +64,23 @@ |James_Walbourne|,|0| |HD_151613|,|0| |Anvil_of_Doom|,|0| -|CELAR|,|0| |Bhuikhel|,|0| |Richard_and_Linda_J._Eyre|,|0| |Ternary_computer|,|0| -|Erotic_art_in_Pompeii_and_Herculaneum|,|0| |Schur_multiplier|,|0| |Artificial_photosynthesis|,|0| |2093_Genichesk|,|0| |The_Christopher_Street_Connection|,|0| -|Paleo-Balkan_mythology|,|0| |The_Ceasars|,|0| |Bernheim_Arboretum_and_Research_Forest|,|0| -|Ishikism|,|0| |Grammy_Award_for_Best_Hard_Rock/Metal_Performance_Vocal_or_Instrumental|,|0| -|Aggregative_Contingent_Estimation_(ACE)_Program|,|0| |PPML|,|1| |Anatomical_terms_of_location|,|0| -|Arctic_and_Antarctic_Research_Institute|,|0| |Methods_of_computing_square_roots|,|0| -|Ain't_Nothing_'bout_You|,|0| |Digital_economy_rankings|,|0| |YAML|,|1| |Flat_panel_detector|,|0| -|Bush_tomato|,|0| -|Ocean_Drive_(Duke_Dumont_song)|,|0| |Planeta_No|,|0| -|Doldrums|,|0| |Abacaba_pattern|,|0| |The_Alice_Rose|,|0| |Vorticity_confinement|,|0| @@ -137,123 +88,76 @@ |Ginkgo_CADx|,|0| |Custos_Messium|,|0| |Clockmaker|,|0| -|Duenos_inscription|,|0| -|European_Newspaper_Award|,|0| |CimTrak|,|0| |HD_23089|,|0| -|Erdős–Rado_theorem|,|0| |Richard_Alley|,|0| |Species|,|0| |Ergodic_sequence|,|0| -|Pin_prick_attack|,|0| |Java_Caps|,|0| -|Brian_Conrad|,|0| -|Gromov's_compactness_theorem_(topology)|,|0| |Makani_Power|,|0| |Watcher_(presence)|,|0| |Onogoro_Island|,|0| |Viathyn|,|0| -|Okun's_law|,|0| -|Colonel_Tomb|,|0| -|Total_Software_Deployment|,|0| +|Total_Software_Deployment|,|1| |Nepotism|,|0| -|Eye_of_the_Tiger|,|0| |Framework_Programmes_for_Research_and_Technological_Development|,|0| -|Origen|,|0| -|Great_Work|,|0| -|Snooker_world_rankings_2015/2016|,|0| |Gender_and_development|,|0| -|Filial_mourning|,|0| -|Antony_I_of_Constantinople|,|0| -|Dirichlet_process|,|0| |Geist|,|0| |Cockpit_display_system|,|0| |Strobiloideae|,|0| |Ophiuchus|,|0| -|Moral_hierarchy|,|0| |Metaplectic_group|,|0| |Surdo|,|0| |NGC_463|,|0| -|Proud_to_Fall|,|0| -|Émile_Léonard_Mathieu|,|0| -|White_Knuckle_Ride|,|0| -|White_lighter_myth|,|0| -|Blechnum_monomorphum|,|0| |CONADI|,|0| |Colossal_red_granite_statue_of_Amenhotep_III|,|0| -|Yerbatero|,|0| |KXDocker|,|0| -|No_One_Like_You|,|0| -|Pärlor_åt_svin|,|0| -|Fritillaria_chitralensis|,|0| |Hard_systems|,|0| |1987_Kaplan|,|0| -|I'd_Rather_Love_You|,|0| |Combined_DNA_Index_System|,|0| -|Nurse_stereotypes|,|0| |Cryptophasia|,|0| -|List_of_Ultratop_40_number-one_singles_of_2010|,|0| |Return_to_Mysterious_Island_2|,|0| |Telephone_numbers_in_Western_Sahara|,|0| |PROSITE|,|0| |Software-defined_data_center|,|0| -|Java_Decompiler|,|0| +|Java_Decompiler|,|1| |TZ_Cassiopeiae|,|0| -|World_record_progression_track_cycling_–_Men's_1_km_time_trial|,|0| |Web_Compatibility_Test_for_Mobile_Browsers|,|0| |Self_(programming_language)|,|1| |Box_modeling|,|0| |Service_layer|,|0| |Sotoarc|,|0| 
-|Hearts_Aren't_Made_to_Break_(They're_Made_to_Love)|,|0| -|Cube_of_Space|,|0| |Google_Translate|,|0| -|Dye_3|,|0| |HD_185269|,|0| |Böhm_tree|,|0| |Dryopteris_aemula|,|0| |Evolved_antenna|,|0| -|Qijue|,|0| -|Buddhism_and_evolution|,|0| |HD_85951|,|0| -|Philippe_Flajolet|,|0| -|Woodward_effect|,|0| |Primerica|,|0| |Belovezhskaya_Pushcha_National_Park|,|0| |Ones'_complement|,|0| |Alfonso_Fróilaz|,|0| |Square_lattice|,|0| |Phantasmagoria|,|0| -|Illusion|,|0| |Church_of_Hakeem|,|0| |SlickEdit|,|0| -|Immortality|,|0| -|Anselm_of_Besate|,|0| -|Undergraduate_research|,|0| |Technology_of_television|,|0| |UIQ|,|0| |CasADi|,|0| -|Jos_mä_oisin_sä|,|0| |Humf|,|0| |Apache_iBATIS|,|0| -|Ledger_Wood|,|0| |Multilinear_polynomial|,|0| |WISE_2220−3628|,|0| |Abelian_integral|,|0| |List_of_crowdsourcing_projects|,|0| -|Hercolubus|,|0| |Organizational_culture|,|0| |NGC_3918|,|0| |Maximal_torus|,|0| |Sir,_You_Are_Being_Hunted|,|0| |Evolution@Home|,|0| |StarCraft_II:_Legacy_of_the_Void|,|0| -|Qatma|,|0| |Seiberg_duality|,|0| -|Artephius|,|0| |Yemenite_Children_Affair|,|0| -|How_to_Be_a_Conservative|,|0| -|JoCaml|,|1| |Canto_(news_aggregator)|,|0| |List_of_cities_proper_by_population|,|0| |South_Deccan_Plateau_dry_deciduous_forests|,|0| @@ -262,42 +166,23 @@ |Heuristics_in_judgment_and_decision-making|,|0| |Civilization_V:_Brave_New_World|,|0| |Rappelz|,|0| -|Radio_y_Televisión_Martí|,|0| |GJ_3737|,|0| -|Addie_L._Ballou|,|0| |Last_call_(bar_term)|,|0| |Galois_cohomology|,|0| |Stephen_Altschul|,|0| -|Role-playing|,|0| -|Fabula_togata|,|0| -|Cyathea_glaziovii|,|0| |Nancy_Drew:_The_Deadly_Device|,|0| |Demographics_of_the_member_states_of_the_Organisation_of_Islamic_Cooperation|,|0| |Solicitors_Journal|,|0| |Advanced_combat_direction_system|,|0| -|Brave_(2018_book)|,|0| |D3web|,|0| -|Arthur_J._Finkelstein|,|0| -|Stereotypes_of_groups_within_the_United_States|,|0| |Messier_21|,|0| |Mount_Greylock|,|0| -|Sternomancy|,|0| -|Dynamic_Bayesian_network|,|0| |Rohit_(caste)|,|0| |Characters_of_Myst|,|0| -|Unbelievable_(Diamond_Rio_song)|,|0| |Graph_drawing|,|0| -|Yazidi_Academy|,|0| -|Expert|,|0| |Illusory_superiority|,|0| -|Afterlight|,|0| |Landed_gentry|,|0| -|Turing's_proof|,|0| -|Old_Master|,|0| |Zero-forcing_precoding|,|0| -|Underwater_searches|,|0| -|Adorcism|,|0| -|Machiavellian_intelligence|,|0| |Eclipse_SCADA|,|0| |Liouville_number|,|0| |Slovakia|,|0| @@ -305,19 +190,11 @@ |Pop-up_maps|,|0| |Video_compression_picture_types|,|0| |Battlefield_2142|,|0| -|Completely-S_matrix|,|0| |Mallotus_philippensis|,|0| -|Supertaster|,|0| |European_Nucleotide_Archive|,|0| -|Talladega_(song)|,|0| |Dodge_St._Regis|,|0| -|Chlormayenite|,|0| -|Jagadguru_Kripalu_Parishat|,|0| -|Epistemocracy|,|0| |Chaim_Noll|,|0| -|Logia|,|0| |Hertzsprung–Russell_diagram|,|0| -|Reactive_inhibition|,|0| |Tenement_(band)|,|0| |Single_particle_analysis|,|0| |Oscillator_sync|,|0| @@ -325,25 +202,15 @@ |Szydłów_Synagogue|,|0| |Phelim_Boyle|,|0| |The_Secret_Saturdays|,|0| -|MN103|,|0| |Eta_Aquarii|,|0| -|A_Girl_Like_You_(Edwyn_Collins_song)|,|0| |Heteromerae|,|0| |Glory_to_Ukraine|,|0| -|Flying_monkeys_(psychology)|,|0| |List_of_proposed_state_mergers|,|0| -|Darkness_in_El_Dorado|,|0| |GNU_arch|,|0| |Traceroute_(film)|,|0| -|Erdős–Mordell_inequality|,|0| |Jorge_Orta_(artist)|,|0| -|Deterministic_context-free_grammar|,|0| |History_monoid|,|0| |NGC_6124|,|0| -|I'm_Not_the_Only_One|,|0| -|Don't_Go_(Wretch_32_song)|,|0| -|Leuckart's_law|,|0| -|Mount_Marshall_(New_York)|,|0| |Mobile_asset_management|,|0| |Dagobah|,|0| |Team_Role_Inventories|,|0| @@ -352,325 
+219,180 @@ |Star_Wars_Forces_of_Destiny|,|0| |Vernacular|,|0| |Kappa_Columbae|,|0| -|Magical_motto|,|0| |COPASI|,|0| |Outline_of_Swaziland|,|0| |Train_whistle|,|0| |Rho_Leonis|,|0| |Dsign_Music|,|0| -|Memory_work|,|0| -|Whitewash_(sport)|,|0| |2114_Wallenquist|,|0| |1DayLater|,|0| |Videokymography|,|0| |Dr._Sapir_Hospital_and_Charity_Center|,|0| -|44_Cancri|,|0| |Fischer_group_Fi24|,|0| -|Gravitational_interaction_of_antimatter|,|0| -|It's_Raining_Men|,|0| |Kurt_Hummel|,|0| |NGC_64|,|0| |HD_210277|,|0| -|HD_180555|,|0| -|Teach_Me_Again|,|0| |Zagar_Mengal|,|0| |Index_Filicum|,|0| |Grammy_Award_for_Best_Roots_Gospel_Album|,|0| |Communism|,|0| |Aqualung_(software)|,|0| -|Tattva_(Jainism)|,|0| -|Acystopteris|,|0| -|Electrophone_(information_system)|,|0| -|Light_dues|,|0| -|Can't_Say_I'm_Sorry|,|0| -|Capture_of_Grenada_(1779)|,|0| -|Botanical_expedition|,|0| -|Breath_of_the_Gods|,|0| -|List_of_UK_Albums_Chart_number_ones_of_the_2000s|,|0| -|Divine_grace|,|0| -|Research_assistant|,|0| |HD_215497|,|0| |Viterbi_decoder|,|0| -|Operation_Tomodachi|,|0| |High_yellow|,|0| |FTSE_All-World_index_series|,|0| |Herbert_Gelernter|,|0| |Daily_Mix|,|0| -|Crystalline_cohomology|,|0| -|List_of_2009_box_office_number-one_films_in_the_United_Kingdom|,|0| -|Unitary_theories_of_memory|,|0| -|International_Moral_Education_Congress|,|0| |NTRUSign|,|0| -|Russo–Dye_theorem|,|0| -|Dievdirbys|,|0| |Helminthostachys_zeylanica|,|0| |Samy_(computer_worm)|,|0| |Quippian|,|0| |MP3+G|,|1| |Rheoencephalography|,|0| -|Rape_fantasy|,|0| -|List_of_Norwegians_by_net_worth|,|0| |Endless_(Frank_Ocean_album)|,|0| -|William_Murray_(educationist)|,|0| -|Start_of_a_Romance|,|0| -|List_of_the_most_prominent_summits_of_Colorado|,|0| -|List_of_number-one_singles_of_2004_(Australia)|,|0| |Capacity_in_English_law|,|0| -|Waring's_prime_number_conjecture|,|0| |Approximation_theory|,|0| -|Hour_record|,|0| |Polybotrya|,|0| |TVPaint_Animation|,|0| -|Any_Man_of_Mine|,|0| -|Exceptional_isomorphism|,|0| |Hitomi_no_Naka_no_Galaxy/Hero|,|0| |Rodolfo_Emilio_Giuseppe_Pichi-Sermolli|,|0| -|Right_or_Wrong_(song)|,|0| -|Gennithika_Gia_Sena/Always_Broken|,|0| |Vivid_knowledge|,|0| |Strip_Search_(web_series)|,|0| -|Pericú_language|,|0| |Gray_card|,|0| |Skype_Technologies|,|0| |Shared_information_bias|,|0| |Phi2_Orionis|,|0| -|List_of_2004_box_office_number-one_films_in_Australia|,|0| -|FX-87|,|0| -|Haddington_Range|,|0| |Thomas_D._Waldhauser|,|0| -|Supreme_Council_for_the_Confucian_Religion_in_Indonesia|,|0| -|Ukobach|,|0| -|DGCA_(computing)|,|1| |Ministry_of_Information_(United_Kingdom)|,|0| |NGC_7424|,|0| |Muskox|,|0| |Lieu-dit|,|0| -|Secular_Review|,|0| |NGC_1087|,|0| |Scaleform_GFx|,|0| |Richard_Roberts_(pharmaceutical_executive)|,|0| |Potrace|,|0| |Near-field_communication|,|0| -|Death_messenger|,|0| |Entropy_of_mixing|,|0| |Amanda_Barnard|,|0| |Nu_Doradus|,|0| -|Arctic_Climate_Impact_Assessment|,|0| -|An_Enquiry_Concerning_Human_Understanding|,|0| |Ain't_Nobody_Got_Time_for_That|,|0| -|Ecotheology|,|0| |Boötes|,|0| |Logitech_Media_Server|,|0| -|HORTON_(software)|,|0| |Dimensionality_reduction|,|0| |Mark_Phillips_(author)|,|0| -|Homotopy_group|,|0| -|Time_Variance_Authority|,|0| -|LOADHIGH|,|0| -|List_of_UK_Independent_Album_Breakers_Chart_number_ones_of_the_2000s|,|0| |Text_Template_Transformation_Toolkit|,|1| -|Direct_therapeutic_exposure|,|0| -|Ruffneck_(song)|,|0| |Captology|,|0| |HD_215114|,|0| -|Angle_(astrology)|,|0| -|Jangada|,|0| |V3903_Sagittarii|,|0| |Union_catalog|,|0| -|Somebody_Dance_with_Me|,|0| -|Compass|,|0| 
|Flintstones_(basketball)|,|0| -|Publish_or_perish|,|0| -|Gireogi_appa|,|0| |Spec_Sharp|,|1| -|Peter_Cameron_(minister)|,|0| |Wodginite|,|0| |Feminist_digital_humanities|,|0| |Phocaea_family|,|0| -|Who_Is_It_(Michael_Jackson_song)|,|0| |The_Machine_Question|,|0| |Samorost|,|0| |Man_the_Hunter|,|0| |Iota_Pictoris|,|0| |GNU_C_Library|,|0| -|Taiyi_Shengshui|,|0| -|Society_of_Saint_Pius_V|,|0| -|Acoustic_Doppler_velocimetry|,|0| |White_(KAT-TUN_song)|,|0| |Perlin_noise|,|0| -|Juggling_world_records|,|0| |Antibody_microarray|,|0| |Sony_Xperia_Z3_Compact|,|0| |Do3D|,|0| -|John_Foxe's_apocalyptic_thought|,|0| |Senario|,|0| |Aestivation_(botany)|,|0| |Transformers:_The_Last_Knight|,|0| |World_Network_of_Biosphere_Reserves|,|0| |Nypa_fruticans|,|0| |Internet_culture|,|0| -|I_Just_Fall_in_Love_Again|,|0| |Canadian_Index_of_Consumer_Confidence|,|0| -|Ronald_Gould_(mathematician)|,|0| -|Language_As_Symbolic_Action|,|0| |Pajama_Sam_3:_You_Are_What_You_Eat_from_Your_Head_to_Your_Feet|,|0| |Middle_Franconia|,|0| |Tonkori|,|0| |Synchronicity|,|0| -|Copying_network_models|,|0| |The_Famous_Five_(football)|,|0| -|Frog_pond_effect|,|0| -|American_Market|,|0| |Bat_algorithm|,|0| |Japanese|,|0| |LTTng|,|0| -|List_of_Colorado_locations_by_per_capita_income|,|0| -|List_of_largest_private_companies_in_the_United_Kingdom|,|0| |Chi_Hydrae|,|0| -|Direct_comparison_test|,|0| |Ampere_(band)|,|0| -|Negative_inversion|,|0| -|Kaiserpanorama|,|0| |Noddy_(TV_interview_technique)|,|0| -|1951_executions_in_Albania|,|0| |Henry_Crawford|,|0| -|W-test|,|0| |Drools|,|0| |Lax_equivalence_theorem|,|0| -|Public_rhetoric|,|0| |Liberty_Writers_News|,|0| -|Don't_Sleep_in_the_Subway|,|0| -|Fritillaria_pyrenaica|,|0| |Solver|,|0| -|Independence_hypothesis|,|0| |Binomial_heap|,|0| -|List_of_largest_cargo_airports_in_the_United_States|,|0| -|Shirk_(Islam)|,|0| -|Academy_Award_for_Best_Makeup_and_Hairstyling|,|0| -|Barbier's_theorem|,|0| |Maffei_2|,|0| -|James_Hydrick|,|0| -|Strongly_connected_component|,|0| |The_Betsy|,|0| -|World_in_Motion|,|0| -|Nissan_Lafesta|,|0| |Anna_Wikland|,|0| |Jef_Raskin|,|0| |GNU_Interpreter_for_Java|,|0| |Chematica|,|0| -|Alexander_of_Abonoteichus|,|0| |Chi_Virginis|,|0| |Process.h|,|0| -|Far-Fetched_Facts|,|0| |CUTEr|,|0| |Nullarbor_Plain|,|0| |Meyer_set|,|0| -|Journal_of_Transpersonal_Psychology|,|0| |Semantic_heterogeneity|,|0| -|Lomar|,|0| -|Sumerian_creation_myth|,|0| |Sharp_Zaurus|,|0| |Spartan-V|,|0| -|Equivalent_spherical_diameter|,|0| |AgentCubes|,|1| |Parallel_projection|,|0| -|Wisdom_literature|,|0| |Blind_Faith|,|0| |Austrobaileyales|,|0| -|Montaillou_(book)|,|0| |Dr._DivX|,|0| -|Oliver_Benjamin|,|0| -|Stealth_juror|,|0| -|The_Blue_Jackal|,|0| |In_the_Dark_(podcast)|,|0| |Baldwin_Locomotive_Works|,|0| -|Change:_The_Magazine_of_Higher_Learning|,|0| -|Koto-furunushi|,|0| -|Council_of_Paderborn|,|0| -|CER-202|,|0| |ISO_3166-2:KY|,|0| -|List_of_number-one_albums_of_2013_(Canada)|,|0| -|Invenio|,|0| -|Irreligion_in_Rwanda|,|0| -|Carr_Communications|,|0| |Drugs_I_Need|,|0| -|I'm_Leaving_It_Up_to_You|,|0| -|HD_141937_b|,|0| |Random_neural_network|,|0| -|Staatenverbund|,|0| |ZBasic|,|0| -|List_of_Billboard_number-one_albums_of_1952|,|0| |Bard_(Dungeons_&_Dragons)|,|0| |Innovation_economics|,|0| |Beta_Pyxidis|,|0| -|Cock_Lane_ghost|,|0| -|Downbound_/_Upbound|,|0| |Comparison_of_LAN_messengers|,|0| |European_chemical_Substances_Information_System|,|0| -|List_of_Staffordshire_settlements_by_population|,|0| |Amman_Message|,|0| |World's_largest_palace|,|0| |Franco-Ontarian_flag|,|0| |Synaptic_transistor|,|0| 
|Sim4|,|0| |Gemini_Guidance_Computer|,|0| -|Fraternité_Notre-Dame|,|0| -|Experiencia_Religiosa|,|0| |The_Hampton_Institute|,|0| |Jongla|,|0| |All_Shall_Perish|,|0| |Superfetation|,|0| -|Looking_for_a_New_Love|,|0| |Pūtātara|,|0| |Desktop_Dungeons|,|0| -|USRC_Diligence|,|0| -|List_of_military_nuclear_accidents|,|0| |Iridescence|,|0| |EU_Open_Data_Portal|,|0| |D*|,|0| |Granularity_(parallel_computing)|,|0| |Vector-radix_FFT_algorithm|,|0| -|Fool_(If_You_Think_It's_Over)|,|0| -|Ideological_criticism|,|0| |Osmos|,|0| -|Babaylan|,|0| -|Operation_Eastern_Exit|,|0| |Hotfrog|,|0| -|Estadio_Doroteo_Guamuch_Flores|,|0| -|Sepharial|,|0| -|Abrázame_Muy_Fuerte_(song)|,|0| -|Light-mantled_albatross|,|0| -|Pharmacokinetics_simulation|,|0| |Dolorian|,|0| |EXtensible_Server_Pages|,|1| -|PX4_autopilot|,|0| |Lisa:_The_Painful|,|0| |MINUIT|,|0| |F55_(classification)|,|0| -|Arctic_Search_and_Rescue_Agreement|,|0| |Davidon–Fletcher–Powell_formula|,|0| -|Daniel_Zion|,|0| |Officialese|,|0| |Odds_algorithm|,|0| |Shaheen-III|,|0| |Outline_of_Poland|,|0| |TC0|,|0| |Vexi|,|0| -|List_of_deadly_earthquakes_since_1900|,|0| |Geikielite|,|0| |Thelema|,|0| |Bastard_brothers|,|0| -|Calais_(Reuters_product)|,|0| -|Kazakh_Uplands|,|0| |Loews_Cineplex_Entertainment|,|0| -|The_leans|,|0| -|Bob_Bryan|,|0| -|Dan_Smith_Will_Teach_You_Guitar|,|0| |Harish-Chandra_isomorphism|,|0| |L'Arc-en-Ciel|,|0| |CT_pulmonary_angiogram|,|0| |Pilot_plant|,|0| -|John_the_Revelator_/_Lilian|,|0| |Design_Web_Format|,|1| |Cycle_polo|,|0| |Colorimeter_(chemistry)|,|0| @@ -683,143 +405,85 @@ |Attila_(metalcore_band)|,|0| |Xenocentrism|,|0| |123_Marseille|,|0| -|Psychorama|,|0| -|I'm_in_Love_(I_Wanna_Do_It)|,|0| -|Tasman_Outflow|,|0| |Congo_River|,|0| |Reeducation_in_Communist_Romania|,|0| |Folgerphone|,|0| -|Cyberith_Virtualizer|,|0| |Random_indexing|,|0| |Horapollo|,|0| |Pisces_I_(dwarf_galaxy)|,|0| |Area_codes_in_Mexico_by_code|,|0| |Robert_Wilensky|,|0| |DONKEY.BAS|,|0| -|Politics_Can_Be_Different|,|0| -|Design_matrix|,|0| |YASARA|,|0| |Kono_Mystery_ga_Sugoi!|,|0| -|Wings_of_a_Butterfly|,|0| |Snit|,|0| -|Fierz_identity|,|0| -|Asclepiodotus_of_Alexandria|,|0| |Indicator_bacteria|,|0| |Google_Street_View_in_Europe|,|0| |Dependent_type|,|0| |PhosphoSitePlus|,|0| |Summum|,|0| |Detrended_fluctuation_analysis|,|0| -|Automotive_Industries_(magazine)|,|0| -|PR_Newswire|,|0| -|Whispering_campaign|,|0| |Quaternary_cubic|,|0| -|List_of_1998_box_office_number-one_films_in_Australia|,|0| -|Thurstone_scale|,|0| -|Woo_Hah!!_Got_You_All_in_Check|,|0| -|Bhakti_yoga|,|0| -|List_of_number-one_hits_of_1993_(Mexico)|,|0| -|Angakkuq|,|0| -|JLab|,|0| |LRLL_54361|,|0| |Jaguar_D-Type|,|0| |P/B_ratio|,|0| -|Bastard_color|,|0| |XML_Interface_for_Network_Services|,|0| |Lively_Kernel|,|0| |NGC_6293|,|0| |LW9|,|0| -|Volkstum|,|0| -|Love_Is_Love_Is_Love|,|0| -|She's_a_Miracle|,|0| |ISO_3166-2:BZ|,|0| -|Is_Anybody_Goin'_to_San_Antone|,|0| |BMC_Systems_Biology|,|0| |Outline_of_Nicaragua|,|0| |Holmes_(computer)|,|0| -|Existentially_closed_model|,|0| |Matthew_3:11|,|0| -|Birkhoff's_theorem_(relativity)|,|0| |TWX_(magazine)|,|0| -|Stochastic_portfolio_theory|,|0| |Gunpoint_(video_game)|,|0| |Ashok_Row_Kavi|,|0| -|Margaret_Jones_(Puritan_midwife)|,|0| |Function_point|,|0| |Ford_Falcon_(BF)|,|0| -|Hiyoshi-zukuri|,|0| |Visual_language|,|0| |Sigillaria|,|0| -|Kappa_Lyrae|,|0| -|Eckmann–Hilton_argument|,|0| -|Aetites|,|0| |1087_Arabis|,|0| -|Tutte_l'opere_d'architettura_et_prospetiva|,|0| |The_Chicken_and_the_Pig|,|0| |Human_impact_on_the_environment|,|0| |HD_217107_c|,|0| 
-|Outline_of_communication|,|0| |Standing_crop|,|0| |NGC_232|,|0| -|Matrix_determinant_lemma|,|0| |HD_130084|,|0| |Thriller_(viral_video)|,|0| |Collapse_of_the_Royal_Plaza_Hotel|,|0| -|Ga-Rei|,|0| |Queer_theology|,|0| |Leopold_and_Rudolf_Blaschka|,|0| |Media_engagement_framework|,|0| |McAlpine_v_Bercow|,|0| -|Formula_composition|,|0| -|New_York_lunar_sample_displays|,|0| |System_for_Electronic_Document_Analysis_and_Retrieval|,|0| -|Scientology_beliefs_and_practices|,|0| -|Tashbih|,|0| |Cuestión_moral:_si_el_chocolate_quebranta_el_ayuno_eclesiástico|,|0| -|Two_truths_doctrine|,|0| -|Padeye|,|0| |Rural_area|,|0| -|Main_conjecture_of_Iwasawa_theory|,|0| |The_Acid|,|0| |System_requirements_specification|,|0| -|1977_Tonga_earthquake|,|0| |Stibarsen|,|0| -|1995_in_the_decathlon|,|0| |Nothoceros|,|0| |Tersus|,|1| -|Local_Government_Category_List|,|0| |Debt-to-capital_ratio|,|0| -|Sweet_City_Woman|,|0| |Vala_(programming_language)|,|1| |Radio_Bearer_in_UMTS|,|0| |Debutante|,|0| -|Leave_Virginia_Alone|,|0| |Jordan–Wigner_transformation|,|0| -|Masonic_Temple|,|0| |List_of_types_of_XML_schemas|,|0| -|Yonaguni_Monument|,|0| -|Garbhodaksayi_Vishnu|,|0| -|Coin_capsule|,|0| |Modified_Richardson_iteration|,|0| -|Rachel_Elior|,|0| |Earth_Group|,|0| |International_rankings_of_Honduras|,|0| |Pikey|,|0| -|Real_Love_(Jody_Watley_song)|,|0| |Soils_of_Fate|,|0| -|The_Fury_of_Dracula|,|0| |Apache_HBase|,|0| |Bitard|,|0| |Monasticism|,|0| |Hostile_architecture|,|0| -|Button_(clothing)|,|0| |Govind_Pashu_Vihar_National_Park_and_Sanctuary|,|0| |967_Helionape|,|0| |5855_Yukitsuna|,|0| |Commuter_Cars_Tango|,|0| -|Babe_(Take_That_song)|,|0| |Desmond_G._Higgins|,|0| -|Geographers_on_Film|,|0| |Unrestricted_Hartree–Fock|,|0| |Mobile_computer-supported_collaborative_learning|,|0| |SpiNNaker|,|0| @@ -830,12 +494,8 @@ |Leapfrogging|,|0| |Vladimir_Damgov|,|0| |Arctic_sea_ice_ecology_and_history|,|0| -|Marxist_criminology|,|0| -|Felicity_Huntingford|,|0| -|Jedediah_Smith_Wilderness|,|0| |Raja|,|0| |World_Rainforest_Movement|,|0| -|Mourning_portraits|,|0| |Gerlachovský_štít|,|0| |SofCheck_Inspector|,|0| |Jindal_Steel_and_Power|,|0| @@ -844,214 +504,124 @@ |Aakash_(tablet)|,|0| |NGC_89|,|0| |Cellular_algebra|,|0| -|Japanese_Society_for_Bioinformatics|,|0| -|List_of_network_theory_topics|,|0| |Ford_Taurus_X|,|0| |List_of_airports_by_IATA_code:_U|,|0| |Chevalley_restriction_theorem|,|0| |Sitelink|,|0| -|List_of_number-one_streaming_tracks_of_2016_(Australia)|,|0| -|Timeline_of_the_gunpowder_age_in_Korea|,|0| |Left_corner|,|0| -|Lists_of_UK_Compilation_Chart_number_ones|,|0| |Sand_table|,|0| -|United_States_of_China|,|0| -|2014_Shanghai_stampede|,|0| |The_Breeders|,|0| -|Operation_Aerial|,|0| -|Carmichael_function|,|0| |Wiki_Conference_India|,|0| -|Dominion_theology|,|0| -|Mi_PC|,|0| |The_Bad_Breed|,|0| -|Laffy_Taffy_(song)|,|0| |Grizzly_Creek_Redwoods_State_Park|,|0| -|Thomas_Sheridan_(actor)|,|0| |False_radiosity|,|0| -|Social_comparison_theory|,|0| |Cavalieri's_quadrature_formula|,|0| |NGC_3603-B|,|0| -|Dark_Side_of_the_Rainbow|,|0| |Anton_(computer)|,|0| |Imaging_cycler_microscopy|,|0| -|Principle_of_distributivity|,|0| -|Bosques_Templados_Lluviosos_de_los_Andes_Australes|,|0| -|Panacea_(medicine)|,|0| -|Undercover_Angel_(song)|,|0| |JBoss_Enterprise_Application_Platform|,|0| |Pneumonoultramicroscopicsilicovolcanoconiosis|,|0| -|Mitsubishi_Xpander|,|0| -|Koboltstaler_Köpfe|,|0| |NGC_1984|,|0| |Rynersonite|,|0| |ILOG|,|0| |Indianapolis_500_traditions|,|0| -|Country_Boy_(Alan_Jackson_song)|,|0| |American_Truck_Simulator|,|0| 
|Rank_correlation|,|0| -|Morley_rank|,|0| |Ford_Bronco_II|,|0| |Existential_risk_from_artificial_general_intelligence|,|0| |Trojitá|,|0| |INGENIAS|,|0| -|Preimage_theorem|,|0| -|Toledan_Tables|,|0| -|Maurice_Frydman|,|0| |Colocation_centre|,|0| |MOA-2007-BLG-400L|,|0| |Martyrs_of_Japan|,|0| |SAARC_Secretary_General|,|0| -|Utm_theorem|,|0| |Uniqueness_type|,|0| -|Atheism_in_Christianity|,|0| |Da_Youngsta's|,|0| -|The_Global_Media_Monitoring_Project|,|0| -|Curse|,|0| |Steal_the_Bacon|,|0| |Pavo_in_Chinese_astronomy|,|0| |Alpina_B10_Bi-Turbo|,|0| -|Luck_egalitarianism|,|0| -|Positivistic_animism|,|0| -|Job_fraud|,|0| |Yo-yo_problem|,|0| -|Organ_theft|,|0| |Sugar_bush|,|0| -|JEDEC_memory_standards|,|0| |Minimalism_(computing)|,|0| |Mazda3|,|0| |Linux_Lite|,|0| |Chozen|,|0| |PR2_(classification)|,|0| |Hikari_no_Signal|,|0| -|False_cognate|,|0| -|Circumnavigation|,|0| |Byggnadsarbetaren|,|0| |Retention_and_protention|,|0| |Crystal_Fighters|,|0| -|Riemann_series_theorem|,|0| -|Stand_by_You_(Rachel_Platten_song)|,|0| |Outline_of_Afghanistan|,|0| -|International_Development_Markup_Language|,|1| -|Dog_Tooth_Peak|,|0| |Enchanted_loom|,|0| |Gizmo5|,|0| -|Zone_(vestment)|,|0| -|Kolmogorov's_normability_criterion|,|0| |Cochran–Mantel–Haenszel_statistics|,|0| -|Cradle_of_civilization|,|0| -|Flip-flop_(politics)|,|0| |Clip_(compiler)|,|0| -|SISD|,|0| |Grey_alien|,|0| |Apache_Wave|,|0| -|John_Vianney|,|0| -|Stay_Real|,|0| -|Debreu_theorems|,|0| |Puffer_train|,|0| |Flail_space_model|,|0| -|Bodies_(Robbie_Williams_song)|,|0| |Maria_Cristina_Facchini|,|0| |The_Havana_Pitbulls|,|0| |Tree_rearrangement|,|0| -|Kleiner_Winterberg_(Harz)|,|0| |List_of_development_aid_country_donors|,|0| |Sculptor_in_Chinese_astronomy|,|0| |Psychochemical_warfare|,|0| -|Age_of_the_Gods|,|0| |Ministry_of_Communications_and_Information_Technology_(Egypt)|,|0| |Leo_(astrology)|,|0| -|I_Drove_All_Night|,|0| -|Polygenism|,|0| |Holmfirth_floods|,|0| -|David_W._Hamlyn|,|0| -|Massachusetts_(Ylvis_song)|,|0| |Yanaki_and_Milton_Manaki|,|0| -|Crisis_of_Marxism|,|0| -|Twelve_Women|,|0| |PythonAnywhere|,|0| |Real_tree|,|0| |Common_Lisp|,|1| |Nutrient_management|,|0| -|Stannite|,|0| |Kindred_the_Family_Soul|,|0| -|Seven_Nation_Army|,|0| |Income_deprivation_affecting_children_index_(UK)|,|0| |Masyanya|,|0| -|Spider-Woman_(Mattie_Franklin)|,|0| |Nature_(essay)|,|0| |Reverse_Monte_Carlo|,|0| |B-Step_Sequencer|,|0| |Cyathea_recommutata|,|0| -|Classification_of_demons|,|0| |Marcel_the_Shell_with_Shoes_On|,|0| -|Schiavinatoite|,|0| |Yahoo_Beijing_Global_R&D_Center|,|0| |David_States|,|0| |Context_MBA|,|0| -|Ahmadiyya_and_other_faiths|,|0| |Pseudomonas_genome_database|,|0| |List_of_the_largest_islands_in_the_North_Sea|,|0| -|Cyclotomic_identity|,|0| |Blowin'_(song)|,|0| |Another_War|,|0| |ISO_3166-2:WF|,|0| |Commandaria|,|0| |Ballistics_(video_game)|,|0| -|Don't_Throw_Your_Love_Away|,|0| -|List_of_Fab_40_number-one_singles|,|0| |Everybody_Draw_Mohammed_Day|,|0| |Astove_Island|,|0| |O3Spaces|,|0| |Hole_(band)|,|0| -|Ditloid|,|0| |Samsung_Galaxy_Note_3|,|0| -|Eternal_return|,|0| -|GNOME_Activity_Journal|,|0| |Gog_and_Magog|,|0| -|Explorer_II|,|0| -|Wonderful_(Annie_Lennox_song)|,|0| |Textpattern|,|0| |The_Darkness_II|,|0| -|International_Vedanta_Society|,|0| |League_of_the_Southwest|,|0| |Unity_of_Command_(video_game)|,|0| -|Escambray_rebellion|,|0| |Mnemosyne_(software)|,|0| |XMTC|,|1| |Nubbin_(landform)|,|0| |Terms_for_Syriac_Christians|,|0| |Samsung_Galaxy_Tab_3_7.0|,|0| -|Systematic_review|,|0| |DECtalk|,|0| |Arghul|,|0| |Audio_file_format|,|0| 
|Maberry_and_Walker|,|0| |Gerrit_(software)|,|0| -|Sumerian_religion|,|0| -|Legendary_creature|,|0| -|Dayworld_Rebel|,|0| -|Mysteries_of_the_Unknown|,|0| |International_rankings_of_Israel|,|0| -|List_of_World_Number_One_male_golfers|,|0| -|Annual_Review_of_Information_Science_and_Technology|,|0| -|Bigu_(grain_avoidance)|,|0| -|Autonomous_category|,|0| -|WASP-7b|,|0| |BacMap|,|0| |Crow_Black_Chicken|,|0| |Dope_(band)|,|0| -|1938–39_German_expedition_to_Tibet|,|0| -|Dolceola|,|0| |Drunk_walking|,|0| |Beta_Crucis|,|0| -|Latitudinarianism_(philosophy)|,|0| -|Go_Heritage_Run|,|0| |Short_range_order|,|0| |PriceRunner|,|0| -|Bounded_inverse_theorem|,|0| |Enfer|,|0| |Tmesipteris_truncata|,|0| -|Jason_and_Medea|,|0| |String_Theory_(artist_collective)|,|0| |Fact,_Fiction,_and_Forecast|,|0| |Paesia|,|0| @@ -1061,38 +631,23 @@ |Program_analysis|,|0| |Pannaiyar_Caste|,|0| |Indo-European_languages|,|0| -|La_Hojilla|,|0| |Epsilon_Volantis|,|0| |41_Lyncis|,|0| |Libra_(astrology)|,|0| -|Westgate_shopping_mall_attack|,|0| |Noppera-bō|,|0| |Animoog|,|0| |Tsubame_(supercomputer)|,|0| |WSPR_(amateur_radio_software)|,|0| -|Luminus_(comics)|,|0| |Priority_Sector_Lending_Certificates|,|0| -|Grip_(software)|,|0| |Inner_Sanctum_(band)|,|0| -|Yottabit|,|0| |Superspace|,|0| -|List_of_oldest_National_Hockey_League_players|,|0| |Invent_This!|,|0| -|Brain_Quest|,|0| -|Chris_Sherwin|,|0| |Tehilla_Lichtenstein|,|0| -|The_Theory_of_Moral_Sentiments|,|0| |Multiplan|,|0| -|SIOD|,|1| -|List_of_1999_box_office_number-one_films_in_Australia|,|0| |DENIS-P_J020529.0-115925|,|0| -|Creationist_museum|,|0| |Sea_ice_microbial_communities|,|0| |Volvo_XC40|,|0| -|Portrait_of_Paquius_Proculo|,|0| -|Sufi_Order_Ināyati|,|0| |Body_orifice|,|0| -|Main_Stem|,|0| |Cha_110913-773444|,|0| |Case–Shiller_index|,|0| |Positive_systems|,|0| @@ -1105,12 +660,8 @@ |Galileo_affair|,|0| |Jaron_Lanier|,|0| |Raita_algorithm|,|0| -|The_New_York_Times_Fiction_Best_Sellers_of_1959|,|0| -|List_of_extreme_points_of_Mongolia|,|0| |Vanessa_Fox|,|0| |List_of_most_viewed_online_trailers_in_the_first_24_hours|,|0| -|The_Truth_about_Nanjing|,|0| -|I've_Been_Everywhere|,|0| |Children_of_Bodom|,|0| |Three-taxon_analysis|,|0| |Stratton_Brothers_case|,|0| @@ -1119,114 +670,67 @@ |OASIS_SOA_Reference_Model|,|0| |Baumslag–Solitar_group|,|0| |Pandora_(musical_group)|,|0| -|Toxic_cough_syrup|,|0| -|Wooden_language|,|0| |Mikhail_Bystrov|,|0| -|William_Joyce|,|0| -|Lendava_Synagogue|,|0| |P_system|,|0| |New_Lab|,|0| |Xiaowei|,|0| |Celebrate_the_Nun|,|0| -|Biproduct|,|0| |Pre-Code_sex_films|,|0| |Neuron_(software)|,|0| |Acer_beTouch_E100/E101|,|0| |Bertram_Raphael|,|0| |Systematics|,|0| -|Hounfour|,|0| |Tata_Venture|,|0| -|Verbal_reasoning|,|0| -|Mama_Tried_(song)|,|0| |Sailing_at_the_1976_Summer_Olympics_–_Soling|,|0| -|Orisha|,|0| |1741_Giclas|,|0| -|Suzanne_Briet|,|0| -|Mean_absolute_error|,|0| |Hot_and_cold_cognition|,|0| |Breeder_(cellular_automaton)|,|0| |DesktopTwo|,|0| |Service_pack|,|0| |Process|,|0| -|Detroit_Cyclecar|,|0| |Danelectro_Amp-in-case|,|0| |Hosmer–Lemeshow_test|,|0| |Electronic_bagpipes|,|0| |Pseudorandom_permutation|,|0| |CP_Lacertae|,|0| -|Give_It_to_Me_(Timbaland_song)|,|0| |McCallum_and_Tarry|,|0| -|Sheikh_Sharaf_ad-Din_ibn_al-Hasan|,|0| |Jewel_Pod|,|0| -|YouTube_Next_Lab_and_Audience_Development_Group|,|0| |Forestry_literature|,|0| |Canada_Geographic_Information_System|,|0| |Latin_square|,|0| -|Gaudapada|,|0| -|Have_a_Little_Faith_(song)|,|0| -|Tingles|,|0| -|Digital_interview|,|0| |HAT-P-1b|,|0| -|Cassé|,|0| -|Hold_It_Don't_Drop_It|,|0| 
|PowerDEVS|,|0| |Semi-simplicity|,|0| |GXL|,|1| -|Byron_Reed_Collection|,|0| |NDepend|,|0| |Bass_Brothers|,|0| |Eleven_Past_One|,|0| |Gymnocarpium|,|0| -|Chart_Rulership|,|0| |Continental_shelf|,|0| -|Greatest_Painting_in_Britain_Vote|,|0| |1333_Cevenola|,|0| |Schisandra|,|0| |Manchester_Mark_1|,|0| |Yellowstone_fires_of_1988|,|0| |Lacrosse|,|0| -|Precision_questioning|,|0| |Oh_Darling|,|0| |The_Hidden_Wiki|,|0| -|Yaakov_Israel_Ifargan|,|0| -|Meredith_graph|,|0| -|Morant_Point|,|0| |Philosophy_of_information|,|0| -|Kirby_J._Hensley|,|0| |Outboard_gear|,|0| |Modern_C++_Design|,|0| -|Living_tree_doctrine|,|0| -|Imitation_of_God|,|0| |Necrophagia|,|0| |Affix_grammar_over_a_finite_lattice|,|0| -|Draft_(hull)|,|0| |Trash_Talk_(band)|,|0| -|USRC_South_Carolina|,|0| -|Institute_for_the_Secularisation_of_Islamic_Society|,|0| -|Sunda_Wiwitan|,|0| -|Kohlberg_(Fichtel_Mountains)|,|0| -|Three-volley_salute|,|0| -|Makhdoom_Shaban_ul_Millat_Ali_Murtaza|,|0| |3322_Lidiya|,|0| -|Yanks_for_Stalin|,|0| |Kudit|,|0| |Tokwe_Mukorsi_Dam|,|0| |List_of_busiest_ports_by_cargo_tonnage|,|0| -|Plankowner|,|0| -|Media_coverage_of_North_Korea|,|0| |Białowieża_Forest|,|0| -|Dravidar_Kazhagam|,|0| |Harwell-Boeing_file_format|,|1| |Chalo_Chatu|,|0| -|Chromostereopsis|,|0| -|Rubric_(academic)|,|0| |Hymenaea_protera|,|0| -|Confident_(Demi_Lovato_song)|,|0| |Time_crystal|,|0| |TEA_(text_editor)|,|0| -|Upasni_Maharaj|,|0| |Nigerian_Capital_Development_Fund_(NCDF)|,|0| -|Port_(circuit_theory)|,|0| |Environmental_resource_management|,|0| |The_Major_Transitions_in_Evolution|,|0| |La'cryma_Christi|,|0| @@ -1238,241 +742,140 @@ |Wirecast|,|0| |Russian_boxing|,|0| |Tokyo_Dawn_Records|,|0| -|Amritbani_Guru_Ravidass_Ji|,|0| |Raft_(computer_science)|,|0| -|Park_Young-seok|,|0| |Sony_Xperia_X|,|0| |9912_Donizetti|,|0| -|Metaplasm|,|0| -|Transient_receptor_potential_channel-interacting_protein_database|,|0| |El_Chavo_Animado|,|0| -|William_Torrey_Harris|,|0| |Indo-European_ablaut|,|0| |Zeta2_Muscae|,|0| |Chemical_Markup_Language|,|1| -|Records_of_heads_of_state|,|0| |List_of_largest_shopping_malls|,|0| |Laptop|,|0| -|IC_4406|,|0| |Bangarang|,|0| |List_of_best-selling_PC_games|,|0| |Fear_of_commitment|,|0| |Ddrescue|,|0| -|Hopf_maximum_principle|,|0| |Community_of_position|,|0| -|Wendy_Christensen|,|0| |Digital_Fish_Library|,|0| |Cognitive_Research_Trust|,|0| -|Jaroslav_Nešetřil|,|0| |Franklin's_Forest|,|0| |Higher-Order_Perl|,|0| |Kolhi|,|0| |Flue-gas_condensation|,|0| |Lexicographic_code|,|0| -|Poundage|,|0| -|Intuitionistic_logic|,|0| -|Four_Worlds|,|0| |Bolesław_Matuszewski|,|0| -|Takwin|,|0| |Blocksworld|,|0| -|Dallas_Mavericks_all-time_roster_and_statistics_leaders|,|0| |Fort_Lean|,|0| |Iron_Monkey_(band)|,|0| |LG_Optimus_G_Pro|,|0| |Robinson–Schensted–Knuth_correspondence|,|0| -|Country_Is|,|0| -|Modal_fallacy|,|0| -|Toy_problem|,|0| |Church_of_the_Holy_Sepulchre|,|0| |Atomistix_ToolKit|,|0| |Trust_region|,|0| -|Matteuccia_de_Francesco|,|0| |List_of_seas|,|0| |Cyber_spying|,|0| |Specification_and_Description_Language|,|1| -|2011_United_States_listeriosis_outbreak|,|0| -|Spencer_cohomology|,|0| |Jarrakan_languages|,|0| |Para-cycling_classification|,|0| |Yehowists|,|0| |List_of_MeSH_codes_(C01)|,|0| |Buick_Rendezvous|,|0| |Chalk_heath|,|0| -|Ancestor_veneration_in_China|,|0| |Distrust|,|0| |Moufang_set|,|0| -|List_of_number-one_hits_of_1970_(Italy)|,|0| -|Productivity_(ecology)|,|0| -|HD_162020|,|0| -|Babbs_Switch_fire|,|0| -|Blue_Mountain_Peak|,|0| -|Online_survey_platforms|,|0| -|Scuba_diving|,|0| -|Kronecker's_lemma|,|0| 
|DataCite|,|0| |Honor_Harrington|,|0| |Henryk_Witek|,|0| |Analog_recording|,|0| -|Mamoru_Samuragochi|,|0| |Global_Reporting_Initiative|,|0| |NGC_193|,|0| |NGC_4919|,|0| -|Norwegian_Polar_Institute|,|0| |Outline_of_the_United_States_Virgin_Islands|,|0| |Chrysler_Airflow|,|0| |Mechanical_television|,|0| -|Electromagnetism_uniqueness_theorem|,|0| |Epsilon_Aurigae|,|0| -|Spanish_Renaissance_architecture|,|0| -|Lists_of_number-one_albums|,|0| -|The_Moscow_rules|,|0| |Ar_(Unix)|,|0| |PHP-Nuke|,|0| |Sharp-P-complete|,|0| |SeaMonkey|,|0| -|Sweet_Caroline|,|0| |Skylanders:_Spyro's_Adventure|,|0| -|Shift_operator|,|0| |Specular_holography|,|0| |OpenBLAS|,|0| |Carcass_(band)|,|0| -|Agha_Waqar's_water-fuelled_car|,|0| |Country_code_top-level_domain|,|0| -|Tutte–Berge_formula|,|0| -|When_It's_Love|,|0| |Caelum|,|0| -|Herculaneum|,|0| -|Sortes_Vergilianae|,|0| -|In_Mourning_(band)|,|0| -|Arithmetic_progression|,|0| |N_Carinae|,|0| |Ovius_and_Novius_Calavius|,|0| -|Boiling_liquid_expanding_vapor_explosion|,|0| |Líbido_(band)|,|0| -|Cardus_Education_Survey_Canada|,|0| -|Justice_Network|,|0| |Etoy|,|0| |PARSEC|,|0| |80_Cancri|,|0| -|Envy|,|0| -|La_Vieille_Taupe|,|0| |Aquarius/Let_the_Sunshine_In|,|0| |Schubert_variety|,|0| |Stephen_H._Davis|,|0| |OpenScientist|,|0| -|Impact_factor|,|0| |NGC_7315|,|0| |Werauhia_insignis|,|0| |Petty_nobility|,|0| -|Kurdification|,|0| -|Novensiles|,|0| -|Neo-Sovietism|,|0| -|The_James_Deans|,|0| |Spiritualism|,|0| |Anna_Krylov|,|0| |Pattern_language_(formal_languages)|,|0| |Meetic|,|0| |Prune_and_search|,|0| -|Good_Grief_(song)|,|0| |Messier_50|,|0| -|IFRS_9|,|0| |XO-5b|,|0| -|Redneck_Girl|,|0| |Dlisted|,|0| |Precompiled_header|,|0| -|Shikantaza|,|0| |BitLord|,|0| |Hermes_Project|,|0| |Daisyworld|,|0| |The_Killers|,|0| -|Lleyton_Hewitt|,|0| |List_of_longest_films|,|0| |Cooperative_MIMO|,|0| -|Moorish_architecture|,|0| -|With_One_Exception|,|0| |Taurus_(constellation)|,|0| -|Latter_Rain_(post–World_War_II_movement)|,|0| |Persiankiwi|,|0| |HD_89744_b|,|0| -|Memorization|,|0| |Endless_Game|,|0| -|List_of_localities_in_Northern_Ireland_by_population|,|0| |One-dimensional_symmetry_group|,|0| |Novorossiya_TV|,|0| |American_Telemedicine_Association|,|0| |Rivethead|,|0| -|Arctic_Institute_of_North_America|,|0| |Bad_Radio|,|0| |Kafr_Abdu|,|0| -|Janiszewski's_theorem|,|0| |NIDDK_Office_of_Technology_Transfer_and_Development|,|0| |Glove_problem|,|0| -|Command_and_Control_Research_Program|,|0| -|Philastrius|,|0| |Joel_Lee_Brenner|,|0| -|Kolmogorov's_zero–one_law|,|0| -|Blechnum_maximum|,|0| |Brouwer–Heyting–Kolmogorov_interpretation|,|0| |Student_information_system|,|0| -|Assahifa_Al_Ousbouia|,|0| |Trust_(electronics_company)|,|0| -|Cavalieri's_principle|,|0| |FreeCreditScore.com|,|0| -|Institut_Jeanne_Gatineau|,|0| -|Center_for_Investigative_Reporting|,|0| |Rise_to_Ruins|,|0| |Mind_games|,|0| |List_of_fields_of_doctoral_studies_in_the_United_States|,|0| |Complexity_class|,|0| -|Ain't_Nobody_Better|,|0| |Fear_Zero|,|0| -|Fuzzy_electronics|,|0| -|Men's_60_metres_world_record_progression|,|0| -|Bishop–Gromov_inequality|,|0| |Global_village|,|0| -|Political_socialization|,|0| |Lexical_grammar|,|0| |Official_World_Golf_Ranking|,|0| |Sieve_C++_Parallel_Programming_System|,|0| |China_Merchants_Shekou_Industrial_Zone_Holdings|,|0| -|List_of_mountains_of_the_Alps_(2000–2499_m)|,|0| |List_of_longest_placenames_in_Ireland|,|0| |Close_(system_call)|,|0| |Czech_Republic|,|0| -|Keep_Indy_Indie|,|0| -|Memorial_reconstruction|,|0| -|List_of_world_records_in_finswimming|,|0| 
-|List_of_highest-grossing_Filipino_films_in_2011|,|0| -|Gymnocarpium_dryopteris|,|0| |Electromigrated_nanogaps|,|0| |Erewhon_Revisited|,|0| |Sacred_Seven|,|0| |Herschel_Island_(Chile)|,|0| -|El_pueblo_unido_jamás_será_vencido|,|0| |Melilla|,|0| |JAligner|,|0| -|I'm_Gay_(6_AM_song)|,|0| -|Trilok_(Jainism)|,|0| -|Sign_of_the_Times_(Harry_Styles_song)|,|0| |62_Andromedae|,|0| -|Irreligion_in_Belgium|,|0| -|Perfect_Day_(Lou_Reed_song)|,|0| |Two-way_finite_automaton|,|0| -|NGC_1808|,|0| |Yahoo!_Search_Marketing|,|0| |New_Jerusalem|,|0| -|List_of_instrumental_number_ones_on_the_UK_Singles_Chart|,|0| -|De_Moivre–Laplace_theorem|,|0| -|Neural_adaptation|,|0| |Patois|,|0| |Superorganism|,|0| -|Gin_Chow|,|0| |Simultaneous_algebraic_reconstruction_technique|,|0| |Forest_History_Center|,|0| -|Function_space|,|0| -|Walter_Rudin|,|0| -|Adams_Filmi|,|0| -|Primula_beesiana|,|0| |Acer_stewarti|,|0| |List_of_undecidable_problems|,|0| |Cultural_diversity|,|0| @@ -1484,45 +887,23 @@ |QM/MM|,|0| |Glycomics|,|0| |X11_color_names|,|0| -|Lyle_H._Lanier|,|0| |Computational_electromagnetics|,|0| -|NGC_2613|,|0| |ISO_3166-2:MH|,|0| |Dolphin_(file_manager)|,|0| -|Dependency_graph|,|0| -|Hotel_Terme_Millepini|,|0| -|Rhythm_Club_fire|,|0| -|NGC_149|,|0| |Odious_debt|,|0| |Racial_wage_gap_in_the_United_States|,|0| |Hanna-Barbera|,|0| |Kepler-22|,|0| -|Private_Eyes_(song)|,|0| |A+_(programming_language)|,|1| -|Applied_academics|,|0| |Art_of_Illusion|,|0| -|Ti_Amo_(Gina_G_song)|,|0| -|Khwajagan|,|0| |Seven_Sleepers|,|0| -|Jamais_vu|,|0| |Imaging_radar|,|0| -|Snowball_Earth|,|0| -|1979_(song)|,|0| -|Ice_tank|,|0| |The_Mighty_Don't_Kneel|,|0| -|Anson_Archipelago|,|0| |Nitish_V._Thakor|,|0| -|Erke|,|0| -|Inferential_theory_of_learning|,|0| |EbXML|,|0| -|Ajapa|,|0| |Slope-former|,|0| |MIDI_Machine_Control|,|1| |Memory-mapped_file|,|0| -|Breakdown_(Tantric_song)|,|0| -|Paulus_of_Verdun|,|0| -|Volterra_operator|,|0| -|Spark_Social|,|0| |Roland_and_Almita_Vamos|,|0| |Phi_Orionis|,|0| |Omega1_Scorpii|,|0| @@ -1530,36 +911,24 @@ |Jeff_Jarrett|,|0| |Lord_Murugan_Statue|,|0| |Microbotics|,|0| -|Ain't_Nuthin'_in_the_World|,|0| |BASICODE|,|0| |HD_52265_b|,|0| |Acropora|,|0| -|Rescue_archaeology|,|0| |Project_Starfighter|,|0| |NGC_247|,|0| -|Our_Lady_of_the_Rockies|,|0| |CMD_file_(CP/M)|,|0| -|Peruvian_records_in_Olympic_weightlifting|,|0| |Bootlegging_(business)|,|0| |Jeehiun_Lee|,|0| -|Ascendency|,|0| |Restricted_root_system|,|0| |Welch's_method|,|0| -|Oleandra|,|0| -|UMAP_Credit_Transfer_Scheme|,|0| |Google_Book_Search_Settlement_Agreement|,|0| -|Top-down_parsing_language|,|0| |Breadwinners_(TV_series)|,|0| -|Sledgehammer_(Peter_Gabriel_song)|,|0| |Social_class|,|0| |United_Kingdom_Standard_Industrial_Classification_of_Economic_Activities|,|0| |LL_parser|,|0| -|Woke_Up_in_Love|,|0| -|Charles_Eliot_Norton_Lectures|,|0| |Has-a|,|0| |Inverse_iteration|,|0| |Computational_linguistics|,|0| -|Handkerchief_skirt|,|0| |Turnstile_(symbol)|,|0| |Hymenophyllum_howense|,|0| |Geometric_topology|,|0| @@ -1571,7 +940,6 @@ |Warrior_monk|,|0| |Observer_pattern|,|0| |Turán's_brick_factory_problem|,|0| -|Hartley_function|,|0| |JPedal|,|0| |Apache_Ambari|,|0| |Machine_orders|,|0| @@ -1579,53 +947,29 @@ |Redundant_array_of_independent_memory|,|0| |Phylogenetic_inertia|,|0| |Oh-OK|,|0| -|A_Good_Heart|,|0| |Unit_disk_graph|,|0| -|Experiential_knowledge|,|0| -|Ibis_redibis_nunquam_per_bella_peribis|,|0| |Entry_point|,|0| -|Promise|,|0| -|Schafkopf_(Haardt)|,|0| -|I_Do_(Lisa_Loeb_song)|,|0| |Dicksonia_fibrosa|,|0| -|Hyper-Wiener_index|,|0| -|BEEPS|,|0| 
|List_of_amphibians_of_Yellowstone_National_Park|,|0| |Batales|,|0| -|Justice_delayed_is_justice_denied|,|0| |Eta_Capricorni|,|0| |Mutopia_Project|,|0| |Turbine_inlet_air_cooling|,|0| -|Angle_bisector_theorem|,|0| |Gtk2-Perl|,|0| -|Back_to_the_80s_(song)|,|0| -|Effects_of_Hurricane_Katrina_in_New_Orleans|,|0| |Apple_Maps|,|0| -|Be_the_One_(Dua_Lipa_song)|,|0| -|World_record_progression_women's_weightlifting|,|0| |384_(number)|,|0| |Hall's_identity|,|0| |Givens_rotation|,|0| |Merkos_L'Inyonei_Chinuch|,|0| -|Lone_wolf_(trait)|,|0| |Daniel_S._Loeb|,|0| |Multipath_I/O|,|0| -|The_rich_get_richer_and_the_poor_get_poorer|,|0| -|Lambeth_Homilies|,|0| |Muslim_Shaikh|,|0| |Folded_Reed–Solomon_code|,|0| |Caribou-Targhee_National_Forest|,|0| -|Akademio_de_Esperanto|,|0| -|Cathedral_Peak_(Wyoming)|,|0| |Magnificent_Seven_(gymnastics)|,|0| -|Teach_the_Controversy|,|0| |NGC_4570|,|0| |CamStudio|,|0| -|Cleveite|,|0| |Zoka_Forest|,|0| -|Byzantine_Iconoclasm|,|0| -|14_Herculis_c|,|0| -|Airyanem_Vaejah|,|0| |North_West_England|,|0| |Hindmarsh–Rose_model|,|0| |Eamon_Dunne|,|0| @@ -1638,90 +982,57 @@ |Rapture_Ready|,|0| |Historical_GDP_of_China|,|0| |73_(number)|,|0| -|Hengduan_Mountains_subalpine_conifer_forests|,|0| |Millvina_Dean|,|0| |Constraint_satisfaction|,|0| |Pascal_MicroEngine|,|0| -|I_Don't_Wanna_Stop|,|0| |South_Star|,|0| -|Artstein's_theorem|,|0| |Terrorist_Finance_Tracking_Program|,|0| -|List_of_historical_tsunamis|,|0| |Panther_De_Ville|,|0| -|Hermetica|,|0| |Graph_edit_distance|,|0| -|Green_Car_Vision_Award|,|0| |Hi_and_Lois|,|0| |HD_122563|,|0| |Estampage|,|0| -|Terrain_theory|,|0| |OTC_Markets_Group|,|0| -|Helen_Jacobs|,|0| |Brauer–Suzuki–Wall_theorem|,|0| |IFolder|,|0| |L._T._F._Gamut|,|0| |Iterative_proportional_fitting|,|0| |List_of_years_in_film|,|0| -|Trivial_Pursuit|,|0| -|Vuelve_(song)|,|0| |Sacred_geometry|,|0| -|Faultless_disagreement|,|0| |Arithmetic_combinatorics|,|0| -|Muisca_mythology|,|0| -|End_(graph_theory)|,|0| |Orc_(programming_language)|,|1| |PROGOL|,|0| |Feminism_in_culture|,|0| |Yao_Wenyuan|,|0| -|LHC_Accelerator_Research_Program|,|0| -|Rainmaking|,|0| -|Biblical_Research_Institute|,|0| |Ogier_the_Dane|,|0| |Kaoru_Genji|,|0| |ISO_3166-2:PH|,|0| -|Pre-conception_counseling|,|0| -|Timeline_of_theoretical_physics|,|0| -|Hypothetical_protein|,|0| |List_of_Dark_Shadows_characters|,|0| -|List_of_number-one_singles_of_1995_(Canada)|,|0| -|I_Love_You_Because_(song)|,|0| -|List_of_municipalities_in_South_Africa|,|0| |Figure_rating_scale|,|0| |Partial_template_specialization|,|0| |Saharo-Arabian_Region|,|0| |F.lux|,|0| |Sink_(computing)|,|0| -|Yoga-darsana|,|0| |Gentile|,|0| |Sustainable_national_income|,|0| |King_Arthur's_family|,|0| -|Alvin_Goldfarb|,|0| |Unambiguous_Turing_machine|,|0| -|Refutation_of_All_Heresies|,|0| |PSE_All_Shares_Index|,|0| |Superbone|,|0| |Ternary_commutator|,|0| -|Influence_and_legacy_of_Swami_Vivekananda|,|0| |Attendance|,|0| |WxSQLite3|,|0| |Fynbos|,|0| |Lamport_signature|,|0| |IDEF4|,|1| -|Shorten_(file_format)|,|1| -|Philippine_ceramics|,|0| |National_Bureau_of_Classification_(NBC)|,|0| |TT_Corvi|,|0| -|Ctenitis_squamigera|,|0| -|Déjà_Vu_(Giorgio_Moroder_song)|,|0| -|Higher_Education_for_American_Democracy|,|0| |Mind_Games_(Zard_song)|,|0| |Longitude|,|0| -|Harry_Henry|,|0| |Djibouti|,|0| |System_requirements_(spacecraft_system)|,|0| |Link_distance|,|0| |HD_115211|,|0| -|Providentialism|,|0| |Iyoite|,|0| |BioBIKE|,|0| |Jurisprudence|,|0| @@ -1729,73 +1040,41 @@ |F1_2013_(video_game)|,|0| |Buru|,|0| |L.A._Noire|,|0| -|Baba_Sali|,|0| 
|Dakhinpat_Satra|,|0| |International_rankings_of_Denmark|,|0| |Cypher_(video_game)|,|0| |Gamma_Ursae_Minoris|,|0| -|Self-persuasion|,|0| |Annualized_failure_rate|,|0| -|Extreme_points_of_Moldova|,|0| -|Receiver_general|,|0| -|Headless_Horseman|,|0| -|Drummond_Rennie|,|0| |Rosewood_massacre|,|0| |4CAPS|,|0| |Pricena|,|0| |XML_data_binding|,|0| |Drifting_ice_station|,|0| -|Bill_Cosby_sexual_assault_allegations|,|0| |Earle–Hamilton_fixed-point_theorem|,|0| -|British_Library_Leyden_Medical_Dissertations_Collection|,|0| -|Rhetoric_(Aristotle)|,|0| |Nemesysco|,|0| -|Toulmin_method|,|0| -|Pragmatic_theory_of_information|,|0| |GraphExeter|,|0| |IBM_History_Flow_tool|,|0| -|Iva_Toguri_D'Aquino|,|0| |Kiss_(band)|,|0| |Harrison_Institute|,|0| -|List_of_Canadian_number-one_albums_of_1977|,|0| |International_Yacht_Training_Worldwide|,|0| -|Karaite_Judaism|,|0| -|Heimia_salicifolia|,|0| |Daniel_Striped_Tiger|,|0| |Phuket_City|,|0| -|GADV-protein_world_hypothesis|,|0| -|National_Center_for_Family_Literacy|,|0| |Telephone_numbers_in_Namibia|,|0| -|Aïcha|,|0| |Post–Turing_machine|,|0| -|This_too_shall_pass|,|0| -|The_Snake_Pit|,|0| -|UA:First|,|0| -|Mackey_topology|,|0| |Sierra_Leone|,|0| -|Kotobagari|,|0| -|Telephone_numbers_in_Poland|,|0| |Assam_Lemon|,|0| -|Paolo_Virno|,|0| |Turing_machine_examples|,|0| |Simulated_annealing|,|0| |Amon_Amarth|,|0| -|Effect_of_Hurricane_Katrina_on_the_Louisiana_Superdome|,|0| |Los_Carpinteros|,|0| |Video_film_era|,|0| -|Haidakhan_Babaji|,|0| |NASA_Space_Universe|,|0| -|Psilocybe_weilii|,|0| |Joint_source_and_channel_coding|,|0| |Clinical_quality_management_system|,|0| -|Theodidaktos|,|0| -|Diana_Butler_Bass|,|0| |Nissan_Prairie|,|0| |List_of_stars_in_Gemini|,|0| |Pale_of_Settlement|,|0| |Clara.io|,|0| -|List_of_number-one_hits_of_2013_(Germany)|,|0| -|Thuridur_Olafsdottir|,|0| |Resource_Description_and_Access|,|0| |CiviCRM|,|0| |Template_Haskell|,|0| @@ -1805,57 +1084,32 @@ |Bit-reversal_permutation|,|0| |SME_Server|,|0| |Homonym_(biology)|,|0| -|Akiyoshi_Kitaoka|,|0| |Mycin|,|0| |List_of_stars_in_Leo_Minor|,|0| |Salvia_africana-lutea|,|0| |Katia_Sycara|,|0| |The_Rolling_Girls|,|0| -|Implication_(information_science)|,|0| |Sverigetopplistan|,|0| -|Frequency-doubling_illusion|,|0| |Max_Figman_and_Lolita_Robertson|,|0| |Unary_function|,|0| -|Joiner|,|0| -|List_of_number-one_albums_of_2015_(Poland)|,|0| |Sign_function|,|0| -|Elastic_therapeutic_tape|,|0| -|Let_My_Love_Be_Your_Pillow|,|0| |Omphalos|,|0| |List_of_reptiles_of_Yellowstone_National_Park|,|0| |Taskworld|,|0| -|List_of_Celtic-language_media|,|0| -|Ex's_&_Oh's|,|0| -|Literacy_Center_West|,|0| |Fay's_trisecant_identity|,|0| -|List_of_number-one_singles_of_2014_(Finland)|,|0| |ID3|,|0| |Forgotten_Ten|,|0| |Machinima|,|0| -|This_Christmas_(Donny_Hathaway_song)|,|0| |Detroit–Windsor|,|0| -|Cambrian–Ordovician_extinction_event|,|0| -|Taghribat_Bani_Hilal|,|0| -|Edwin_Bunting_Bartram|,|0| |IBM_STAIRS|,|0| -|Ranong_human-smuggling_incident|,|0| -|Rudolph_Fentz|,|0| -|High_West|,|0| -|Horizont_(radio)|,|0| -|Frobot|,|0| |Cumberland_Plain|,|0| |Private_investigator|,|0| -|I've_Told_Ev'ry_Little_Star|,|0| -|Fires_in_Edo|,|0| |V603_Aquilae|,|0| -|Xar_(graphics)|,|0| |International_rankings_of_Cuba|,|0| |Batman:_The_Telltale_Series|,|0| |TSP_(econometrics_software)|,|0| |Microsoft-specific_exception_handling_mechanisms|,|0| -|Imamah_(Shia)|,|0| |Project_management_information_system|,|0| -|Ovidiu_Pecican|,|0| |Symbolic_chickens|,|0| |Prizma|,|0| |Idiophone|,|0| @@ -1867,15 +1121,10 @@ |Mu1_Cancri|,|0| 
|Richardson_Grove_State_Park|,|0| |Jarral|,|0| -|Grunge_speak|,|0| -|Anisotropy_energy|,|0| |Taste_(band)|,|0| |Haran's_diamond_theorem|,|0| -|List_of_number-one_hits_of_1962_(Italy)|,|0| -|Big_Girls_Don't_Cry_(Fergie_song)|,|0| |Quark_Author|,|0| |Mem_(computing)|,|0| -|Go_For_Soda|,|0| |Cheilanthes|,|0| |ChitChat|,|0| |Mount_Erebus|,|0| @@ -1883,62 +1132,34 @@ |MBC_Max|,|0| |32_Cygni|,|0| |Velificatio|,|0| -|Judith_Dupré|,|0| |SGR_1806-20|,|0| |HD_124639|,|0| -|The_Next_Time_I_Fall|,|0| -|Cape_Baba|,|0| |FlexGen|,|0| |JT_(visualization_format)|,|1| |Kinetic_Monte_Carlo|,|0| -|Sazonov's_theorem|,|0| |UberWriter|,|0| |Matchmaking|,|0| |Spider-Man|,|0| |Mamil|,|0| -|Hindu_studies|,|0| |Endless_(comics)|,|0| |List_of_OECD_countries_by_job_security|,|0| |Kummer's_function|,|0| -|Lead_Books_of_Sacromonte|,|0| -|How_Am_I_Supposed_to_Live_Without_You|,|0| |Ni1000|,|0| |Semantic_holism|,|0| -|Graph_center|,|0| -|Hamburg_temple_model|,|0| -|Psilocybe_subcubensis|,|0| -|Colin_Evans_(medium)|,|0| -|Turn_Me_Out_(Praxis_song)|,|0| |Karlite|,|0| |Zak_McKracken:_Between_Time_and_Space|,|0| -|Emeritus|,|0| -|Lingbao_Tianzun|,|0| -|Second-generation_gender_bias|,|0| -|Kate_and_Grant|,|0| |Flow_Cytometry_Standard|,|0| |Commensurability_(group_theory)|,|0| -|List_of_Major_League_Baseball_progressive_career_home_runs_leaders|,|0| -|Duele_el_Amor|,|0| -|Hold_Tight_(Change_song)|,|0| -|Michael_Shulman_(mathematician)|,|0| -|NGC_2204|,|0| |ISO_3166-2:RU|,|0| -|Julian_Johnson|,|0| -|John_Nichols_Thom|,|0| |Human_action_cycle|,|0| |Heliotropes_(band)|,|0| -|Script.aculo.us|,|0| |NGC_194|,|0| -|Gross_value_added|,|0| |Mehrotra_predictor–corrector_method|,|0| |Pig_Goat_Banana_Cricket|,|0| |Cogs_(video_game)|,|0| |Bit.Trip_Beat|,|0| |Cheat_Engine|,|0| -|Ulavi|,|0| |Ambric|,|0| -|Last_offices|,|0| -|Atziluth|,|0| |Llanos_de_Moxos|,|0| |Unleashed_Software|,|0| |Tillamook_Burn|,|0| @@ -1946,40 +1167,27 @@ |Cerro_El_Pital|,|0| |Kekistan|,|0| |E-OTD|,|0| -|Eight_Immortals_of_Huainan|,|0| |Crash_Course_(YouTube)|,|0| -|Obituary_poetry|,|0| -|Take_On_Me|,|0| |Toyota_MR2|,|0| -|Opinion_Space|,|0| |QNX|,|0| |Baidu_Baike|,|0| |List_of_Scottish_counties_by_highest_point|,|0| -|List_of_2012_box_office_number-one_films_in_Spain|,|0| |NGC_1491|,|0| |Nallamala_Hills|,|0| |IdeaScale|,|0| |Acxiom|,|0| |Leda_(programming_language)|,|1| -|Baton_(law_enforcement)|,|0| -|Antarctic|,|0| |IgHome|,|0| |Starry_Night_(planetarium_software)|,|0| |I_(pronoun)|,|0| |Logico-linguistic_modeling|,|0| |Imputation_(statistics)|,|0| -|Church–Turing–Deutsch_principle|,|0| |Integrated_master_plan|,|0| -|Women's_heptathlon_world_record_progression|,|0| -|Constructive_quantum_field_theory|,|0| |Train_track_map|,|0| |SX000i_-_International_guide_for_the_use_of_the_S-Series_of_Integrated_Logistics_Support_(ILS)_specifications|,|0| |CHELPG|,|0| |Roland_Sound_Canvas|,|0| |Maria_Kisito|,|0| -|Metadata_removal_tool|,|0| -|List_of_mountains_of_Queen_Maud_Land|,|0| -|Recept|,|0| |N_=_4_supersymmetric_Yang–Mills_theory|,|0| |George_Smoot|,|0| |Newborn_(band)|,|0| @@ -1990,10 +1198,318 @@ |Asiatic_Vespers|,|0| |Monophyly|,|0| |Drinkers_Mass|,|0| -|Satyashodhak_Samaj|,|0| |ECC_memory|,|0| -|AppleJack|,|0| -|Cardiff_Giant|,|0| -|Superthug|,|0| |Bropho_v_Western_Australia|,|0| |List_of_unicorn_startup_companies|,|0| +|Oneview|,|1| +|Johann_Christian_von_Hellbach|,|0| +|Public_Committee_(Israel)|,|0| +|ProCare|,|0| +|Optoutprescreen.com|,|0| +|Software_copyright_in_China|,|0| +|An_American_Revolution|,|0| +|Citizens'_Forum_on_Canada's_Future|,|0| +|Tokio_Jokio|,|0| 
+|Leland_Sundries|,|0| +|Choropleth_map|,|0| +|Francisco_Hudson|,|0| +|Delta1_Canis_Minoris|,|0| +|GeekBrief.TV|,|0| +|Tiffany_Mitchell|,|0| +|Political_consciousness|,|0| +|Endorphina|,|0| +|Inkurdish|,|0| +|Eastbank_Esplanade|,|0| +|Stalin_(Scheme_implementation)|,|1| +|York_Band_Instrument_Company|,|0| +|August_Novelization|,|0| +|Tokopah_Falls|,|0| +|Analytical_Sciences_Digital_Library|,|0| +|Frameserver|,|1| +|Yellow_hypergiant|,|0| +|Severstal|,|0| +|Mass_Effect:_Andromeda|,|0| +|Arbitration_Act_1996|,|0| +|Abelian_variety|,|0| +|Ports_of_the_Baltic_Sea|,|0| +|Sports_broadcasting_contracts_in_Thailand|,|0| +|Juan_María_Fernández_y_Krohn|,|0| +|Certified_copy|,|0| +|Kyocera|,|0| +|Sassui_Punnhun|,|0| +|Yume_Penguin_Monogatari|,|0| +|Ortelius_oval_projection|,|0| +|SS_Sapona|,|0| +|Creatorverse|,|0| +|Language_policies_of_Francoist_Spain|,|0| +|Wind-wave_dissipation|,|0| +|James_Hamblin_(journalist)|,|0| +|Boomj.com|,|0| +|741_Botolphia|,|0| +|Palestine_Papers|,|0| +|Data_library|,|0| +|Software_product_management|,|0| +|ProRec|,|0| +|Pig_Rush|,|0| +|Kappa1_Ceti|,|0| +|Tom_Clancy's_Splinter_Cell:_Blacklist|,|1| +|Frank_Clement_(racing_driver)|,|0| +|Qualcomm_Snapdragon|,|0| +|Charlie_Brooker's_Weekly_Wipe|,|0| +|Politician|,|0| +|Corniche_Beirut|,|0| +|Nickolas_Mohanna|,|0| +|NLUUG|,|0| +|Broadway_Melody_of_1940|,|0| +|Crazy_Talk|,|0| +|Tengen_(company)|,|0| +|Tales_of_the_World:_Tactics_Union|,|1| +|Joint_Probabilistic_Data_Association_Filter|,|0| +|United_Nations_Security_Council_Resolution_1390|,|0| +|Resource_Access_Control_Facility|,|1| +|Peter_Suchenwirt|,|0| +|Saalivaahanan|,|0| +|Bill_O'Reilly_(political_commentator)|,|0| +|PatrolBot|,|0| +|Elvis_by_Request:_Flaming_Star_and_3_Other_Great_Songs|,|0| +|Spruce-fir_forests|,|0| +|Total_permanent_disability_insurance|,|0| +|Central_Macedonia|,|0| +|IP_exchange|,|0| +|Lila_Tretikov|,|0| +|Josh_Neufeld|,|0| +|Tiny_Masters_of_Today|,|0| +|Psych:_The_Musical|,|0| +|Crack_(password_software)|,|1| +|Cosmos_(plant)|,|0| +|Matt_Cutts|,|0| +|IOS_10|,|1| +|Prosumware|,|0| +|1252_Celestia|,|0| +|GovNet|,|0| +|Postgraduate_Certificate_in_Laws|,|0| +|Cinema_Museum_of_Melgaço|,|0| +|Apensar|,|1| +|Schur_polynomial|,|0| +|The_Next_List|,|0| +|Focused_impedance_measurement|,|0| +|The_Other_Path:_The_Economic_Answer_to_Terrorism|,|0| +|Vehicle_registration_plates_of_Guyana|,|0| +|Rocket_Raccoon|,|0| +|Justice_in_the_World|,|0| +|Fufu|,|0| +|PetMed_Express|,|0| +|Jon-Marc_McDonald|,|0| +|PL360|,|1| +|Taken_3|,|0| +|Comparison_of_online_dating_websites|,|0| +|Margaret_Ng|,|0| +|Tomcat_Alley|,|1| +|Kingway_Brewery|,|0| +|Hollywood_East_Film_Festival|,|0| +|Vladivostok_Air_destinations|,|0| +|Comparison_of_3D_printers|,|0| +|FIRJAN_System|,|0| +|He's_a_Pirate|,|0| +|Jay_Maynard|,|0| +|Universal_Pictures|,|0| +|Tibetan_Sign_Language|,|0| +|Pole_Position_II|,|1| +|Idea_networking|,|0| +|MANCOOSI|,|0| +|Arab_television_drama|,|0| +|Evergreen_(Love_Theme_from_A_Star_Is_Born)|,|0| +|Parenthesis_(rhetoric)|,|0| +|Klonoa:_Empire_of_Dreams|,|1| +|Joseph_Anderson_Panton|,|0| +|Shield_Knight|,|0| +|Frequent_subtree_mining|,|0| +|Manuel_Senante_Martinez|,|0| +|ElseVR|,|0| +|Koingo_Software|,|0| +|Star_Wolves|,|1| +|PCGamesN|,|0| +|TNA_Impact!_(video_game)|,|1| +|From_a_Second_Story_Window|,|0| +|Willkommen_Collective|,|0| +|44th_parallel_north|,|0| +|Rockin'_with_Judy_Jetson|,|0| +|Nat_Ho|,|0| +|Shane_Webcke|,|0| +|SpiralFrog|,|0| +|PureSolo|,|0| +|Restrictions_on_the_import_of_cryptography|,|0| +|F2c|,|1| +|Magnetek|,|0| +|Hybrid_DVD|,|0| +|Psychosynthesis|,|0| 
+|Riot_Runners|,|1| +|LogicLocker|,|1| +|Johnny's_Theme|,|0| +|Flory–Huggins_solution_theory|,|0| +|3255_Tholen|,|0| +|Android_Dev_Phone|,|0| +|Bolocam_Galactic_Plane_Survey|,|0| +|Ignite_Channel|,|0| +|Intruder_in_the_Dust_(film)|,|0| +|Nickel–zinc_battery|,|0| +|Viña_del_Mar_psychopaths|,|0| +|Mr._Bones_(video_game)|,|1| +|Rigged_Hilbert_space|,|0| +|Osama_bin_Laden's_house_in_Khartoum|,|0| +|Drunken_Peasants|,|0| +|SDL_MultiTerm|,|1| +|Om_(band)|,|0| +|Bolt_(website)|,|0| +|Jason_X|,|0| +|While_New_York_Sleeps|,|0| +|Simulated_pregnancy|,|0| +|Subaltern_(postcolonialism)|,|0| +|Blue_Ensign|,|0| +|Safed|,|0| +|Epsilon_Indi|,|0| +|Zheng_Zhengqiu|,|0| +|Jubii|,|1| +|2017_China_bolide|,|0| +|Kaadhal_Mannan|,|0| +|Blancmange_curve|,|0| +|Bezold–Brücke_shift|,|0| +|Chrometa|,|1| +|MyVoucherCodes|,|0| +|Hard_Times_(1975_film)|,|0| +|Military_Tract_of_1812|,|0| +|Web_typography|,|1| +|Best_German_Novels_of_the_Twentieth_Century|,|0| +|INVEST_(mnemonic)|,|0| +|Dark_Rye|,|0| +|Cake_Wrecks|,|1| +|Newslookup|,|1| +|The_Return_of_the_Riddle_Rider|,|0| +|Shockproof|,|0| +|E.V.O.:_Search_for_Eden|,|1| +|Algebraic_matroid|,|0| +|Domain-specific_entertainment_language|,|1| +|Ceratobasidium_cornigerum|,|0| +|Apple_Lossless|,|1| +|Charon_(band)|,|0| +|ART_Holdings|,|0| +|Foam_weapon|,|0| +|Asteroids_Deluxe|,|0| +|Coat_of_arms_of_Curaçao|,|0| +|Turnabout_Glacier|,|0| +|Commonwealth_Association_of_Law_Reform_Agencies|,|0| +|Coupling_(computer_programming)|,|0| +|Andorran_political_reform_referendum,_1978|,|0| +|Crt0|,|1| +|Victoria_University_of_Wellington|,|0| +|Black_Cloud|,|0| +|Flame-Sim|,|1| +|Set_decorator|,|0| +|Ibansk|,|0| +|Trespass_to_the_person|,|0| +|1-2-Switch|,|1| +|Second_City_Television|,|0| +|Sisters_of_Perpetual_Indulgence|,|0| +|Arecont_Vision|,|0| +|PVH_(company)|,|0| +|We_Too_Walked_on_the_Moon|,|0| +|TypeCon2008_Buffalo|,|0| +|Nick_Vernier_Band|,|0| +|OER_Africa|,|0| +|Hotline_Communications|,|1| +|The_Child_of_Destiny|,|0| +|R-tools|,|1| +|Josef_Ludwig_Holub|,|0| +|Kamenický_encoding|,|0| +|Nicolae_Petrescu-Comnen|,|0| +|AMD_Radeon_500_series|,|0| +|Phelios|,|1| +|Farm_House_(film)|,|0| +|Eternal_September|,|0| +|Sébastien_Loeb_Rally_Evo|,|1| +|The_York_Water_Company|,|0| +|Courier_(typeface)|,|1| +|Youth_voice|,|0| +|Alexandre_Gomes|,|0| +|Azilsartan|,|0| +|ChileHardware|,|0| +|Weyl_character_formula|,|0| +|License_compatibility|,|0| +|Jimmy_Choo_Ltd|,|0| +|Unobtrusive_JavaScript|,|0| +|E4_Series_Shinkansen|,|0| +|Microdecisions,_Inc._v._Skinner|,|0| +|Icelandic_Braille|,|0| +|Method_of_continuity|,|0| +|Groundwater_on_Mars|,|0| +|Electroneum|,|0| +|Namco_System_11|,|0| +|Robotic_voice_effects|,|0| +|Sony_Imagesoft|,|0| +|Beraunite|,|0| +|Steve_Jackson_(British_game_designer)|,|0| +|Scad_(fraud)|,|0| +|Stealware|,|1| +|Alice_Remsen|,|0| +|Epsilon_Trianguli|,|0| +|Glazier_Systems|,|0| +|BitTyrant|,|1| +|Breaking_the_Code|,|0| +|Alfred_(software)|,|1| +|Lacuna_model|,|1| +|Takrut|,|0| +|Bipartite_graph|,|0| +|SomeOne|,|0| +|Chemistry_Central|,|0| +|Washington_D.C._Area_Film_Critics_Association_Award_for_Best_Documentary|,|0| +|Vector_addition_system|,|0| +|Junonia_genoveva|,|0| +|Service_assurance|,|0| +|New_England_Lost_Ski_Areas_Project|,|0| +|GIFBuilder|,|1| +|Devils'_Line|,|0| +|Coast|,|0| +|Jones_Angell|,|0| +|The_Darkside_Detective|,|0| +|Die_Hard_(film_series)|,|0| +|SymmetricDS|,|1| +|Asphalt:_Urban_GT_2|,|1| +|Sznajd_model|,|0| +|ClearDB_Documenter|,|0| +|StepStone|,|0| +|Parliament_of_England|,|0| +|Hierarchical_network_model|,|0| +|El_Djazairia_One|,|0| +|Bridge_Global|,|0| 
+|Special_Security_Office|,|0| +|Software_lockout|,|0| +|Roger_Cruz|,|0| +|Malacologica_Bohemoslovaca|,|0| +|While_Shepherds_Watched_Their_Flocks|,|0| +|PropertyShark|,|0| +|Gerard_Bancker|,|0| +|Rasmus_Nellemann|,|0| +|Thief_(soundtrack)|,|0| +|Kenneth_Lerer|,|0| +|Great_Wall_Pan_Asia_Holdings|,|0| +|Partial_autocorrelation_function|,|0| +|Resistance_3|,|1| +|Mafia_film|,|0| +|Ruin_value|,|0| +|OTS_44|,|0| +|Ford_Windstar|,|0| +|20th_Century_Fox_Film_Noir|,|0| +|George_MacDonald_(game_designer)|,|0| +|Kees_Koolen|,|0| +|Castlevania:_Circle_of_the_Moon|,|1| +|SuperKarts|,|1| +|Speech_sound_disorder|,|0| +|DCVG|,|0| +|BigDFT|,|1| +|The_Sniper_(1952_film)|,|0| +|Hall–Janko_graph|,|0| +|Zero_Critical|,|1| +|Antanas_Guoga|,|0| +|GRS_1915+105|,|0| +|NGC_278|,|0| +|Algebraic_torus|,|0| diff --git a/src/check/isstub.py b/src/check/isstub.py index 23e2e36..3baa9d5 100644 --- a/src/check/isstub.py +++ b/src/check/isstub.py @@ -2,6 +2,8 @@ from mine.dbpedia import get_all_templates from mine.dbpedia import to_uri from mine.wiki import wiki_request +from multiprocessing import Pool +from data import flat_list def chunk(l, n): @@ -14,8 +16,9 @@ def chunk(l, n): class IdentifyStubs(ArtdictCheck): def check(self, articledict): - qresult = get_all_templates(root=to_uri("Category:Formal_languages")) - qresult.update(get_all_templates(root=to_uri("Category:Computer_file_formats"))) + print("Checking for stubs...") + qresult = get_all_templates(root=to_uri("Category:Computing_platforms")) + qresult.update(get_all_templates(root=to_uri("Category:Software"))) for a in articledict: articledict[a]["IsStub"] = 0 for title, templates in qresult.items(): @@ -27,41 +30,62 @@ def check(self, articledict): class IdentifyStubsWikiApi(ArtdictCheck): def check(self, articledict): + print("Checking for stubs using wikimedia api...") for a in articledict: articledict[a]["IsStub"] = 0 titles = list(articledict.keys()) print(len(titles)) - titlechunks = chunk(titles, 1) - for titlechunk in titlechunks: - titlesstring = '|'.join(titlechunk) - - params = { - "titles": titlesstring, - "prop": "templates", - "tllimit": 500, - "tlnamespace": 10 - } - response = wiki_request(params) - - if "continue" in response: - print("continue...") - normalized = {} - if "normalized" in response["query"]: - normalized = {normentry["to"]: normentry["from"] for normentry in response["query"]["normalized"]} - - for pageentry in response["query"]["pages"]: - title = pageentry["title"] - if title in normalized: - title = normalized[title] - if "templates" not in pageentry: - continue - for templateentry in pageentry["templates"]: - if "-stub" in templateentry["title"].lower(): - articledict[title]["IsStub"] = 1 + titlechunks = chunk(titles, 15) + pool = Pool(processes=10) + stub_checks = pool.map(self.check_stub, titlechunks) + + stub_checks = dict(list(set(flat_list(stub_checks)))) + + print("All processed, writing dict.") + + for article in stub_checks: + articledict[article]["IsStub"] = stub_checks[article] return articledict + def check_stub(self, titlechunk): + print("Checking %s" % titlechunk) + titlesstring = '|'.join(titlechunk) + # print(titlesstring) + params = { + "titles": titlesstring, + "prop": "templates", + "tllimit": 500, + "tlnamespace": 10 + } + response = wiki_request(params) + if response is None: + print("No response") + return [] + if "continue" in response: + print("continue...") + normalized = {} + if "normalized" in response["query"]: + normalized = {normentry["to"]: normentry["from"] for normentry in 
response["query"]["normalized"]} + + result = [] + for pageentry in response["query"]["pages"]: + title = pageentry["title"] + if title in normalized: + title = normalized[title] + if "templates" not in pageentry: + continue + for templateentry in pageentry["templates"]: + if "-stub" in templateentry["title"].lower(): + result.append((title, 1)) + else: + result.append((title, 0)) + + print("...done.") + return list(set(result)) + + if __name__ == '__main__': - IdentifyStubs().solo() + IdentifyStubsWikiApi().solo() diff --git a/src/classify/decision_tree.py b/src/classify/decision_tree.py index 42fd4b0..84c371c 100644 --- a/src/classify/decision_tree.py +++ b/src/classify/decision_tree.py @@ -7,6 +7,7 @@ from sklearn.tree import DecisionTreeClassifier, export_graphviz from sklearn.metrics import balanced_accuracy_score, f1_score, recall_score, precision_score from sklearn.feature_selection import SelectKBest, SelectFpr, chi2, mutual_info_classif, f_classif +from sklearn.model_selection import train_test_split from classify.dottransformer import transform from imblearn.under_sampling import RepeatedEditedNearestNeighbours from imblearn.over_sampling import SMOTE @@ -17,6 +18,7 @@ import matplotlib.pyplot as plt from json import dump from collections import Counter +from random import sample F_SETNAMES = ["DbpediaInfoboxTemplate", "URL_Braces_Words", "COPHypernym", "Lemmas", "Wikipedia_Lists"] # Lemmas, @@ -52,6 +54,9 @@ def build_dok_matrix(id_to_a, f_to_id, ad, F_Names): flen = len(f_to_id) matrix = dok_matrix((len(id_to_a), flen), dtype=numpy.int8) for aid, a in id_to_a.items(): + if a not in ad: + print("%s not in articledict" % a) + continue for F_Name in F_Names: if F_Name in ad[a]: for f in ad[a][F_Name]: @@ -66,14 +71,25 @@ def build_train_data(ad, f_to_id, splitnr=3000): A_random, y_random = get_random_data() A_train, y_train = A_seed + A_random[:splitnr], y_seed + y_random[:splitnr] id_to_a_train = build_id_to_a(A_train) + print(id_to_a_train) X_train = build_dok_matrix(id_to_a_train, f_to_id, ad, F_SETNAMES) return X_train, y_train, id_to_a_train def build_validation_data(ad, f_to_id, splitnr=3000, splitsize=1000): - splitnr2 = splitnr + splitsize + # splitnr2 = splitnr + splitsize A_random, y_random = get_random_data() - A_validate, y_validate = A_random[splitnr:splitnr2], y_random[splitnr:splitnr2] + + validation_data = [] + while len([d for d in validation_data if d[1] == "1"]) < 100: + print("Sampling validation data...") + validation_data = sample(list(zip(A_random, y_random)), 1000) + + print("Sampled validation data with ") + print(Counter([d[1] for d in validation_data])) + A_validate = [d[0] for d in validation_data] + y_validate = [d[1] for d in validation_data] + id_to_a_validate = build_id_to_a(A_validate) X_validate = build_dok_matrix(id_to_a_validate, f_to_id, ad, F_SETNAMES) return X_validate, y_validate, id_to_a_validate @@ -92,6 +108,13 @@ def build_eval_data_double(ad, f_to_id): X_eval = build_dok_matrix(id_to_a_eval, f_to_id, ad, F_SETNAMES) return X_eval, y_eval, id_to_a_eval +def build_eval_data_random(ad, f_to_id): + A_eval, y_eval = get_random_data("random_eval.csv") + id_to_a_eval = build_id_to_a(A_eval) + X_eval = build_dok_matrix(id_to_a_eval, f_to_id, ad, F_SETNAMES) + return X_eval, y_eval, id_to_a_eval + + def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False): print("Training with configuration " + configurationname) @@ -125,9 +148,10 @@ def train_decisiontree_FPR(configurationname, 
train_data, score_function, unders print("Train Classifier") dtc = dtc.fit(X_train, y_train, check_input=True) - if export: - export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True) - transform(fitted_ids) + # if export: + print("Exporting decision tree image...") + export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True) + transform(fitted_ids) print("Self Accuracy: " + str(dtc.score(X_train, y_train))) @@ -139,11 +163,12 @@ def train_decisiontree_with(configurationname, train_data, k, score_function, un assert k > 0 print("Training with configuration " + configurationname) X_train, y_train, id_to_a_train = train_data - dtc = DecisionTreeClassifier(random_state=0) + dtc = DecisionTreeClassifier(criterion="entropy", random_state=0) print("Feature Selection") # selector = SelectFpr(score_function) selector = SelectKBest(score_function, k=k) + selector = SelectKBest(score_function, k=k) result = selector.fit(X_train, y_train) X_train = selector.transform(X_train) @@ -169,6 +194,7 @@ def train_decisiontree_with(configurationname, train_data, k, score_function, un dtc = dtc.fit(X_train, y_train, check_input=True) if export: + print("Exporting tree to graph...") export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True) transform(fitted_ids, configurationname) @@ -216,9 +242,9 @@ def train_decisiontree_exploration(ad, trainsize): X_train, y_train, id_to_a_train = build_train_data(ad, f_to_id, trainsize) train_data = X_train, y_train, id_to_a_train # validation - # X_validate, y_validate, id_to_validate = build_validation_data(ad, f_to_id, 3000, 1000) + # X_eval2, y_eval2, id_to_a_eval2 = build_validation_data(ad, f_to_id, 3000, 1000) # test - X_eval2, y_eval2, id_to_a_eval2 = build_eval_data_double(ad, f_to_id) + X_eval2, y_eval2, id_to_a_eval2 = build_eval_data_random(ad, f_to_id) id_to_a_all = build_id_to_a([a for a in ad]) X_all0 = build_dok_matrix(id_to_a_all, f_to_id, ad, F_SETNAMES) @@ -301,19 +327,25 @@ def final_classification(ad, test=True): X_train, y_train, id_to_a_train = build_train_data(ad, f_to_id, 4000) train_data = X_train, y_train, id_to_a_train # validation - # X_validate, y_validate, id_to_validate = build_validation_data(ad, f_to_id, 3000, 1000) + # X_eval2, y_eval2, id_to_a_eval2 = build_validation_data(ad, f_to_id, 3000, 1000) # test - X_eval2, y_eval2, id_to_a_eval2 = build_eval_data_double(ad, f_to_id) + X_eval2, y_eval2, id_to_a_eval2 = build_eval_data_random(ad, f_to_id) id_to_a_all = build_id_to_a([a for a in ad]) X_all0 = build_dok_matrix(id_to_a_all, f_to_id, ad, F_SETNAMES) - k = 22 - configurationname = "KBest_" + str(k) + "_mutual_info_classif_Oversampling" - selector, classifier = train_decisiontree_with(configurationname, train_data, k, mutual_info_classif, - oversam=True, export=True) - X_allk = selector.transform(X_all0) - y_all = classifier.predict(X_allk) + evals = [] + + for k in range(100,2000,50): + configurationname = "KBest_" + str(k) + "_chi2_Oversampling" + selector, classifier = train_decisiontree_with(configurationname, train_data, k, chi2, oversam=True, export=True) + X_allk = selector.transform(X_all0) + y_all = classifier.predict(X_allk) + eval_dict = classifier_score(id_to_a_eval2, classifier, selector, X_eval2, y_eval2) + eval_dict["Positive"] = len([y for y in y_all if y == '1']) + eval_dict["Negative"] = len([y for y in y_all if y == '0']) + eval_dict["Name"] = configurationname + "_eval2" + 
evals.append(eval_dict) if not test: for x in range(len(y_all)): @@ -321,32 +353,27 @@ def final_classification(ad, test=True): ad[title]["Class"] = y_all[x] save_articledict(ad) - eval_dict = classifier_score(id_to_a_eval2, classifier, selector, X_eval2, y_eval2) - eval_dict["Positive"] = len([y for y in y_all if y == '1']) - eval_dict["Negative"] = len([y for y in y_all if y == '0']) - eval_dict["Name"] = configurationname + "_eval2" - - return pd.DataFrame([eval_dict]) + return pd.DataFrame(evals) if __name__ == '__main__': ad = load_articledict() - df = train_decisiontree_exploration(ad, 2000) - df = df.set_index("Name") - df.to_csv(DATAP + "/exploration_2000.csv") - df = train_decisiontree_exploration(ad, 3000) - df = df.set_index("Name") - df.to_csv(DATAP + "/exploration_3000.csv") - df = train_decisiontree_exploration(ad, 4000) - df = df.set_index("Name") - df.to_csv(DATAP + "/exploration_4000.csv") + # df = train_decisiontree_exploration(ad, 1000) + # df = df.set_index("Name") + # df.to_csv(DATAP + "/exploration_2000.csv") + # df = train_decisiontree_exploration(ad, 3000) + # df = df.set_index("Name") + # df.to_csv(DATAP + "/exploration_3000.csv") + df = final_classification(ad) + # df = df.set_index("Name") + # df.to_csv(DATAP + "/exploration_4000.csv") # ax = df.plot.scatter(x="FPR", y="TPR", style="x", c="blue") # df.plot(x="k", y=["Recall", "Negative-Recall", "Precision", "Balanced_Accuracy", "F_Measure"], title="KBest") # df = train_decisiontree_exploration(ad, splitnr=2000) # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="x", c="orange") # df = train_decisiontree_exploration(ad, splitnr=3000) # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="x", c="red") - # df = train_decisiontree_exploration(ad, train_data, undersam=True) + # df = train_decisiontree_explorati on(ad, train_data, undersam=True) # df.to_csv(DATAP + "/dct_kbest_undersam.csv") # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="o", c="purple") # df.plot(x="k", y=["TPR", "FPR", "Balanced_Accuracy", "F_Measure", "Self_Accuracy"], title="KBest Undersampling") @@ -359,3 +386,6 @@ def final_classification(ad, test=True): # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="o", c="red") # df.plot(x="k", y=["TPR", "FPR", "Balanced_Accuracy", "F_Measure", "Self_Accuracy"], title="KBest Combined Resampling") # plt.show() + # + # df = final_classification(ad,test=False) + # df.to_csv(DATAP + "/final_configuration.csv") diff --git a/src/cluster/hierarchical.py b/src/cluster/hierarchical.py new file mode 100644 index 0000000..5edd450 --- /dev/null +++ b/src/cluster/hierarchical.py @@ -0,0 +1,109 @@ +from docutils.nodes import label, General + +from data import DATAP, load_articledict, save_articledict,start_time, stop_time +from data.explore.feature_freq import analyze_feature_frequency +from pandas import DataFrame, Series +from sklearn.decomposition import PCA, FastICA +from sklearn.cluster import AgglomerativeClustering, SpectralClustering +from sklearn.mixture import GaussianMixture +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +import matplotlib.pyplot as plt +import seaborn as sns + +# %matplotlib inline + +FEATURE_NAMES = ["DbpediaInfoboxTemplate", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"] # Lemmas, +N_CLUSTERS = 2 + +def build_feature_vector(articles): + features_freqs = analyze_feature_frequency(articles, FEATURE_NAMES) + # print(features_freqs) + return features_freqs.keys() + + +def build_feature_matrix(features, articles): + feature_matrix = {f: [] for f in features} + 
for a in articles.items(): + # print(a) + feature_freq = analyze_feature_frequency({a[0]: a[1]}, FEATURE_NAMES) + for key in feature_matrix: + feature_matrix[key].append(1 if key in feature_freq else 0) + return feature_matrix + + +def create_dataframe(articles): + print("Creating data frame...") + features = build_feature_vector(articles) + feature_matrix = build_feature_matrix(features, articles) + feature_matrix["Name"] = [a for a in articles] + + df = DataFrame(feature_matrix, columns=["Name"] + list(features)) + df = df.set_index("Name") + return df + + +def plot_reduced(df, n_clusters, decompositer = PCA): + reduced = df + print("Plotting two-dimensional dataframe reduction...") + # reduced = DataFrame(decompositer(n_components=2).fit_transform(df.iloc[:, 0:-1])) + # reduced = reduced.assign(Class=Series(df.loc[:, "Class"].values, index=reduced.index)) + # print(reduced) + for n in range(n_clusters): + plt.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], cmap=plt.get_cmap("Spectral"), label= "Class %s" % n) + plt.legend() + plt.show() + + +def analyze_clusters(ad, df, n_clusters): + print("Analyzing clusters...") + for n in range(n_clusters): + analyze_cluster(ad, df[df["Class"] == n]) + + +def analyze_cluster(ad, cluster): + article_names = cluster.index.values + + print("Relevant features: %s" % str(cluster.columns.values)) + + articles = dict([a for a in ad.items() if a[0] in article_names]) + print("Feature Frequency for cluster %s" % cluster.loc[:, "Class"][0]) + print(dict(sorted(analyze_feature_frequency(articles, F_SetNames=FEATURE_NAMES).items(), key=lambda v: v[1], reverse=True))) + + +if __name__ == "__main__": + ad = load_articledict() + seed = dict([a for a in ad.items() if a[1]["Seed"] == 1 and a[1]["IsStub"] == 0]) + print("Got %s samples without stubs in seed." 
% str(len(seed))) + # seed = dict(list(seed.items())[0:2]) + # print(seed) + df = create_dataframe(seed) + + corr = df.corr() + + f, ax = plt.subplots(figsize=(10, 6)) + hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="coolwarm", fmt='.2f', + linewidths=.05) + f.subplots_adjust(top=0.93) + t = f.suptitle('Correlation Heatmap', fontsize=14) + + plt.show() + + # print("Selecting features...") + # k_select = SelectKBest(score_func=chi2, k=100) + # df = k_select.fit(df) + # print("Applying FastICA to seed...") + # pca = PCA(n_components=50) + # df = DataFrame(pca.fit_transform(df), index=df.index) + # + # print("Creating cluster...") + # ac = GaussianMixture(n_components=N_CLUSTERS) + # # ac.fit(df) + # # y = ac.labels_ + # + # df = df.assign(Class=Series(ac.fit_predict(df), index=df.index)) + # # print(y) + # plot_reduced(df, N_CLUSTERS) + # + # analyze_clusters(ad, df, N_CLUSTERS) + + diff --git a/src/data/__init__.py b/src/data/__init__.py index 02b471a..17f5b1f 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -4,7 +4,7 @@ import time # UTIL -DATAP = "S:/Data/Wikipedia" +DATAP = "/Users/dnikonov/Uni/wikionto/data/mined" AP = DATAP + "/articledict.json" @@ -23,7 +23,7 @@ def save_articledict(ad): def backup_articledict(ad): - with open(DATAP+"/temp/articledict_backup.json", "w", encoding="utf-8") as f: + with open(DATAP+"/articledict_backup.json", "w", encoding="utf-8") as f: dump(ad, f) @@ -47,9 +47,13 @@ def stop_time(start_time): print(hms_string(time.time() - start_time)) +def flat_list(l): + return [item for sublist in l for item in sublist] + + # SCOPING DEPTH = 8 -ROOTS = ["Category:Formal_languages", "Category:Computer_file_formats"] +ROOTS = ["Category:Computing_platforms", "Category:Software"] FEATURE_SETNAMES = ["DbpediaInfoboxTemplate", "URL_Braces_Words", "COPHypernym", "Lemmas", "Wikipedia_Lists"] # Lemmas, INDICATORS = ["PositiveInfobox", "URLBracesPattern", "In_Wikipedia_List", "PlainTextKeyword", "POS", "COP", @@ -57,7 +61,7 @@ def stop_time(start_time): # CONFIG FOR INDICATORS # - for POS and URLBracesPattern -KEYWORDS = ['language', 'format', 'notation'] +KEYWORDS = ['software', 'system', 'application', 'framework', 'api', 'programming'] # - stretched keywords resulting which maybe hint at languages. Here, maybe means that we subjectively know that such # software has its own language, but! we cannot objectively present proof in the summary. @@ -65,7 +69,7 @@ def stop_time(start_time): ['template', 'system'], ['theorem', 'prover'], ['parser', 'generator'], ['typesetting', 'system']] # - infobox indication -POSITIVETEMPLATES = ["infobox_programming_language", "infobox_file_format"] +POSITIVETEMPLATES = ["infobox_software", "infobox_os"] # - Wikipedia Lists # LIST_ARTICLES = retrievelists() @@ -76,9 +80,7 @@ def stop_time(start_time): 'Category:Propositional_attitudes‎', 'Category:Theorems'] # - Note that all other infoboxes are negative. There are about 600 different templates used in the scope. 
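+# - Illustrative sketch only (an assumption, not code used elsewhere in this
+#   module): the PositiveInfobox indicator can be thought of as a membership
+#   test against POSITIVETEMPLATES, while NEUTRALTEMPLATES merely keeps common
+#   but uninformative infoboxes from counting as negative evidence:
+#
+#       def positive_infobox(article_templates):
+#           names = [n.lower() for n in article_templates]
+#           return 1 if any(n in POSITIVETEMPLATES for n in names) else 0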
-NEUTRALTEMPLATES = ["infobox_software", "infobox_technology_standard", "infobox_software_license"] + \ - ["infobox", "infobox_unit", "infobox_data_structure", "infobox_writing_system", - "infobox_quality_tool", "infobox_identifier"] +NEUTRALTEMPLATES = ["infobox", "infobox_windows_component", "infobox_software_license", "infobox_programming_language", "infobox_file_format", "infobox_os_component", "infobox_web_browser", "infobox_website", "infobox_file_system", "infobox_filesystem"] # - Negative URL keywords EX_URL_KEYWORD = ["List_of", "comparison", "Comparison"] diff --git a/src/data/eval/random_sampling.py b/src/data/eval/random_sampling.py index 6f8133d..18470e3 100644 --- a/src/data/eval/random_sampling.py +++ b/src/data/eval/random_sampling.py @@ -4,6 +4,7 @@ import pandas as pd import csv import webbrowser +from selenium import webdriver def perform_eval(): @@ -12,7 +13,7 @@ def perform_eval(): S = set(a for a in ad if ad[a]["Seed"]) articles_visited = set() - with open(DATAP + '/eval/random.csv', 'r', encoding="UTF8") as f: + with open(DATAP + '/eval/random.csv', 'r+', encoding="UTF8") as f: olddata = "" count = 0 for line in f: @@ -24,11 +25,12 @@ def perform_eval(): olddata += "\n" print("old size: " + str(len(articles_visited))) - with open(DATAP + '/eval/random.csv', 'w', encoding="UTF8") as f: + with open(DATAP + '/eval/random.csv', 'w+', encoding="UTF8") as f: f.write(olddata) articles = list(ad.keys()) articles.sort() x = count + driver = webdriver.Chrome() while x < 4000: index = random.randint(0, len(ad)) article = articles[index] @@ -37,13 +39,13 @@ def perform_eval(): else: articles_visited.add(article) print("https://en.wikipedia.org/wiki/" + article) - webbrowser.open("https://en.wikipedia.org/wiki/" + article, new=2) + driver.get("https://en.wikipedia.org/wiki/" + article) agreement = "" - while agreement not in ["yes", "no"]: - agreement = input(str(x) + " Enter 'yes' or 'no'!") - if agreement == "yes": + while agreement not in ["1", "2"]: + agreement = input(str(x) + " Enter '1 (yes)' or '2 (no)'!") + if agreement == "1": agreement_int = "1" - if agreement == "no": + if agreement == "2": agreement_int = "0" f.write("|" + article + "|,|" + agreement_int + "|\n") f.flush() @@ -56,13 +58,14 @@ def get_classification(title, langdict): return resultdict -def get_random_data(): - with open(DATAP + "/eval/random.csv", "r", encoding="utf-8") as f: +def get_random_data(filename="random_seed.csv"): + with open(DATAP + "/eval/" + filename, "r+", encoding="utf-8") as f: reader = csv.reader(f, quotechar='|', quoting=csv.QUOTE_MINIMAL) A_random = [] y = [] for row in reader: + print(row) A_random.append(row[0]) y.append(row[1]) diff --git a/src/data/explore/explore_seed_infoboxes.py b/src/data/explore/explore_seed_infoboxes.py index a47d7ac..86368dd 100644 --- a/src/data/explore/explore_seed_infoboxes.py +++ b/src/data/explore/explore_seed_infoboxes.py @@ -8,14 +8,14 @@ def explore(): f.close() freq = dict() for cl in langdict: - if langdict[cl]["Seed"] == 0 or "DbpediaInfoboxTemplate" not in langdict[cl]: + if langdict[cl]["Seed"] == 1 or "DbpediaInfoboxTemplate" not in langdict[cl]: continue for i in set(langdict[cl]["DbpediaInfoboxTemplate"]): if i in freq: freq[i] += 1 else: freq[i] = 1 - f = open(DATAP + '/explore_seed_infoboxes.json', 'w', encoding="UTF8") + f = open(DATAP + '/explore_seed_infoboxes_non_seed.json', 'w', encoding="UTF8") dump(freq,f,indent=2) f.flush() f.close() @@ -33,4 +33,4 @@ def find_software_pl(): if __name__ == '__main__': - find_software_pl() + 
explore() diff --git a/src/data/explore/explore_seed_lists.py b/src/data/explore/explore_seed_lists.py new file mode 100644 index 0000000..aa62f47 --- /dev/null +++ b/src/data/explore/explore_seed_lists.py @@ -0,0 +1,8 @@ +from data import DATAP, KEYWORDS +from json import load + +f = open(DATAP + '/articledict.json', 'r', encoding="UTF8") +articledict = load(f) +for title in [a[0] for a in articledict if a[1]["Seed"] == 1]: + if ("list" in title.lower() or "comparison" in title.lower()) and any(k in title for k in KEYWORDS): + print(title) diff --git a/src/data/explore/explore_seed_nouns.py b/src/data/explore/explore_seed_nouns.py index 4df9242..52bb827 100644 --- a/src/data/explore/explore_seed_nouns.py +++ b/src/data/explore/explore_seed_nouns.py @@ -1,15 +1,16 @@ -from multiprocessing import Pool +from multiprocessing.pool import ThreadPool from nltk.parse.corenlp import CoreNLPDependencyParser from requests.exceptions import HTTPError from json.decoder import JSONDecodeError from json import load, dump from data import DATAP - +from collections import OrderedDict def get_single(summary): if summary.startswith('.'): summary = summary[1:] dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') + print("Finished %s" % summary) try: parse, = dep_parser.raw_parse(summary) nouns = set() @@ -34,7 +35,7 @@ def explore(): d = load(f) f.close() cl_sums = list(d[cl]["Summary"] for cl in d if ("Summary" in d[cl]) and (d[cl]["Seed"] == 1)) - pool = Pool(processes=10) + pool = ThreadPool(processes=10) nnlists = pool.map(get_single, cl_sums) nouns_f = dict() for nnlist in nnlists: @@ -43,6 +44,8 @@ def explore(): nouns_f[nn] += 1 else: nouns_f[nn] = 1 + + nouns_f = OrderedDict(sorted(nouns_f.items(), key=lambda item: item[1])) f = open(DATAP + '/explore_seed_nouns.json', 'w', encoding='utf8') dump(nouns_f, f, indent=2) f.flush() @@ -61,4 +64,4 @@ def get_top10(): if __name__ == "__main__": - get_top10() + explore() diff --git a/src/features/cop_firstsentence.py b/src/features/cop_firstsentence.py index 6759046..d8c9531 100644 --- a/src/features/cop_firstsentence.py +++ b/src/features/cop_firstsentence.py @@ -27,6 +27,7 @@ def check_single(self, triple): try: cop = COPSemgrex(first_sentence).run() del cop['Aisa'] + print("Finished %s" % title) return title, cop except JSONDecodeError: print("Decode Error at :" + title) diff --git a/src/features/cop_semgrex.py b/src/features/cop_semgrex.py index 05d0af0..25365e0 100644 --- a/src/features/cop_semgrex.py +++ b/src/features/cop_semgrex.py @@ -1,3 +1,4 @@ +from requests.sessions import Session import requests import time from stanford.custom_stanford_api import StanfordCoreNLP, Exception500 @@ -17,9 +18,11 @@ def __init__(self, text): 'isfamilyof': '{pos:NN;word:family} >cop {pos:/VB.*/} >/nmod:of/ {pos:NNS}=hypernym' } self.text = text + self.session = Session() + self.session.trust_env = False def run(self): - while not is_alive(): + while not self.is_alive(): print("Not alive") time.sleep(5) cops = dict() @@ -32,30 +35,28 @@ def run(self): if not responsedict: cops[variant] = [] else: - cops[variant] = get_noun(responsedict) + cops[variant] = self.get_noun(responsedict) for hypernym in cops['isa']: if hypernym in cops['Aisa']: cops['isa'].remove(hypernym) return cops + def get_noun(self, responsedict): + if not responsedict: + return [] + else: + cops = [] + for sentence_result in responsedict['sentences']: + for x in range(sentence_result['length']): + cops.append(sentence_result[str(x)]['$hypernym']['text']) + return cops -def 
get_noun(responsedict): - if not responsedict: - return [] - else: - cops = [] - for sentence_result in responsedict['sentences']: - for x in range(sentence_result['length']): - cops.append(sentence_result[str(x)]['$hypernym']['text']) - return cops - - -def is_alive(): - try: - return requests.get("http://localhost:9000/ping").ok - except requests.exceptions.ConnectionError: - return False + def is_alive(self): + try: + return self.session.get("http://localhost:9000/ping").ok + except requests.exceptions.ConnectionError: + return False if __name__ == '__main__': diff --git a/src/features/lists_of.py b/src/features/lists_of.py index a6bb1d8..007dff8 100644 --- a/src/features/lists_of.py +++ b/src/features/lists_of.py @@ -8,7 +8,7 @@ def check(self, articledict): print("Checking Wikipedia's lists") with open(DATAP + "/listlinks.json", "r") as f: - lld = load(f) + lld = dict(load(f)) for article in articledict: articledict[article]['Wikipedia_Lists'] = [] @@ -18,7 +18,7 @@ def check(self, articledict): if listtitle in articledict: articledict[listtitle]["IsList"] = 1 for article in articles: - if article in articledict: + if article.replace("_", " ") in articledict: articledict[article]["Wikipedia_Lists"].append(listtitle) return articledict diff --git a/src/features/summary_lemma.py b/src/features/summary_lemma.py index 3e2fc61..e82b533 100644 --- a/src/features/summary_lemma.py +++ b/src/features/summary_lemma.py @@ -33,7 +33,7 @@ def get_lemmas(p): try: lemmas = [] parsedict = parser.annotate(text, annotators="tokenize,ssplit,pos,lemma") - if not parsedict['sentences']: + if "sentences" not in parsedict or not parsedict['sentences']: return title, [] for sentence in parsedict['sentences']: parsedict = sentence["tokens"] diff --git a/src/features/summary_words.py b/src/features/summary_words.py index 5db7343..edb0a3e 100644 --- a/src/features/summary_words.py +++ b/src/features/summary_words.py @@ -29,7 +29,7 @@ def get_words(pair): try: nouns = [] parsedict = parser.annotate(text) - if not parsedict['sentences']: + if "sentences" not in parsedict or not parsedict['sentences']: return title, [] for sentence in parsedict['sentences']: parsedict = sentence["tokens"] diff --git a/src/mine/dbpedia.py b/src/mine/dbpedia.py index 8c02343..c6bba57 100644 --- a/src/mine/dbpedia.py +++ b/src/mine/dbpedia.py @@ -223,6 +223,7 @@ def get_infobox_templates(root, mindepth, maxdepth): td[cl].append(t) return td + def get_all_templates(root, mindepth=0, maxdepth=8): querytext = """ PREFIX dbo: diff --git a/src/mine/mine_listlinks.py b/src/mine/mine_listlinks.py new file mode 100644 index 0000000..854d9f7 --- /dev/null +++ b/src/mine/mine_listlinks.py @@ -0,0 +1,27 @@ +import json +from mine.wiki import getlinks +from data import DATAP +from multiprocessing import Pool + + +def load_lists(): + with open(DATAP + "/lists.json", "r") as f: + return json.loads(f.read()) + +def mine_list_links(): + lists = load_lists() + pool = Pool(processes=10) + list_links = pool.map(get_list_links, lists) + + with open(DATAP + "/listlinks.json", "w+") as out: + json.dump(list_links, out, indent=4) + + +def get_list_links(list): + print("Retrieving for %s"%list) + links = getlinks(list) + return list, links + + +if __name__ == "__main__": + mine_list_links() \ No newline at end of file diff --git a/src/mine/pipeline.py b/src/mine/pipeline.py index 6a65230..a5c22d0 100644 --- a/src/mine/pipeline.py +++ b/src/mine/pipeline.py @@ -11,12 +11,21 @@ def article_indicators(): - extractors = [Seed, IdentifyDeletedFromWikipedia, 
IdentifyStubs, InfoboxDbEx, ExtractURLWords, WikiList, SumNouns, SumLemmas, - COPFirstSentence] + extractors = [ + # Seed, + # IdentifyDeletedFromWikipedia, + # IdentifyStubs, + # InfoboxDbEx, + # ExtractURLWords, + # WikiList, + # SumNouns, + # SumLemmas, + COPFirstSentence + ] for e in extractors: e().solo(backup=False) if __name__ == '__main__': - mine() + # mine() article_indicators() diff --git a/src/mine/wiki.py b/src/mine/wiki.py index 501eabb..f032dfe 100644 --- a/src/mine/wiki.py +++ b/src/mine/wiki.py @@ -1,5 +1,6 @@ import requests from json.decoder import JSONDecodeError +from requests.sessions import Session from json import dumps URL = "http://en.wikipedia.org/w/api.php" @@ -12,7 +13,9 @@ def wiki_request(params): params['action'] = 'query' params['utf8'] = '' try: - r = requests.get(URL, params=params, headers=HEADER).json() + s = Session() + s.trust_env = False + r = s.get(URL, params=params, headers=HEADER).json() except requests.ConnectionError as cer: print("Connection Error") print(cer) @@ -85,10 +88,11 @@ def getlinks(title): params = {'prop': 'links' , 'titles': title} wikijson = wiki_request(params) + # print(wikijson) links = [] try: - if 'missing' in next(iter(wikijson["query"]["pages"].values())): - return links + # if 'missing' in next(iter(wikijson["query"]["pages"])): + # return links while True: if "query" not in wikijson: print("None at query " + title) @@ -96,10 +100,10 @@ def getlinks(title): if "pages" not in wikijson["query"]: print("None at pages " + title) return links - if wikijson["query"]["pages"].values() is None: + if wikijson["query"]["pages"] is None: print("None at values " + title) return links - nextpages = next(iter(wikijson["query"]["pages"].values())) + nextpages = next(iter(wikijson["query"]["pages"])) if 'links' not in nextpages: print("None at links " + title) return links diff --git a/src/stanford/custom_stanford_api.py b/src/stanford/custom_stanford_api.py index 8aae04c..c6786f3 100644 --- a/src/stanford/custom_stanford_api.py +++ b/src/stanford/custom_stanford_api.py @@ -1,6 +1,8 @@ import requests import json from requests.sessions import Session +from http.client import HTTPConnection +import logging class StanfordCoreNLP: @@ -11,6 +13,7 @@ class StanfordCoreNLP: def __init__(self, url, session=Session()): self.server_url = url self.session = session + self.session.trust_env = False def annotate(self, text, annotators="tokenize,ssplit,pos", pattern=None, runnr=1): assert isinstance(text, str) @@ -31,8 +34,11 @@ def annotate(self, text, annotators="tokenize,ssplit,pos", pattern=None, runnr=1 try: with self.session.get(self.server_url) as req: data = text.encode('utf8') - r = requests.post( + rs = Session() + rs.trust_env = False + r = rs.post( self.server_url, params=params, data=data, headers={'Connection': 'close'}) + if r.status_code == 500: print(r.content) raise Exception500 From 8cc0c56a3ff8b7e945c8d42d5fb3a93fc3a3323c Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Fri, 26 Apr 2019 13:27:24 +0200 Subject: [PATCH 2/8] rename --- src/cluster/{hierarchical.py => subclasses.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/cluster/{hierarchical.py => subclasses.py} (100%) diff --git a/src/cluster/hierarchical.py b/src/cluster/subclasses.py similarity index 100% rename from src/cluster/hierarchical.py rename to src/cluster/subclasses.py From 4ebb730d1f8efc9a68e57429b38d72a9342f4356 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Thu, 2 May 2019 14:30:09 +0200 Subject: [PATCH 3/8] Further work --- 
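Note on the clustering exploration in this patch: explore_silhouette() sweeps
the number of clusters and scores each partition with the mean silhouette
coefficient. A minimal sketch of the same idea, assuming the binary
article-by-feature DataFrame produced by create_dataframe():

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import silhouette_score

    def silhouette_sweep(df, max_n=30):
        # df: one row per article, one 0/1 column per feature
        scores = []
        for n in range(2, max_n):
            labels = AgglomerativeClustering(n_clusters=n).fit_predict(df)
            scores.append((n, silhouette_score(df, labels)))
        return scores

As added here, explore_silhouette() clusters the module-level reduced_df rather
than its df argument, so it only works when invoked from the __main__ block.
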
src/cluster/subclasses.py | 147 ++++++++++++++++++++++++------- src/data/explore/feature_freq.py | 20 ++++- 2 files changed, 133 insertions(+), 34 deletions(-) diff --git a/src/cluster/subclasses.py b/src/cluster/subclasses.py index 5edd450..20808ae 100644 --- a/src/cluster/subclasses.py +++ b/src/cluster/subclasses.py @@ -2,21 +2,31 @@ from data import DATAP, load_articledict, save_articledict,start_time, stop_time from data.explore.feature_freq import analyze_feature_frequency -from pandas import DataFrame, Series +import pandas as pd +from sklearn.metrics import silhouette_score from sklearn.decomposition import PCA, FastICA -from sklearn.cluster import AgglomerativeClustering, SpectralClustering +from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans, DBSCAN from sklearn.mixture import GaussianMixture from sklearn.discriminant_analysis import LinearDiscriminantAnalysis import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D import seaborn as sns +from skfeature.function.similarity_based import SPEC, lap_score # %matplotlib inline -FEATURE_NAMES = ["DbpediaInfoboxTemplate", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"] # Lemmas, -N_CLUSTERS = 2 +FEATURE_NAMES = [ + "DbpediaInfoboxTemplate", + "Lemmas", + "URL_Braces_Words", + "COPHypernym", + "Wikipedia_Lists" +] +N_CLUSTERS = 5 +N_MOST_IMPORTANT_FEATURES = 1000 def build_feature_vector(articles): - features_freqs = analyze_feature_frequency(articles, FEATURE_NAMES) + features_freqs = analyze_feature_frequency(articles, F_SetNames=FEATURE_NAMES) # print(features_freqs) return features_freqs.keys() @@ -25,7 +35,7 @@ def build_feature_matrix(features, articles): feature_matrix = {f: [] for f in features} for a in articles.items(): # print(a) - feature_freq = analyze_feature_frequency({a[0]: a[1]}, FEATURE_NAMES) + feature_freq = analyze_feature_frequency({a[0]: a[1]}, F_SetNames=FEATURE_NAMES) for key in feature_matrix: feature_matrix[key].append(1 if key in feature_freq else 0) return feature_matrix @@ -37,37 +47,105 @@ def create_dataframe(articles): feature_matrix = build_feature_matrix(features, articles) feature_matrix["Name"] = [a for a in articles] - df = DataFrame(feature_matrix, columns=["Name"] + list(features)) + df = pd.DataFrame(feature_matrix, columns=["Name"] + list(features)) df = df.set_index("Name") return df -def plot_reduced(df, n_clusters, decompositer = PCA): - reduced = df - print("Plotting two-dimensional dataframe reduction...") - # reduced = DataFrame(decompositer(n_components=2).fit_transform(df.iloc[:, 0:-1])) - # reduced = reduced.assign(Class=Series(df.loc[:, "Class"].values, index=reduced.index)) - # print(reduced) +def plot_2d(df, n_clusters): + # reduced = df + print("Plotting 2D dataframe reduction...") + reduced = pd.DataFrame(PCA(n_components=2).fit_transform(df.iloc[:, 0:-1])) + reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) + + for n in range(n_clusters): + plt.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) + plt.legend() + plt.show() + + +def plot_3d(df, n_clusters): + # reduced = df + print("Plotting 3D dataframe reduction...") + reduced = pd.DataFrame(PCA(n_components=3).fit_transform(df.iloc[:, 0:-1])) + reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) + + fig = plt.figure() + ax = Axes3D(fig) + for n in range(n_clusters): - 
plt.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], cmap=plt.get_cmap("Spectral"), label= "Class %s" % n) + ax.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], reduced[reduced["Class"] == n].iloc[:, 2], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) plt.legend() plt.show() +def explore_silhouette(df, n_clusters): + silhouettes = [] + for n in range(2, n_clusters, 1): + clustered = create_cluster(reduced_df, n) + silhouette = cluster_scores(clustered) + silhouettes.append((n, silhouette)) -def analyze_clusters(ad, df, n_clusters): + xs = [s[0] for s in silhouettes] + ys = [s[1] for s in silhouettes] + plt.plot(xs, ys, '-bX', markevery=True) + plt.show() + +def analyze_clusters(ad, df, n_clusters, relevant_features): print("Analyzing clusters...") + # print("Relevant features: %s" % str(relevant_features)) for n in range(n_clusters): - analyze_cluster(ad, df[df["Class"] == n]) + analyze_cluster(ad, df[df["Class"] == n], relevant_features) -def analyze_cluster(ad, cluster): +def analyze_cluster(ad, cluster, relevant_features=None): article_names = cluster.index.values - print("Relevant features: %s" % str(cluster.columns.values)) - articles = dict([a for a in ad.items() if a[0] in article_names]) + print("%s articles clustered in cluster %s" % (len(articles), cluster.loc[:, "Class"][0])) print("Feature Frequency for cluster %s" % cluster.loc[:, "Class"][0]) - print(dict(sorted(analyze_feature_frequency(articles, F_SetNames=FEATURE_NAMES).items(), key=lambda v: v[1], reverse=True))) + if relevant_features is not None: + print(dict(sorted(analyze_feature_frequency(articles, relevant_features=relevant_features).items(), key=lambda v: v[1], reverse=True))) + + +def calc_feature_importance(data): + print("Calculating feature importance matrix...") + X = SPEC.spec(data.values, style=0) + # X = SPEC.feature_ranking(X, style=0) + d_spec = pd.DataFrame([X], columns=data.columns.values, index=["SPEC"]) + d_spec = d_spec.sort_values("SPEC", axis=1, ascending=False) + d_spec.to_csv(DATAP + "/cluster/seed_SPEC.csv") + + return d_spec + + +def load_feature_importance_matrix(): + print("Loading feature importance matrix...") + matrix = pd.read_csv(DATAP + "/cluster/seed_SPEC.csv") + dropped_columns = [c for c in matrix.columns.values[1:] if c.split("::")[0] not in FEATURE_NAMES] + matrix = matrix.drop(labels=dropped_columns, axis=1) + + return matrix + + +def slice_n_relevant_features(data, feature_importance, n): + print("Selecting the %s most relevant features..." 
% str(n)) + print(data.shape) + print(feature_importance.shape) + relevant_columns = feature_importance.iloc[0, 0:n+1].index.values[1:] + return data.loc[:, relevant_columns] + + +def create_cluster(data, n): + X = data + y = pd.Series(AgglomerativeClustering(n_clusters=n).fit_predict(X), index=X.index) + + data = data.assign(Class=y, index=data.index) + return data + + +def cluster_scores(data): + silhouette = silhouette_score(data.iloc[:, 1:-1], labels=data["Class"].values) + return silhouette if __name__ == "__main__": @@ -78,22 +156,25 @@ def analyze_cluster(ad, cluster): # print(seed) df = create_dataframe(seed) - corr = df.corr() + # calc_feature_importance(df) + feature_importance = load_feature_importance_matrix() + # feature_importance = calc_feature_importance(df) + + reduced_df = slice_n_relevant_features(df, feature_importance, n=N_MOST_IMPORTANT_FEATURES) + + # print(reduced_df.shape) + + explore_silhouette(reduced_df, n_clusters=30) + + + + + # plot_2d(clustered, N_CLUSTERS) + # analyze_clusters(ad, clustered, N_CLUSTERS, relevant_features=[c for c in feature_importance.columns.values[1:] if "COPHypernym" in c]) - f, ax = plt.subplots(figsize=(10, 6)) - hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="coolwarm", fmt='.2f', - linewidths=.05) - f.subplots_adjust(top=0.93) - t = f.suptitle('Correlation Heatmap', fontsize=14) - plt.show() - # print("Selecting features...") - # k_select = SelectKBest(score_func=chi2, k=100) - # df = k_select.fit(df) - # print("Applying FastICA to seed...") - # pca = PCA(n_components=50) - # df = DataFrame(pca.fit_transform(df), index=df.index) + # clustered["Class"].to_csv(DATAP + "/cluster/clusters.csv") # # print("Creating cluster...") # ac = GaussianMixture(n_components=N_CLUSTERS) diff --git a/src/data/explore/feature_freq.py b/src/data/explore/feature_freq.py index fbdfe21..518383f 100644 --- a/src/data/explore/feature_freq.py +++ b/src/data/explore/feature_freq.py @@ -1,8 +1,17 @@ from data import load_articledict -def analyze_feature_frequency(ad, F_SetNames): +def analyze_feature_frequency(ad, **kwargs): freq = dict() + + if "F_SetNames" not in kwargs: + kwargs["F_SetNames"] = [] + F_SetNames = kwargs["F_SetNames"] + + if "relevant_features" not in kwargs: + kwargs["relevant_features"] = [] + relevant_features = kwargs["relevant_features"] + for a in ad: for F_Name in F_SetNames: if F_Name not in ad[a]: @@ -11,4 +20,13 @@ def analyze_feature_frequency(ad, F_SetNames): if F_Name + "::" + f not in freq: freq[F_Name + "::" + f] = 0 freq[F_Name + "::" + f] += 1 + + for feature in relevant_features: + F_SetName, F_Name = feature.split("::") + if F_SetName not in ad[a]: + continue + if F_Name in ad[a][F_SetName]: + if feature not in freq: + freq[feature] = 0 + freq[feature] += 1 return freq From b3d1b417a9ed26655f17a2b1cfed04a21a7c49f0 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Wed, 19 Jun 2019 11:21:38 +0200 Subject: [PATCH 4/8] Work --- src/classify/decision_tree.py | 55 ++++-- src/cluster/feature_importance.py | 37 ++++ src/cluster/subclasses.py | 260 +++++++++---------------- src/cluster/summarylinks_subclasses.py | 73 +++++++ src/cluster/util.py | 101 ++++++++++ src/data/__init__.py | 43 ++++ src/data/explore/explore_seed_nouns.py | 6 +- src/data/explore/feature_freq.py | 2 - src/mine/mine_summary_links.py | 75 +++++++ src/mine/util.py | 6 +- src/mine/wiki.py | 61 +++++- 11 files changed, 529 insertions(+), 190 deletions(-) create mode 100644 src/cluster/feature_importance.py create mode 100644 
src/cluster/summarylinks_subclasses.py create mode 100644 src/cluster/util.py create mode 100644 src/mine/mine_summary_links.py diff --git a/src/classify/decision_tree.py b/src/classify/decision_tree.py index 84c371c..f17e579 100644 --- a/src/classify/decision_tree.py +++ b/src/classify/decision_tree.py @@ -20,7 +20,13 @@ from collections import Counter from random import sample -F_SETNAMES = ["DbpediaInfoboxTemplate", "URL_Braces_Words", "COPHypernym", "Lemmas", "Wikipedia_Lists"] # Lemmas, +F_SETNAMES = [ + "DbpediaInfoboxTemplate", + "URL_Braces_Words", + "COPHypernym", + "Lemmas", + # "InternalWikiLinks", not yet derived for total set + "Wikipedia_Lists"] def get_seed(ad): @@ -30,7 +36,7 @@ def get_seed(ad): def build_f_to_id(FS, ad): - freq = analyze_feature_frequency(ad, FS) + freq = analyze_feature_frequency(ad, F_SetNames=FS) print("Total number of features:" + str(len(freq))) fs = [f for f, count in freq.items() if count > 10] @@ -159,20 +165,24 @@ def train_decisiontree_FPR(configurationname, train_data, score_function, unders def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False, - export=False): + export=False, **kwargs): assert k > 0 print("Training with configuration " + configurationname) X_train, y_train, id_to_a_train = train_data - dtc = DecisionTreeClassifier(criterion="entropy", random_state=0) + + max_depth = None if "max_depth" not in kwargs else kwargs["max_depth"] + + dtc = DecisionTreeClassifier(criterion="entropy", random_state=0, max_depth=max_depth) print("Feature Selection") # selector = SelectFpr(score_function) selector = SelectKBest(score_function, k=k) selector = SelectKBest(score_function, k=k) - result = selector.fit(X_train, y_train) + selector = selector.fit(X_train, y_train) + X_train = selector.transform(X_train) - fitted_ids = [i for i in result.get_support(indices=True)] + fitted_ids = [i for i in selector.get_support(indices=True)] print("Apply Resampling") print(Counter(y_train)) @@ -336,26 +346,35 @@ def final_classification(ad, test=True): evals = [] - for k in range(100,2000,50): - configurationname = "KBest_" + str(k) + "_chi2_Oversampling" - selector, classifier = train_decisiontree_with(configurationname, train_data, k, chi2, oversam=True, export=True) - X_allk = selector.transform(X_all0) - y_all = classifier.predict(X_allk) - eval_dict = classifier_score(id_to_a_eval2, classifier, selector, X_eval2, y_eval2) - eval_dict["Positive"] = len([y for y in y_all if y == '1']) - eval_dict["Negative"] = len([y for y in y_all if y == '0']) - eval_dict["Name"] = configurationname + "_eval2" - evals.append(eval_dict) + k = 23 + configurationname = "KBest_" + str(k) + "_f_classif_NoResampling" + # print(configurationname) + selector, classifier = train_decisiontree_with(configurationname, train_data, k, chi2, oversam=True, export=True) + + print(get_relevant_feature_names(selector, fs)) + + X_allk = selector.transform(X_all0) + y_all = classifier.predict(X_allk) + eval_dict = classifier_score(id_to_a_eval2, classifier, selector, X_eval2, y_eval2) + eval_dict["Positive"] = len([y for y in y_all if y == '1']) + eval_dict["Negative"] = len([y for y in y_all if y == '0']) + eval_dict["Name"] = configurationname + "_eval2" + evals.append(eval_dict) if not test: for x in range(len(y_all)): title = id_to_a_all[x] ad[title]["Class"] = y_all[x] - save_articledict(ad) + # save_articledict(ad) return pd.DataFrame(evals) +def get_relevant_feature_names(selector, columns): + targets = 
selector.get_support(indices=True) + return [columns[i] for i in targets] + + if __name__ == '__main__': ad = load_articledict() # df = train_decisiontree_exploration(ad, 1000) @@ -372,7 +391,7 @@ def final_classification(ad, test=True): # df = train_decisiontree_exploration(ad, splitnr=2000) # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="x", c="orange") # df = train_decisiontree_exploration(ad, splitnr=3000) - # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="x", c="red") + # df.plot.scatter(x="FPR", y="TPR", ax=a x, style="x", c="red") # df = train_decisiontree_explorati on(ad, train_data, undersam=True) # df.to_csv(DATAP + "/dct_kbest_undersam.csv") # df.plot.scatter(x="FPR", y="TPR", ax=ax, style="o", c="purple") diff --git a/src/cluster/feature_importance.py b/src/cluster/feature_importance.py new file mode 100644 index 0000000..9f89b37 --- /dev/null +++ b/src/cluster/feature_importance.py @@ -0,0 +1,37 @@ +import pandas as pd +from data import DATAP +from skfeature.function.similarity_based.lap_score import lap_score +from skfeature.utility.construct_W import construct_W + + +def calc_feature_importance(data): + print("Calculating feature importance matrix...") + X = laplacian_score(data.values) + # X = SPEC.feature_ranking(X, style=0) + d_spec = pd.DataFrame([X], columns=data.columns.values, index=["SPEC"]) + d_spec = d_spec.sort_values("SPEC", axis=1, ascending=True) + d_spec.to_csv(DATAP + "/cluster/seed_laplacian.csv") + + return d_spec + + +def load_feature_importance_matrix(): + print("Loading feature importance matrix...") + matrix = pd.read_csv(DATAP + "/cluster/seed_SPEC.csv") + # dropped_columns = [c for c in matrix.columns.values[1:] if c.split("::")[0] not in FEATURE_NAMES] + # matrix = matrix.drop(labels=dropped_columns, axis=1) + + return matrix + + +def slice_n_relevant_features(data, feature_importance, n): + print("Selecting the %s most relevant features..." 
% str(n)) + print(data.shape) + print(feature_importance.shape) + relevant_columns = feature_importance.iloc[0, 0:n+1].index.values[1:] + return data.loc[:, relevant_columns] + + +def laplacian_score(data): + W = construct_W(data) + return lap_score(data, W=W) \ No newline at end of file diff --git a/src/cluster/subclasses.py b/src/cluster/subclasses.py index 20808ae..b3d4f13 100644 --- a/src/cluster/subclasses.py +++ b/src/cluster/subclasses.py @@ -1,21 +1,26 @@ -from docutils.nodes import label, General +from builtins import range -from data import DATAP, load_articledict, save_articledict,start_time, stop_time -from data.explore.feature_freq import analyze_feature_frequency +from cluster.util import plot_3d, plot_2d, create_cluster, cluster_scores, save_classes +from data import DATAP, load_articledict, load_seedlist, save_articledict,start_time, stop_time import pandas as pd -from sklearn.metrics import silhouette_score -from sklearn.decomposition import PCA, FastICA from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans, DBSCAN from sklearn.mixture import GaussianMixture -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.metrics.pairwise import laplacian_kernel import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -import seaborn as sns -from skfeature.function.similarity_based import SPEC, lap_score +from data import create_dataframe + # %matplotlib inline +CLUSTERS_ALGORITHMS = { + "kmeans": lambda n: KMeans(n_clusters=n), + "hierarchical": lambda n: AgglomerativeClustering(n_clusters=n), + "gaussian": lambda n: GaussianMixture(n_components=n), + +} + FEATURE_NAMES = [ + "InternalWikiLinks", "DbpediaInfoboxTemplate", "Lemmas", "URL_Braces_Words", @@ -25,166 +30,95 @@ N_CLUSTERS = 5 N_MOST_IMPORTANT_FEATURES = 1000 -def build_feature_vector(articles): - features_freqs = analyze_feature_frequency(articles, F_SetNames=FEATURE_NAMES) - # print(features_freqs) - return features_freqs.keys() - - -def build_feature_matrix(features, articles): - feature_matrix = {f: [] for f in features} - for a in articles.items(): - # print(a) - feature_freq = analyze_feature_frequency({a[0]: a[1]}, F_SetNames=FEATURE_NAMES) - for key in feature_matrix: - feature_matrix[key].append(1 if key in feature_freq else 0) - return feature_matrix - - -def create_dataframe(articles): - print("Creating data frame...") - features = build_feature_vector(articles) - feature_matrix = build_feature_matrix(features, articles) - feature_matrix["Name"] = [a for a in articles] - - df = pd.DataFrame(feature_matrix, columns=["Name"] + list(features)) - df = df.set_index("Name") - return df - - -def plot_2d(df, n_clusters): - # reduced = df - print("Plotting 2D dataframe reduction...") - reduced = pd.DataFrame(PCA(n_components=2).fit_transform(df.iloc[:, 0:-1])) - reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) - - for n in range(n_clusters): - plt.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) - plt.legend() - plt.show() - - -def plot_3d(df, n_clusters): - # reduced = df - print("Plotting 3D dataframe reduction...") - reduced = pd.DataFrame(PCA(n_components=3).fit_transform(df.iloc[:, 0:-1])) - reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) - - fig = plt.figure() - ax = Axes3D(fig) - - for n in range(n_clusters): - ax.scatter(reduced[reduced["Class"] == n].iloc[:, 0], 
reduced[reduced["Class"] == n].iloc[:, 1], reduced[reduced["Class"] == n].iloc[:, 2], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) - plt.legend() - plt.show() - -def explore_silhouette(df, n_clusters): - silhouettes = [] - for n in range(2, n_clusters, 1): - clustered = create_cluster(reduced_df, n) - silhouette = cluster_scores(clustered) - silhouettes.append((n, silhouette)) - - xs = [s[0] for s in silhouettes] - ys = [s[1] for s in silhouettes] - plt.plot(xs, ys, '-bX', markevery=True) - plt.show() - -def analyze_clusters(ad, df, n_clusters, relevant_features): - print("Analyzing clusters...") - # print("Relevant features: %s" % str(relevant_features)) - for n in range(n_clusters): - analyze_cluster(ad, df[df["Class"] == n], relevant_features) - - -def analyze_cluster(ad, cluster, relevant_features=None): - article_names = cluster.index.values - - articles = dict([a for a in ad.items() if a[0] in article_names]) - print("%s articles clustered in cluster %s" % (len(articles), cluster.loc[:, "Class"][0])) - print("Feature Frequency for cluster %s" % cluster.loc[:, "Class"][0]) - if relevant_features is not None: - print(dict(sorted(analyze_feature_frequency(articles, relevant_features=relevant_features).items(), key=lambda v: v[1], reverse=True))) - - -def calc_feature_importance(data): - print("Calculating feature importance matrix...") - X = SPEC.spec(data.values, style=0) - # X = SPEC.feature_ranking(X, style=0) - d_spec = pd.DataFrame([X], columns=data.columns.values, index=["SPEC"]) - d_spec = d_spec.sort_values("SPEC", axis=1, ascending=False) - d_spec.to_csv(DATAP + "/cluster/seed_SPEC.csv") - - return d_spec - - -def load_feature_importance_matrix(): - print("Loading feature importance matrix...") - matrix = pd.read_csv(DATAP + "/cluster/seed_SPEC.csv") - dropped_columns = [c for c in matrix.columns.values[1:] if c.split("::")[0] not in FEATURE_NAMES] - matrix = matrix.drop(labels=dropped_columns, axis=1) - - return matrix - - -def slice_n_relevant_features(data, feature_importance, n): - print("Selecting the %s most relevant features..." % str(n)) - print(data.shape) - print(feature_importance.shape) - relevant_columns = feature_importance.iloc[0, 0:n+1].index.values[1:] - return data.loc[:, relevant_columns] - - -def create_cluster(data, n): - X = data - y = pd.Series(AgglomerativeClustering(n_clusters=n).fit_predict(X), index=X.index) - - data = data.assign(Class=y, index=data.index) - return data - - -def cluster_scores(data): - silhouette = silhouette_score(data.iloc[:, 1:-1], labels=data["Class"].values) - return silhouette - - if __name__ == "__main__": - ad = load_articledict() - seed = dict([a for a in ad.items() if a[1]["Seed"] == 1 and a[1]["IsStub"] == 0]) - print("Got %s samples without stubs in seed." % str(len(seed))) - # seed = dict(list(seed.items())[0:2]) - # print(seed) - df = create_dataframe(seed) - - # calc_feature_importance(df) - feature_importance = load_feature_importance_matrix() - # feature_importance = calc_feature_importance(df) - - reduced_df = slice_n_relevant_features(df, feature_importance, n=N_MOST_IMPORTANT_FEATURES) - - # print(reduced_df.shape) - - explore_silhouette(reduced_df, n_clusters=30) - + ad = load_seedlist() + print("Got %s samples without stubs in seed." 
% str(len(ad))) + + configs = { + # "single_InternalWikiLinks_3": ["InternalWikiLinks"], + # "single_DbpediaInfoboxTemplate": ["DbpediaInfoboxTemplate"], + # "single_Lemmas": ["Lemmas"], + # "single_URL_Braces_Words": ["URL_Braces_Words"], + # "single_COPHypernym": ["COPHypernym"], + # "single_Wikipedia_Lists": ["Wikipedia_Lists"], + # "multi_Lemmas": ["Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + "multi_InternalLinks_5": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_simple_3": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"] + } + N = 5 + + cluster_func = lambda n: SpectralClustering( + n_clusters=n, + affinity="precomputed", + random_state=42 + ) + + for (name, config) in configs.items(): + print("Clustering %s" % name) + df = create_dataframe(ad, indicators=config) + + clustered = create_cluster(df, cluster_func(N), affinity_matrix=laplacian_kernel(df)) + plot_3d(clustered, name) + save_classes(clustered, name=name) + + # scores = cluster_scores(df, cluster_instantiator=cluster_func, max_n=10) + # ss = ss.append(pd.Series([name] + [s["silhouette"] for s in scores]), ignore_index=True) + # dbs = dbs.append(pd.Series([name] + [s["calinski_harabasz"] for s in scores]), ignore_index=True) + # + # ss.to_csv(DATAP + "/cluster/silhouettes.csv") + # dbs.to_csv(DATAP + "/cluster/calinski_harabasz_score.csv") + + # ss = pd.read_csv(DATAP + "/cluster/silhouettes.csv") + # dbs = pd.read_csv(DATAP + "/cluster/calinski_harabasz_score.csv") + # x = range(2, 10, 1) + # fig = plt.figure() + # ax = plt.subplot(111) + # + # plt.xlabel("No. of clusters") + # plt.ylabel("Avg. silhouette coeff.") + # plt.set_cmap(plt.get_cmap("Spectral")) + # + # for i in range(ss.shape[0]): + # label = ss.iloc[i, 0] + # + # y = ss.iloc[i, 1:] + # ax.plot(x, y, label=label) + # + # # Shrink current axis by 20% + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.4, box.height]) + # + # plt.show() + # print(dbs) + # + # ################################ + # x = range(2, 10, 1) + # fig = plt.figure() + # ax = plt.subplot(111) + # plt.xlabel("No. of clusters") + # plt.ylabel("Avg. 
calinski_harabasz_score") + # plt.set_cmap(plt.get_cmap("Spectral")) + # + # for i in range(dbs.shape[0]): + # label = dbs.iloc[i, 0] + # + # if label == "single_DbpediaInfoboxTemplate": + # continue + # + # y = dbs.iloc[i, 1:] + # ax.plot(x, y, label=label) + # + # # Shrink current axis by 20% + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.4, box.height]) + # + # ax.legend(loc="upper right", fontsize="xx-small") + # plt.show() - # plot_2d(clustered, N_CLUSTERS) - # analyze_clusters(ad, clustered, N_CLUSTERS, relevant_features=[c for c in feature_importance.columns.values[1:] if "COPHypernym" in c]) - # clustered["Class"].to_csv(DATAP + "/cluster/clusters.csv") - # - # print("Creating cluster...") - # ac = GaussianMixture(n_components=N_CLUSTERS) - # # ac.fit(df) - # # y = ac.labels_ - # - # df = df.assign(Class=Series(ac.fit_predict(df), index=df.index)) - # # print(y) - # plot_reduced(df, N_CLUSTERS) - # - # analyze_clusters(ad, df, N_CLUSTERS) diff --git a/src/cluster/summarylinks_subclasses.py b/src/cluster/summarylinks_subclasses.py new file mode 100644 index 0000000..ad1a0d1 --- /dev/null +++ b/src/cluster/summarylinks_subclasses.py @@ -0,0 +1,73 @@ +import json +from mine.util import flatten +from data import DATAP +from pprint import pprint +import numpy as np +import pandas as pd +from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering +from sklearn.mixture import GaussianMixture +from cluster.util import create_cluster, plot_2d, plot_3d, save_classes +from data.explore import feature_freq +from sklearn.decomposition import PCA + + +def get_feature_vector(links): + return sorted(list(set(flatten(links.values())))) + + +def load_links(): + return json.load(open(DATAP + "/seed_summary_links.json", "r")) + + +def get_feature_values(links, feature): + return np.array([feature in links[l] for l in links]).astype(int) + + +def create_dataframe(links): + features = get_feature_vector(links) + articles = list(links.keys()) + + matrix = { + f: get_feature_values(links, f) for f in features + } + matrix["name"] = articles + # print(matrix) + + df = pd.DataFrame(columns=features + ["name"], data=matrix) + df = df.set_index("name") + + df = preprocess(df) + + return df + + +def preprocess(df): + # drop all zero samples + df = df[(df != 0).any(axis=1)] + return df + + +if __name__ == "__main__": + links = load_links() + data = create_dataframe(links) + + # data = pd.DataFrame(PCA(n_components=20).fit_transform(data), index=data.index) + + h = SpectralClustering(n_clusters=3, affinity="cosine", random_state=42) + clustered = create_cluster(data, h) + + + + # print(clustered[clustered.columns[0:-2]]) + + save_classes(clustered) + + plot_3d(clustered, "Spectral n=3") + + grouped = clustered.groupby(by="Class").sum() + + + # grouped.to_csv(DATAP+"/cluster/cluster_analysis.csv") + + # pprint(set(flatten(links.values()))) + # pprint(len(get_feature_vector(links))) \ No newline at end of file diff --git a/src/cluster/util.py b/src/cluster/util.py new file mode 100644 index 0000000..e21bdcb --- /dev/null +++ b/src/cluster/util.py @@ -0,0 +1,101 @@ +from sklearn.decomposition import PCA, FastICA +import pandas as pd +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt +from sklearn.metrics import silhouette_score, calinski_harabasz_score +from sklearn.metrics.pairwise import laplacian_kernel +from data import DATAP +from data.explore.feature_freq import analyze_feature_frequency + + +def plot_2d(df, title=""): + # reduced = df 
+ print("Plotting 2D dataframe reduction...") + reduced = pd.DataFrame(PCA(n_components=2, random_state=42).fit_transform(df.iloc[:, 0:-2])) + reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) + + for n in set(reduced.loc[:, "Class"].values): + plt.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) + + plt.title(title) + + plt.legend() + plt.show() + + +def plot_3d(df, title=""): + # reduced = df + print("Plotting 3D dataframe reduction...") + reduced = pd.DataFrame(PCA(n_components=3, random_state=42).fit_transform(df.iloc[:, :-2])) + reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) + + fig = plt.figure() + ax = Axes3D(fig) + + for n in set(reduced.loc[:, "Class"].values): + ax.scatter(reduced[reduced["Class"] == n].iloc[:, 0], reduced[reduced["Class"] == n].iloc[:, 1], reduced[reduced["Class"] == n].iloc[:, 2], cmap=plt.get_cmap("Spectral"), label="Class %s" % n) + + plt.title(title) + plt.legend() + plt.show() + + +def create_cluster(data, clusterer, affinity_matrix=None): + print("Creating clusters...") + X = data + if affinity_matrix is not None: + X = affinity_matrix + y = pd.Series(clusterer.fit_predict(X), index=data.index) + + clustered = data.copy(deep=True) + clustered = clustered.assign(Class=y, index=data.index) + return clustered + + +def cluster_scores(df, cluster_instantiator, max_n): + silhouettes = [] + print("Calculating laplacian matrix") + affinity_matrix = laplacian_kernel(df) + for n in range(2, max_n, 1): + print("Step %s" % str(n)) + c = cluster_instantiator(n, affinity_matrix) + clustered = create_cluster(data=df, clusterer=c, affinity_matrix=affinity_matrix) + scores = __cluster_scores(clustered) + silhouettes.append(scores) + + return silhouettes + # xs = [s[0] for s in silhouettes] + # ys = [s[1] for s in silhouettes] + # plt.plot(xs, ys, '-bX', markevery=True) + # plt.show() + + +def __cluster_scores(data): + return { + "silhouette": silhouette_score(data.iloc[:, 1:-2], labels=data["Class"].values), + "calinski_harabasz": calinski_harabasz_score(data.iloc[:, 1:-2], labels=data["Class"].values) + } + + +def save_classes(data, name): + data["Class"].to_csv(DATAP + "/cluster/%s.csv" % name) + + +def analyze_clusters(ad, df, n_clusters, relevant_features): + print("Analyzing clusters...") + # print("Relevant features: %s" % str(relevant_features)) + for n in range(n_clusters): + analyze_cluster(ad, df[df["Class"] == n], relevant_features) + + +def analyze_cluster(ad, cluster, relevant_features=None): + article_names = cluster.index.values + + articles = dict([a for a in ad.items() if a[0] in article_names]) + print("%s articles clustered in cluster %s" % (len(articles), cluster.loc[:, "Class"][0])) + print("Feature Frequency for cluster %s" % cluster.loc[:, "Class"][0]) + if relevant_features is not None: + print(dict(sorted(analyze_feature_frequency(articles, relevant_features=relevant_features).items(), + key=lambda v: v[1], reverse=True))) + + diff --git a/src/data/__init__.py b/src/data/__init__.py index 17f5b1f..79bb222 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -2,12 +2,20 @@ # from mine.yago import get_artificial_languages from json import load, dump import time +import pandas as pd +from data.explore.feature_freq import analyze_feature_frequency # UTIL DATAP = "/Users/dnikonov/Uni/wikionto/data/mined" AP = DATAP + "/articledict.json" +def load_seedlist(): + ad = 
load_articledict() + seed = dict([a for a in ad.items() if a[1]["Seed"] == 1 and a[1]["IsStub"] == 0]) + return seed + + def load_articledict(): return load(open(AP, "r", encoding="utf-8")) @@ -51,6 +59,41 @@ def flat_list(l): return [item for sublist in l for item in sublist] +def create_dataframe(articles, indicators): + print("Creating data frame...") + features = _build_feature_vector(articles, indicators) + feature_matrix = _build_feature_matrix(features, articles, indicators) + feature_matrix["Name"] = [a for a in articles] + + df = pd.DataFrame(feature_matrix, columns=["Name"] + list(features)) + df = df.set_index("Name") + df = _preprocess(df) + + return df + + +def _preprocess(df): + # drop all zero samples + df = df[(df != 0).any(axis=1)] + return df + + +def _build_feature_vector(articles, indicators): + features_freqs = analyze_feature_frequency(articles, F_SetNames=indicators) + # print(features_freqs) + return features_freqs.keys() + + +def _build_feature_matrix(features, articles, indicators): + feature_matrix = {f: [] for f in features} + for a in articles.items(): + # print(a) + feature_freq = analyze_feature_frequency({a[0]: a[1]}, F_SetNames=indicators) + for key in feature_matrix: + feature_matrix[key].append(1 if key in feature_freq else 0) + return feature_matrix + + # SCOPING DEPTH = 8 ROOTS = ["Category:Computing_platforms", "Category:Software"] diff --git a/src/data/explore/explore_seed_nouns.py b/src/data/explore/explore_seed_nouns.py index 52bb827..3588cc6 100644 --- a/src/data/explore/explore_seed_nouns.py +++ b/src/data/explore/explore_seed_nouns.py @@ -34,11 +34,13 @@ def explore(): f = open(DATAP + '/articledict.json', 'r', encoding="utf8") d = load(f) f.close() - cl_sums = list(d[cl]["Summary"] for cl in d if ("Summary" in d[cl]) and (d[cl]["Seed"] == 1)) + cl_sums = list(d[cl]["Summary"] for cl in d if ("Summary" in d[cl])) pool = ThreadPool(processes=10) nnlists = pool.map(get_single, cl_sums) nouns_f = dict() for nnlist in nnlists: + if nnlist is None: + continue for nn in nnlist: if nn in nouns_f: nouns_f[nn] += 1 @@ -46,7 +48,7 @@ def explore(): nouns_f[nn] = 1 nouns_f = OrderedDict(sorted(nouns_f.items(), key=lambda item: item[1])) - f = open(DATAP + '/explore_seed_nouns.json', 'w', encoding='utf8') + f = open(DATAP + '/explore_nouns.json', 'w+', encoding='utf8') dump(nouns_f, f, indent=2) f.flush() f.close() diff --git a/src/data/explore/feature_freq.py b/src/data/explore/feature_freq.py index 518383f..1642e36 100644 --- a/src/data/explore/feature_freq.py +++ b/src/data/explore/feature_freq.py @@ -1,5 +1,3 @@ -from data import load_articledict - def analyze_feature_frequency(ad, **kwargs): freq = dict() diff --git a/src/mine/mine_summary_links.py b/src/mine/mine_summary_links.py new file mode 100644 index 0000000..da99fc5 --- /dev/null +++ b/src/mine/mine_summary_links.py @@ -0,0 +1,75 @@ +from mine.wiki import get_summary_links, get_redirect +from data import load_seedlist, DATAP, load, load_articledict, save_articledict +from multiprocessing import Pool +from mine.util import flatten +import json + + +def mine_summary_links(): + seed = load_seedlist() + titles = list(seed.keys()) + + pool = Pool(processes=10) + summary_links = dict(pool.map(article_summary_links, titles)) + + with open(DATAP + "/seed_summary_links.json", "w+") as out: + json.dump(summary_links, out, indent=4) + + +def article_summary_links(title): + target_title = get_redirect(title) + print("Retrieving for %s..." 
% target_title) + links = get_summary_links(target_title) + + links = sanitize_links(links) + links = resolve_redirects(links) + + return title, links + + +def bad_link(link): + return any([ + "Wikipedia:" in link, + "Template:" in link, + "Template talk:" in link, + "Help:" in link, + "Category:" in link, + "Talk:" in link, + "Software categories" in link, + "Software developer" in link, + "Software license" in link, + "Software release life cycle" in link, + ]) + + +def sanitize_links(seed_links): + return [l for l in seed_links if not bad_link(l)] + + +def resolve_redirects(seed_links): + resolved_tuples = [resolve_redirect(l) for l in seed_links] + return [t[1] for t in resolved_tuples] + + +def resolve_redirect(title): + target = get_redirect(title) + return title, target + + +if __name__ == "__main__": + # mine_summary_links() + # seed_links = json.load(open(DATAP + "/seed_summary_links.json", "r")) + # + # seed_links = { + # s[0]: [l for l in s[1] if not bad_link(l)] for s in seed_links.items() + # } + # + # json.dump(seed_links, open(DATAP + "/seed_summary_links.json", "w+"), indent=4) + + ad = load_articledict() + seed_links = json.load(open(DATAP + "/seed_summary_links.json", "r")) + + for s in seed_links: + ad[s]["InternalWikiLinks"] = seed_links[s] + + save_articledict(ad) diff --git a/src/mine/util.py b/src/mine/util.py index 641226c..488177d 100644 --- a/src/mine/util.py +++ b/src/mine/util.py @@ -1,2 +1,6 @@ def normalize_name(name): - return name \ No newline at end of file + return name + + +def flatten(lists): + return [item for sublist in lists for item in sublist] \ No newline at end of file diff --git a/src/mine/wiki.py b/src/mine/wiki.py index f032dfe..c5a507b 100644 --- a/src/mine/wiki.py +++ b/src/mine/wiki.py @@ -7,10 +7,10 @@ HEADER = {'User-Agent': 'WikiOnto'} -def wiki_request(params): +def wiki_request(params, action="query"): params['format'] = 'json' params['formatversion'] = 2 - params['action'] = 'query' + params['action'] = action params['utf8'] = '' try: s = Session() @@ -85,8 +85,11 @@ def getcontent(revid): def getlinks(title): - params = {'prop': 'links' - , 'titles': title} + params = { + 'prop': 'links', + 'titles': title + } + wikijson = wiki_request(params) # print(wikijson) links = [] @@ -120,6 +123,56 @@ def getlinks(title): return links +def get_redirect(title): + params = { + 'titles': title, + 'redirects': '1' + } + try: + wikijson = wiki_request(params) + + if "query" not in wikijson: + print("None at query " + title) + return [] + if "pages" not in wikijson["query"]: + print("None at pages " + title) + return [] + if wikijson["query"]["pages"] is None: + print("None at values " + title) + return [] + + target_page = next(iter(wikijson["query"]["pages"])) + + if "title" not in target_page: + print("None at title") + return [] + + print("%s -> %s" % (title, target_page["title"])) + + return target_page["title"] + except KeyError: + print("Redirect resolved in KeyError for %s" % title) + return [] + + +def get_summary_links(title): + params = { + 'page' : title, + 'section': 0 + } + + try: + wikijson = wiki_request(params, action="parse") + + # print(wikijson) + raw_links = wikijson["parse"]["links"] + + return [l["title"] for l in raw_links] + except KeyError: + print("KeyError for %s." 
% title) + return [] + + def getlinks_multi(titles): params = {'prop': 'links', 'titles': titles} From f4030e69c3652405e7dcf3827c97b911627930d2 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Thu, 27 Jun 2019 17:17:41 +0200 Subject: [PATCH 5/8] So work much wow --- src/cluster/subclasses.py | 202 ++++++++++++++++++++++++++------------ src/cluster/util.py | 42 +++++--- src/data/__init__.py | 2 +- 3 files changed, 169 insertions(+), 77 deletions(-) diff --git a/src/cluster/subclasses.py b/src/cluster/subclasses.py index b3d4f13..33195ec 100644 --- a/src/cluster/subclasses.py +++ b/src/cluster/subclasses.py @@ -1,112 +1,186 @@ from builtins import range -from cluster.util import plot_3d, plot_2d, create_cluster, cluster_scores, save_classes +from cluster.util import plot_3d, plot_2d, create_cluster, cluster_scores, save_classes, best_pca from data import DATAP, load_articledict, load_seedlist, save_articledict,start_time, stop_time import pandas as pd -from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans, DBSCAN +from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans, DBSCAN, OPTICS from sklearn.mixture import GaussianMixture -from sklearn.metrics.pairwise import laplacian_kernel +from sklearn.metrics.pairwise import laplacian_kernel, rbf_kernel +from sklearn.neighbors import NearestNeighbors import matplotlib.pyplot as plt from data import create_dataframe +import numpy as np +from sklearn.decomposition import PCA +from pprint import pprint +def plot_score(ss, max_n, title): + x = range(2, max_n, 1) + # fig = plt.figure() + ax = plt.subplot(111) + + plt.xlabel("# clusters") + plt.ylabel(title) + plt.set_cmap(plt.get_cmap("Spectral")) + + # ax.set_xticks([2, 3, 4, 5] + [10, 15, 20, 25, 30]) + + for i in range(ss.shape[0]): + label = ss.iloc[i, 0] + + y = ss.iloc[i, 1:] + ax.plot(x, y, label=label) + + max_x = np.argmax(y) + 1 + max_y = np.max(y) -# %matplotlib inline + print(max_x) + print(max_y) -CLUSTERS_ALGORITHMS = { - "kmeans": lambda n: KMeans(n_clusters=n), - "hierarchical": lambda n: AgglomerativeClustering(n_clusters=n), - "gaussian": lambda n: GaussianMixture(n_components=n), + ax.plot([max_x], [max_y], color="red", marker="o") -} + # Shrink current axis by 20% + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.4, box.height]) + + plt.grid(True, linestyle="--") + plt.legend() + plt.show() + + +def eval_manual(clustered, apps, libs, frameworks): + + scores = { + "apps": [], + "libs": [], + "frameworks": [] + } + print("a size", len(apps)) + print("l size", len(libs)) + print("f size", len(frameworks)) + + + clustered = clustered.assign(Name=pd.Series(clustered.index, index=clustered.index)) + + for clazz in sorted(set(clustered.loc[:, "Class"].values)): + c_i = clustered[clustered["Class"] == clazz] + print("Class", clazz) + print(c_i.shape) + app_score = np.sum(np.isin(c_i["Name"].values, [a[0] for a in apps])) / len(apps) + framework_score = np.sum(np.isin(c_i["Name"].values, [a[0] for a in frameworks])) / len(frameworks) + lib_score = np.sum(np.isin(c_i["Name"].values, [a[0] for a in libs])) / len(libs) + + print("a", np.sum(np.isin(c_i["Name"].values, [a[0] for a in apps]))) + print("f", np.sum(np.isin(c_i["Name"].values, [a[0] for a in frameworks]))) + print("l", np.sum(np.isin(c_i["Name"].values, [a[0] for a in libs]))) + + scores["apps"].append(app_score) + scores["libs"].append(lib_score) + scores["frameworks"].append(framework_score) + + pprint(scores) + print("+++++++++++++++++++++++++++++++++") 
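# A minimal sketch (not part of the original patch): condensing the per-class
# overlap that eval_manual prints above into a best-matching-cluster summary.
# It assumes the same `clustered` DataFrame (article names as index, a "Class"
# column) and the (name, record) tuples used for apps/libs/frameworks in the
# main block; `summarize_overlap` and `labelled_groups` are hypothetical names,
# not existing identifiers in this repository.
import numpy as np

def summarize_overlap(clustered, labelled_groups):
    # For each manually labelled group, report the cluster that captures the
    # largest share of its members, i.e. a simple purity-style score.
    summary = {}
    for group, members in labelled_groups.items():
        names = [m[0] for m in members]
        shares = {}
        for clazz in sorted(set(clustered["Class"].values)):
            in_cluster = clustered[clustered["Class"] == clazz].index.values
            shares[clazz] = np.sum(np.isin(in_cluster, names)) / max(len(names), 1)
        best = max(shares, key=shares.get)
        summary[group] = {"best_cluster": best, "share": shares[best]}
    return summary

# Hypothetical usage, mirroring the grouping built in the main block below:
# pprint(summarize_overlap(clustered, {"apps": apps, "libs": libs, "frameworks": frameworks}))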
-FEATURE_NAMES = [ - "InternalWikiLinks", - "DbpediaInfoboxTemplate", - "Lemmas", - "URL_Braces_Words", - "COPHypernym", - "Wikipedia_Lists" -] -N_CLUSTERS = 5 -N_MOST_IMPORTANT_FEATURES = 1000 if __name__ == "__main__": ad = load_seedlist() - print("Got %s samples without stubs in seed." % str(len(ad))) + + frameworks = [a for a in ad.items() if "framework" in a[1]["COPHypernym"]] + apps = [a for a in ad.items() if "application" in a[1]["COPHypernym"]] + # url_frameworks = [a for a in ad.items() if "framework" in a[1]["URL_Braces_Words"] and "framework" not in a[1]["COPHypernym"]] + libs = [a for a in ad.items() if "library" in a[1]["COPHypernym"]] + + # seed = dict(frameworks + apps + libs) + seed = ad + print("Got %s samples for clustering" % str(len(seed))) configs = { - # "single_InternalWikiLinks_3": ["InternalWikiLinks"], + # "single_InternalWikiLinks": ["InternalWikiLinks"], # "single_DbpediaInfoboxTemplate": ["DbpediaInfoboxTemplate"], # "single_Lemmas": ["Lemmas"], # "single_URL_Braces_Words": ["URL_Braces_Words"], # "single_COPHypernym": ["COPHypernym"], # "single_Wikipedia_Lists": ["Wikipedia_Lists"], # "multi_Lemmas": ["Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], - "multi_InternalLinks_5": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], - # "multi_simple_3": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"] + # "multi_Lemmas_pca": ["Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + "multi_InternalLinks": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_InternalLinks_pca": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_simple": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_simple_pca": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + "multi_no_hypernym": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], + "multi_no_hypernym_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], + # "multi_all": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_all_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + } + # N = 6 + + best_N = { + "multi_InternalLinks": 6, + "multi_no_hypernym": 4, + "multi_no_hypernym_pca": 5 } - N = 5 - cluster_func = lambda n: SpectralClustering( - n_clusters=n, - affinity="precomputed", - random_state=42 - ) + # max_n = 10 + cluster_func = lambda n: AgglomerativeClustering(n_clusters=n) + ss = pd.DataFrame() + dbs = pd.DataFrame() for (name, config) in configs.items(): - print("Clustering %s" % name) - df = create_dataframe(ad, indicators=config) + print("Clustering %s with n=%i" % (name, best_N[name])) + df = create_dataframe(seed, indicators=config) - clustered = create_cluster(df, cluster_func(N), affinity_matrix=laplacian_kernel(df)) - plot_3d(clustered, name) - save_classes(clustered, name=name) + # vt = VarianceThreshold(threshold=0.05) + # vt = vt.fit(df) - # scores = cluster_scores(df, cluster_instantiator=cluster_func, max_n=10) + # df = df.iloc[:, vt.get_support(indices=True)] + if "_pca" in name: + print("Performing pca...") + df = pd.DataFrame(best_pca(df), index=df.index) + + N = best_N[name] + + c_name = name + "_" + str(N) + + # print("Calculating laplacian matrix...") + # X = laplacian_kernel(df) + + clustered = create_cluster(df, 
cluster_func(N)) + # clustered = df.copy(deep=True) + # clustered = pd.DataFrame(best_pca(df), index=df.index).assign(Class=pd.Series(np.zeros(df.shape[0], dtype=np.int), index=df.index)) + eval_manual(clustered, apps=apps, libs=libs, frameworks=frameworks) + + # plot_2d(clustered, c_name) + # save_classes(clustered, name=c_name) + + # scores = cluster_scores(df, cluster_instantiator=cluster_func, max_n=max_n, kernel=None) # ss = ss.append(pd.Series([name] + [s["silhouette"] for s in scores]), ignore_index=True) # dbs = dbs.append(pd.Series([name] + [s["calinski_harabasz"] for s in scores]), ignore_index=True) - # - # ss.to_csv(DATAP + "/cluster/silhouettes.csv") + # ss.to_csv(DATAP + "/cluster/silhouettes_simpler.csv") # dbs.to_csv(DATAP + "/cluster/calinski_harabasz_score.csv") - - # ss = pd.read_csv(DATAP + "/cluster/silhouettes.csv") - # dbs = pd.read_csv(DATAP + "/cluster/calinski_harabasz_score.csv") - # x = range(2, 10, 1) - # fig = plt.figure() - # ax = plt.subplot(111) # - # plt.xlabel("No. of clusters") - # plt.ylabel("Avg. silhouette coeff.") - # plt.set_cmap(plt.get_cmap("Spectral")) - # - # for i in range(ss.shape[0]): - # label = ss.iloc[i, 0] - # - # y = ss.iloc[i, 1:] - # ax.plot(x, y, label=label) + # plot_score(ss, max_n, "Silhouette coeff.") + # plot_score(dbs, max_n, "Calinski Harabasz") + + # ss = pd.read_csv(DATAP + "/cluster/silhouettes.csv", index_col=0) # - # # Shrink current axis by 20% - # box = ax.get_position() - # ax.set_position([box.x0, box.y0, box.width * 0.4, box.height]) + # ss = ss.iloc[0:6, :] # - # plt.show() + # # dbs = pd.read_csv(DATAP + "/cluster/calinski_harabasz_score.csv") - # print(dbs) # - # ################################ - # x = range(2, 10, 1) + # # print(dbs) + # # + # # ################################ + # x = range(2, max_n, 1) # fig = plt.figure() # ax = plt.subplot(111) - # plt.xlabel("No. of clusters") - # plt.ylabel("Avg. 
calinski_harabasz_score") + # plt.xlabel("# clusters") + # plt.ylabel("Calinski_harabasz_score") # plt.set_cmap(plt.get_cmap("Spectral")) # # for i in range(dbs.shape[0]): # label = dbs.iloc[i, 0] # - # if label == "single_DbpediaInfoboxTemplate": - # continue - # # y = dbs.iloc[i, 1:] # ax.plot(x, y, label=label) # diff --git a/src/cluster/util.py b/src/cluster/util.py index e21bdcb..c0cfefa 100644 --- a/src/cluster/util.py +++ b/src/cluster/util.py @@ -6,12 +6,13 @@ from sklearn.metrics.pairwise import laplacian_kernel from data import DATAP from data.explore.feature_freq import analyze_feature_frequency +import numpy as np +from sklearn.decomposition import PCA - -def plot_2d(df, title=""): +def plot_2d(df, title="", export=False): # reduced = df print("Plotting 2D dataframe reduction...") - reduced = pd.DataFrame(PCA(n_components=2, random_state=42).fit_transform(df.iloc[:, 0:-2])) + reduced = pd.DataFrame(PCA(n_components=2, random_state=42).fit_transform(df.loc[:, df.columns != "Class"])) reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) for n in set(reduced.loc[:, "Class"].values): @@ -22,11 +23,14 @@ def plot_2d(df, title=""): plt.legend() plt.show() + if export: + plt.savefig(DATAP + "/cluster/" + title + ".png") + -def plot_3d(df, title=""): +def plot_3d(df, title="", export=False): # reduced = df print("Plotting 3D dataframe reduction...") - reduced = pd.DataFrame(PCA(n_components=3, random_state=42).fit_transform(df.iloc[:, :-2])) + reduced = pd.DataFrame(PCA(n_components=3, random_state=42).fit_transform(df.loc[:, df.columns != "Class"])) reduced = reduced.assign(Class=pd.Series(df.loc[:, "Class"].values, index=reduced.index)) fig = plt.figure() @@ -39,6 +43,9 @@ def plot_3d(df, title=""): plt.legend() plt.show() + if export: + plt.savefig(DATAP + "/cluster/" + title + ".png") + def create_cluster(data, clusterer, affinity_matrix=None): print("Creating clusters...") @@ -48,17 +55,21 @@ def create_cluster(data, clusterer, affinity_matrix=None): y = pd.Series(clusterer.fit_predict(X), index=data.index) clustered = data.copy(deep=True) - clustered = clustered.assign(Class=y, index=data.index) + clustered = clustered.assign(Class=y) return clustered -def cluster_scores(df, cluster_instantiator, max_n): +def cluster_scores(df, cluster_instantiator, max_n, kernel=laplacian_kernel): silhouettes = [] - print("Calculating laplacian matrix") - affinity_matrix = laplacian_kernel(df) + + affinity_matrix = None + if kernel is not None: + print("Calculating laplacian matrix") + affinity_matrix = kernel(df.values) + for n in range(2, max_n, 1): print("Step %s" % str(n)) - c = cluster_instantiator(n, affinity_matrix) + c = cluster_instantiator(n) clustered = create_cluster(data=df, clusterer=c, affinity_matrix=affinity_matrix) scores = __cluster_scores(clustered) silhouettes.append(scores) @@ -72,8 +83,8 @@ def cluster_scores(df, cluster_instantiator, max_n): def __cluster_scores(data): return { - "silhouette": silhouette_score(data.iloc[:, 1:-2], labels=data["Class"].values), - "calinski_harabasz": calinski_harabasz_score(data.iloc[:, 1:-2], labels=data["Class"].values) + "silhouette": silhouette_score(data.iloc[:, 1:-1], labels=data.loc[:, "Class"].values), + "calinski_harabasz": calinski_harabasz_score(data.iloc[:, 1:-1], labels=data.loc[:, "Class"].values) } @@ -99,3 +110,10 @@ def analyze_cluster(ad, cluster, relevant_features=None): key=lambda v: v[1], reverse=True))) +def best_pca(data): + pca = PCA().fit(data) + best_n_components = 
np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.998) + + print("...reducing to %s components." % str(best_n_components[0][0])) + + return PCA(n_components=best_n_components[0][0]).fit_transform(data) diff --git a/src/data/__init__.py b/src/data/__init__.py index 79bb222..a08a881 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -12,7 +12,7 @@ def load_seedlist(): ad = load_articledict() - seed = dict([a for a in ad.items() if a[1]["Seed"] == 1 and a[1]["IsStub"] == 0]) + seed = dict([a for a in ad.items() if a[1]["Seed"] and valid_article(a[0], ad)]) return seed From b6ebf260ccf3c160c566de9180fc3987438ec398 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Thu, 27 Jun 2019 20:36:40 +0200 Subject: [PATCH 6/8] add standardsclaer --- src/cluster/subclasses.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/cluster/subclasses.py b/src/cluster/subclasses.py index 33195ec..bf6069f 100644 --- a/src/cluster/subclasses.py +++ b/src/cluster/subclasses.py @@ -12,6 +12,7 @@ import numpy as np from sklearn.decomposition import PCA from pprint import pprint +from sklearn.preprocessing import StandardScaler def plot_score(ss, max_n, title): x = range(2, max_n, 1) @@ -102,16 +103,16 @@ def eval_manual(clustered, apps, libs, frameworks): # "single_Wikipedia_Lists": ["Wikipedia_Lists"], # "multi_Lemmas": ["Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], # "multi_Lemmas_pca": ["Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], - "multi_InternalLinks": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_InternalLinks": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], # "multi_InternalLinks_pca": ["InternalWikiLinks", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], # "multi_simple": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], # "multi_simple_pca": ["URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], - "multi_no_hypernym": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], - "multi_no_hypernym_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], - # "multi_all": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + # "multi_no_hypernym": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], + # "multi_no_hypernym_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], + "multi_all": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], # "multi_all_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], } - # N = 6 + N = 4 best_N = { "multi_InternalLinks": 6, @@ -125,7 +126,7 @@ def eval_manual(clustered, apps, libs, frameworks): ss = pd.DataFrame() dbs = pd.DataFrame() for (name, config) in configs.items(): - print("Clustering %s with n=%i" % (name, best_N[name])) + print("Clustering %s" % name) df = create_dataframe(seed, indicators=config) # vt = VarianceThreshold(threshold=0.05) @@ -136,19 +137,22 @@ def eval_manual(clustered, apps, libs, frameworks): print("Performing pca...") df = pd.DataFrame(best_pca(df), index=df.index) - N = best_N[name] + # N = best_N[name] c_name = name + "_" + str(N) # print("Calculating laplacian matrix...") # X = 
laplacian_kernel(df) - clustered = create_cluster(df, cluster_func(N)) - # clustered = df.copy(deep=True) - # clustered = pd.DataFrame(best_pca(df), index=df.index).assign(Class=pd.Series(np.zeros(df.shape[0], dtype=np.int), index=df.index)) - eval_manual(clustered, apps=apps, libs=libs, frameworks=frameworks) + # clustered = create_cluster(df, cluster_func(N)) + clustered = df.copy(deep=True) + clustered = StandardScaler(with_mean=False).fit_transform(clustered) + clustered = pd.DataFrame(best_pca(df), index=df.index).assign(Class=pd.Series(np.zeros(df.shape[0], dtype=np.int), index=df.index)) + # eval_manual(clustered, apps=apps, libs=libs, frameworks=frameworks) - # plot_2d(clustered, c_name) + + + plot_2d(clustered, c_name) # save_classes(clustered, name=c_name) # scores = cluster_scores(df, cluster_instantiator=cluster_func, max_n=max_n, kernel=None) From 185bf2c83b3b6c6807d08bf74302f009e8c25fe9 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Fri, 30 Aug 2019 15:40:40 +0200 Subject: [PATCH 7/8] Work --- src/cluster/subclasses.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cluster/subclasses.py b/src/cluster/subclasses.py index bf6069f..d9e4dd2 100644 --- a/src/cluster/subclasses.py +++ b/src/cluster/subclasses.py @@ -110,7 +110,7 @@ def eval_manual(clustered, apps, libs, frameworks): # "multi_no_hypernym": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], # "multi_no_hypernym_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "Wikipedia_Lists"], "multi_all": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], - # "multi_all_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], + "multi_all_pca": ["DbpediaInfoboxTemplate", "InternalWikiLinks", "Lemmas", "URL_Braces_Words", "COPHypernym", "Wikipedia_Lists"], } N = 4 @@ -144,15 +144,15 @@ def eval_manual(clustered, apps, libs, frameworks): # print("Calculating laplacian matrix...") # X = laplacian_kernel(df) - # clustered = create_cluster(df, cluster_func(N)) - clustered = df.copy(deep=True) - clustered = StandardScaler(with_mean=False).fit_transform(clustered) - clustered = pd.DataFrame(best_pca(df), index=df.index).assign(Class=pd.Series(np.zeros(df.shape[0], dtype=np.int), index=df.index)) - # eval_manual(clustered, apps=apps, libs=libs, frameworks=frameworks) + clustered = create_cluster(df, cluster_func(N)) + # clustered = df.copy(deep=True) + # clustered = StandardScaler(with_mean=False).fit_transform(clustered) + # clustered = pd.DataFrame(best_pca(df), index=df.index).assign(Class=pd.Series(np.zeros(df.shape[0], dtype=np.int), index=df.index)) + eval_manual(clustered, apps=apps, libs=libs, frameworks=frameworks) - plot_2d(clustered, c_name) + # plot_2d(clustered, c_name) # save_classes(clustered, name=c_name) # scores = cluster_scores(df, cluster_instantiator=cluster_func, max_n=max_n, kernel=None) From 71a8ff1051e07edede5ab6e49327a7d04e249617 Mon Sep 17 00:00:00 2001 From: Dmitri Nikonov Date: Sat, 7 Sep 2019 17:40:40 +0200 Subject: [PATCH 8/8] change datap --- src/data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/__init__.py b/src/data/__init__.py index a08a881..6d45177 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -6,7 +6,7 @@ from data.explore.feature_freq import analyze_feature_frequency # UTIL -DATAP = 
"/Users/dnikonov/Uni/wikionto/data/mined" +DATAP = "/Users/dnikonov/Uni/wikionto/data" AP = DATAP + "/articledict.json"