diff --git a/.gitignore b/.gitignore index 81151ab..bc777a2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ -/target/ +target/ **/*.rs.bk Cargo.lock -/target/ -**/*.rs.bk - +modules/** +*.bz2 +*.xml Session.vim diff --git a/src/bin/build_definitions_db.rs b/src/bin/build_definitions_db.rs index 8499a19..2c0a43f 100644 --- a/src/bin/build_definitions_db.rs +++ b/src/bin/build_definitions_db.rs @@ -1,18 +1,18 @@ extern crate define3; +extern crate getopts; extern crate regex; extern crate rusqlite; -extern crate getopts; -use define3::{Module, Template, Word}; -use define3::PageContent; use define3::parse_wikitext::parse_wikitext; +use define3::PageContent; +use define3::{Module, Template, Word}; use getopts::Options; use regex::Regex; use rusqlite::{Connection, Transaction}; use std::collections::{HashMap, HashSet}; -use std::fs::File; use std::fs; +use std::fs::File; use std::io::Write; use std::path::Path; @@ -28,7 +28,10 @@ fn main() { "Japanese", "Korean", "Lojban", - ].iter().cloned().collect(); + ] + .iter() + .cloned() + .collect(); // TODO: figure out POS list automatically let parts_of_speech: HashSet<&str> = [ @@ -51,14 +54,20 @@ fn main() { "Rafsi", "Romanization", "Verb", - ].iter().cloned().collect(); + ] + .iter() + .cloned() + .collect(); let args: Vec = std::env::args().collect(); let mut opts = Options::new(); opts.optflag("h", "help", "print this help text"); let matches = opts.parse(&args[1..]).unwrap(); if matches.opt_present("h") || matches.free.len() != 1 { - let brief = format!("Usage: {} PATH_TO_enwiktionary-YYYYMMDD-pages-meta-current.xml [options]", args[0]); + let brief = format!( + "Usage: {} PATH_TO_enwiktionary-YYYYMMDD-pages-meta-current.xml [options]", + args[0] + ); print!("{}", opts.usage(&brief)); return; } @@ -88,7 +97,8 @@ fn main() { content text not null )", [], - ).unwrap(); + ) + .unwrap(); tx.execute("DROP TABLE IF EXISTS modules", []).unwrap(); tx.execute( @@ -97,7 +107,8 @@ fn main() { content text not null )", [], - ).unwrap(); + ) + .unwrap(); let re_noinclude = Regex::new(r"(?P(?s:.)*?)").unwrap(); let re_includeonly = Regex::new(r"(?P(?s:.)*?)").unwrap(); @@ -123,17 +134,19 @@ fn main() { tx.execute( "insert into templates (name, content) values (?1, ?2)", &[&title, &content.as_str()], - ).unwrap(); + ) + .unwrap(); templates.insert(title.to_owned(), content); } else if page.title.starts_with("Module:") { let title = &page.title[7..]; tx.execute( "insert into modules (name, content) values (?1, ?2)", [&title, &page.content.as_str()], - ).unwrap(); + ) + .unwrap(); println!("Saved module: {}", page.title); - let path = format!("/trove/data/enwikt/modules/{}.lua", page.title); + let path = format!("modules/{}.lua", page.title); let path = Path::new(&path); fs::create_dir_all(path.parent().unwrap()).unwrap(); let mut file = File::create(path).unwrap(); @@ -153,7 +166,8 @@ fn main() { definition text not null )", [], - ).unwrap(); + ) + .unwrap(); define3::parse_xml::for_pages(&xml_path, |page| { let page_content = match page.title.split(':').next() { @@ -196,7 +210,8 @@ fn main() { &meaning.part_of_speech, &defn.into_owned(), ], - ).unwrap(); + ) + .unwrap(); } } _ => (), @@ -207,7 +222,8 @@ fn main() { "create index words_name_idx on words(name); create index words_language_idx on words(language); create index words_part_of_speech_idx on words(part_of_speech);", - ).unwrap(); + ) + .unwrap(); tx.commit().unwrap(); } diff --git a/src/bin/define.rs b/src/bin/define.rs index 394e60f..6670d39 100644 --- a/src/bin/define.rs +++ b/src/bin/define.rs @@ -1,10 +1,10 @@ extern crate colored; extern crate define3; extern crate getopts; +extern crate nom; extern crate regex; extern crate rusqlite; extern crate textwrap; -extern crate nom; use define3::Meaning; @@ -20,14 +20,18 @@ fn get_defns_by_lang( conn: &Connection, word: &str, ) -> Box>>> { - let mut stmt = conn.prepare( - "SELECT language, part_of_speech, definition FROM words WHERE name = ?1", - ).unwrap(); - let word_iter = stmt.query_map(&[&word], |row| Ok (Meaning { - language: row.get(0).unwrap(), - part_of_speech: row.get(1).unwrap(), - definition: row.get(2).unwrap(), - })).unwrap(); + let mut stmt = conn + .prepare("SELECT language, part_of_speech, definition FROM words WHERE name = ?1") + .unwrap(); + let word_iter = stmt + .query_map(&[&word], |row| { + Ok(Meaning { + language: row.get(0).unwrap(), + part_of_speech: row.get(1).unwrap(), + definition: row.get(2).unwrap(), + }) + }) + .unwrap(); let mut langs: BTreeMap>> = BTreeMap::new(); @@ -47,6 +51,7 @@ fn get_defns_by_lang( // functions and often call out into Lua code. // https://www.mediawiki.org/wiki/Help:Extension:ParserFunctions // https://www.mediawiki.org/wiki/Extension:Scribunto +#[allow(dead_code)] fn expand_template(conn: &Connection, args: &[&str]) -> String { fn get_template_content(conn: &Connection, name: &str) -> String { let result = conn.query_row( @@ -59,6 +64,7 @@ fn expand_template(conn: &Connection, args: &[&str]) -> String { } get_template_content(conn, args[0]) } +#[warn(dead_code)] // For now, we just hardcode a couple common templates. fn replace_template(_conn: &Connection, caps: &Captures) -> String { @@ -68,24 +74,15 @@ fn replace_template(_conn: &Connection, caps: &Captures) -> String { // _ => expand_template(conn, &elems) //} match elems[0] { - "," => - ",".to_owned(), - "ngd" | "unsupported" | "non-gloss definition" => - elems[1].to_owned(), - "alternative form of" => - format!("Alternative form of {}", elems[1]), - "ja-romanization of" => - format!("Rōmaji transcription of {}", elems[1]), - "sumti" => - format!("x{}", elems[1]), - "ja-def" => - format!("{}:", elems[1]), - "qualifier" => - format!("({})", elems[1]), - "lb" => - format!("({})", elems[2]), - "m" | "l" => - elems[2].to_owned(), + "," => ",".to_owned(), + "ngd" | "unsupported" | "non-gloss definition" => elems[1].to_owned(), + "alternative form of" => format!("Alternative form of {}", elems[1]), + "ja-romanization of" => format!("Rōmaji transcription of {}", elems[1]), + "sumti" => format!("x{}", elems[1]), + "ja-def" => format!("{}:", elems[1]), + "qualifier" => format!("({})", elems[1]), + "lb" => format!("({})", elems[2]), + "m" | "l" => elems[2].to_owned(), _ => caps.get(0).unwrap().as_str().to_owned(), } } @@ -139,27 +136,27 @@ fn main() { let conn = Connection::open(Path::new(&sqlite_path)).unwrap(); let all_langs = *get_defns_by_lang(&conn, &matches.free[0]); - let langs = - match matches.opt_str("l") { - None => all_langs, - Some(lang) => { - let mut result = BTreeMap::new(); - for &result_for_lang in all_langs.get(&lang).iter() { - result.insert(lang.clone(), result_for_lang.clone()); - } - result - }, + let langs = match matches.opt_str("l") { + None => all_langs, + Some(lang) => { + let mut result = BTreeMap::new(); + for &result_for_lang in all_langs.get(&lang).iter() { + result.insert(lang.clone(), result_for_lang.clone()); + } + result } - ; + }; print_words(&langs, |s| { let replace_template = |caps: &Captures| -> String { replace_template(&conn, caps) }; let mut result = s.to_owned(); if !matches.opt_present("r") { loop { - let result_ = re_template.replace_all(&result, &replace_template).to_string(); + let result_ = re_template + .replace_all(&result, &replace_template) + .to_string(); //println!("{}", result_); if result == result_ { - break + break; } result = result_; } diff --git a/src/lib.rs b/src/lib.rs index 19fb63b..896c89d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ -pub mod parse_xml; pub mod parse_wikitext; +pub mod parse_xml; #[derive(Debug)] pub struct Meaning { diff --git a/src/parse_wikitext.rs b/src/parse_wikitext.rs index 297672c..adcab54 100644 --- a/src/parse_wikitext.rs +++ b/src/parse_wikitext.rs @@ -96,31 +96,63 @@ pub fn parse_wikitext( let mut result: Vec = Vec::new(); let mut context_stack: ContextStack = ContextStack::new(); - let stack_apply = |context_stack: &mut ContextStack, wiki_context: &dyn Fn(String) -> WikiContext, line: &str, slice: &Option<&str>| { - slice.map_or_else(|| { - println!("Could not parse line: {}", line); - }, |slice| { - context_stack.apply( - wiki_context(slice.to_owned()), - languages, - parts_of_speech, - ); - }); + let stack_apply = |context_stack: &mut ContextStack, + wiki_context: &dyn Fn(String) -> WikiContext, + line: &str, + slice: &Option<&str>| { + slice.map_or_else( + || { + println!("Could not parse line: {}", line); + }, + |slice| { + context_stack.apply(wiki_context(slice.to_owned()), languages, parts_of_speech); + }, + ); }; for line in text.lines() { if line.starts_with("======") && line.len() > 12 { - stack_apply(&mut context_stack, &|x| Heading6(x), line, &line.get(6..line.len()-6)); + stack_apply( + &mut context_stack, + &|x| Heading6(x), + line, + &line.get(6..line.len() - 6), + ); } else if line.starts_with("=====") && line.len() > 10 { - stack_apply(&mut context_stack, &|x| Heading5(x), line, &line.get(5..line.len()-5)); + stack_apply( + &mut context_stack, + &|x| Heading5(x), + line, + &line.get(5..line.len() - 5), + ); } else if line.starts_with("====") && line.len() > 8 { - stack_apply(&mut context_stack, &|x| Heading4(x), line, &line.get(4..line.len()-4)); + stack_apply( + &mut context_stack, + &|x| Heading4(x), + line, + &line.get(4..line.len() - 4), + ); } else if line.starts_with("===") && line.len() > 6 { - stack_apply(&mut context_stack, &|x| Heading3(x), line, &line.get(3..line.len()-3)); + stack_apply( + &mut context_stack, + &|x| Heading3(x), + line, + &line.get(3..line.len() - 3), + ); } else if line.starts_with("==") && line.len() > 4 { - stack_apply(&mut context_stack, &|x| Heading2(x), line, &line.get(2..line.len()-2)); + stack_apply( + &mut context_stack, + &|x| Heading2(x), + line, + &line.get(2..line.len() - 2), + ); } else if line.starts_with("=") && line.len() > 2 { - stack_apply(&mut context_stack, &|x| Heading1(x), line, &line.get(1..line.len()-1)); + stack_apply( + &mut context_stack, + &|x| Heading1(x), + line, + &line.get(1..line.len() - 1), + ); } else if line.starts_with("# ") { context_stack.language.as_ref().and_then(|language| { context_stack.part_of_speech.as_ref().map(|part_of_speech| { diff --git a/src/parse_xml.rs b/src/parse_xml.rs index bf903bb..a5cf4fe 100644 --- a/src/parse_xml.rs +++ b/src/parse_xml.rs @@ -1,10 +1,8 @@ extern crate quick_xml; +use parse_xml::quick_xml::{events::Event, Reader}; -use parse_xml::quick_xml::Reader; -use parse_xml::quick_xml::events::Event; - -use std::path::Path; use std::io::BufRead; +use std::path::Path; use Page; @@ -12,18 +10,18 @@ fn parse_revision(reader: &mut Reader) -> Option { let mut buf = Vec::new(); let mut result = None; loop { - match reader.read_event(&mut buf) { - Ok(Event::Start(ref e)) if e.name() == b"text" => { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) if e.name().as_ref() == b"text" => { let mut buf = Vec::new(); - match reader.read_event(&mut buf) { + match reader.read_event_into(&mut buf) { Ok(Event::Text(e)) => { - let text = e.unescape_and_decode(&reader).unwrap(); + let text = e.unescape().unwrap().to_string(); result = Some(text); } _ => (), } } - Ok(Event::End(ref e)) if e.name() == b"revision" => break, + Ok(Event::End(ref e)) if e.name().as_ref() == b"revision" => break, _ => (), } } @@ -35,14 +33,12 @@ pub fn parse_page(mut reader: &mut Reader) -> Option { let mut title = None; let mut content = None; loop { - match reader.read_event(&mut buf) { + match reader.read_event_into(&mut buf) { Ok(Event::Start(ref e)) => { let mut buf = Vec::new(); - match e.name() { - b"title" => match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => { - title = Some(e.unescape_and_decode(&reader).unwrap().clone()) - } + match e.name().as_ref() { + b"title" => match reader.read_event_into(&mut buf) { + Ok(Event::Text(e)) => title = Some(e.unescape().unwrap().to_string()), _ => (), }, b"revision" => { @@ -51,7 +47,7 @@ pub fn parse_page(mut reader: &mut Reader) -> Option { _ => (), } } - Ok(Event::End(ref e)) if e.name() == b"page" => break, + Ok(Event::End(ref e)) if e.name().as_ref() == b"page" => break, _ => (), } } @@ -66,8 +62,8 @@ where let mut buf = Vec::new(); let mut reader = Reader::from_file(Path::new(filename)).unwrap(); 'read_words: loop { - match reader.read_event(&mut buf) { - Ok(Event::Start(ref e)) => match e.name() { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => match e.name().as_ref() { b"page" => { let page = parse_page(&mut reader); page.map(|page| f(page));