8 changes: 4 additions & 4 deletions .gitignore
@@ -1,7 +1,7 @@
/target/
target/
**/*.rs.bk
Cargo.lock
/target/
**/*.rs.bk

modules/**
*.bz2
*.xml
Session.vim
46 changes: 31 additions & 15 deletions src/bin/build_definitions_db.rs
@@ -1,18 +1,18 @@
extern crate define3;
extern crate getopts;
extern crate regex;
extern crate rusqlite;
extern crate getopts;

use define3::{Module, Template, Word};
use define3::PageContent;
use define3::parse_wikitext::parse_wikitext;
use define3::PageContent;
use define3::{Module, Template, Word};

use getopts::Options;
use regex::Regex;
use rusqlite::{Connection, Transaction};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::path::Path;

@@ -28,7 +28,10 @@ fn main() {
"Japanese",
"Korean",
"Lojban",
].iter().cloned().collect();
]
.iter()
.cloned()
.collect();

// TODO: figure out POS list automatically
let parts_of_speech: HashSet<&str> = [
@@ -51,14 +54,20 @@
"Rafsi",
"Romanization",
"Verb",
].iter().cloned().collect();
]
.iter()
.cloned()
.collect();

let args: Vec<String> = std::env::args().collect();
let mut opts = Options::new();
opts.optflag("h", "help", "print this help text");
let matches = opts.parse(&args[1..]).unwrap();
if matches.opt_present("h") || matches.free.len() != 1 {
let brief = format!("Usage: {} PATH_TO_enwiktionary-YYYYMMDD-pages-meta-current.xml [options]", args[0]);
let brief = format!(
"Usage: {} PATH_TO_enwiktionary-YYYYMMDD-pages-meta-current.xml [options]",
args[0]
);
print!("{}", opts.usage(&brief));
return;
}
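In other words, the binary expects exactly one positional argument, the path to an enwiktionary pages-meta-current XML dump, and -h/--help prints the usage text above.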
@@ -88,7 +97,8 @@ fn main() {
content text not null
)",
[],
).unwrap();
)
.unwrap();

tx.execute("DROP TABLE IF EXISTS modules", []).unwrap();
tx.execute(
@@ -97,7 +107,8 @@ fn main() {
content text not null
)",
[],
).unwrap();
)
.unwrap();

let re_noinclude = Regex::new(r"<noinclude>(?P<text>(?s:.)*?)</noinclude>").unwrap();
let re_includeonly = Regex::new(r"<includeonly>(?P<text>(?s:.)*?)</includeonly>").unwrap();
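For context (not part of this diff): in MediaWiki template source, <noinclude> content appears on the template page itself but is dropped when the template is transcluded, and <includeonly> is the reverse. The collapsed code below presumably applies these two regexes to keep only the transcluded form of each template; a hedged sketch:

// Hypothetical use of the regexes above, assuming `content` holds raw template wikitext:
// let content = re_noinclude.replace_all(&content, "");        // drop <noinclude> blocks entirely
// let content = re_includeonly.replace_all(&content, "$text"); // unwrap <includeonly> blocks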
@@ -123,17 +134,19 @@ fn main() {
tx.execute(
"insert into templates (name, content) values (?1, ?2)",
&[&title, &content.as_str()],
).unwrap();
)
.unwrap();
templates.insert(title.to_owned(), content);
} else if page.title.starts_with("Module:") {
let title = &page.title[7..];
tx.execute(
"insert into modules (name, content) values (?1, ?2)",
[&title, &page.content.as_str()],
).unwrap();
)
.unwrap();

println!("Saved module: {}", page.title);
let path = format!("/trove/data/enwikt/modules/{}.lua", page.title);
let path = format!("modules/{}.lua", page.title);
let path = Path::new(&path);
fs::create_dir_all(path.parent().unwrap()).unwrap();
let mut file = File::create(path).unwrap();
@@ -153,7 +166,8 @@ fn main() {
definition text not null
)",
[],
).unwrap();
)
.unwrap();

define3::parse_xml::for_pages(&xml_path, |page| {
let page_content = match page.title.split(':').next() {
@@ -196,7 +210,8 @@ fn main() {
&meaning.part_of_speech,
&defn.into_owned(),
],
).unwrap();
)
.unwrap();
}
}
_ => (),
@@ -207,7 +222,8 @@ fn main() {
"create index words_name_idx on words(name);
create index words_language_idx on words(language);
create index words_part_of_speech_idx on words(part_of_speech);",
).unwrap();
)
.unwrap();

tx.commit().unwrap();
}
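For reference, a minimal sketch (not part of this diff) of reading the words table that this binary produces; it assumes the schema and indexes created above, and is essentially what src/bin/define.rs below does:

use rusqlite::{Connection, Result};

// Look up all (language, part_of_speech, definition) rows for a word.
fn definitions(conn: &Connection, name: &str) -> Result<Vec<(String, String, String)>> {
    let mut stmt =
        conn.prepare("SELECT language, part_of_speech, definition FROM words WHERE name = ?1")?;
    let rows = stmt.query_map([name], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))?;
    rows.collect()
}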
77 changes: 37 additions & 40 deletions src/bin/define.rs
@@ -1,10 +1,10 @@
extern crate colored;
extern crate define3;
extern crate getopts;
extern crate nom;
extern crate regex;
extern crate rusqlite;
extern crate textwrap;
extern crate nom;

use define3::Meaning;

@@ -20,14 +20,18 @@ fn get_defns_by_lang(
conn: &Connection,
word: &str,
) -> Box<BTreeMap<String, BTreeMap<String, Vec<String>>>> {
let mut stmt = conn.prepare(
"SELECT language, part_of_speech, definition FROM words WHERE name = ?1",
).unwrap();
let word_iter = stmt.query_map(&[&word], |row| Ok (Meaning {
language: row.get(0).unwrap(),
part_of_speech: row.get(1).unwrap(),
definition: row.get(2).unwrap(),
})).unwrap();
let mut stmt = conn
.prepare("SELECT language, part_of_speech, definition FROM words WHERE name = ?1")
.unwrap();
let word_iter = stmt
.query_map(&[&word], |row| {
Ok(Meaning {
language: row.get(0).unwrap(),
part_of_speech: row.get(1).unwrap(),
definition: row.get(2).unwrap(),
})
})
.unwrap();

let mut langs: BTreeMap<String, BTreeMap<String, Vec<String>>> = BTreeMap::new();

@@ -47,6 +51,7 @@ fn get_defns_by_lang(
// functions and often call out into Lua code.
// https://www.mediawiki.org/wiki/Help:Extension:ParserFunctions
// https://www.mediawiki.org/wiki/Extension:Scribunto
#[allow(dead_code)]
fn expand_template(conn: &Connection, args: &[&str]) -> String {
fn get_template_content(conn: &Connection, name: &str) -> String {
let result = conn.query_row(
@@ -59,6 +64,7 @@ fn expand_template(conn: &Connection, args: &[&str]) -> String {
}
get_template_content(conn, args[0])
}
#[warn(dead_code)]

// For now, we just hardcode a couple common templates.
fn replace_template(_conn: &Connection, caps: &Captures) -> String {
@@ -68,24 +74,15 @@ fn replace_template(_conn: &Connection, caps: &Captures) -> String {
// _ => expand_template(conn, &elems)
//}
match elems[0] {
"," =>
",".to_owned(),
"ngd" | "unsupported" | "non-gloss definition" =>
elems[1].to_owned(),
"alternative form of" =>
format!("Alternative form of {}", elems[1]),
"ja-romanization of" =>
format!("Rōmaji transcription of {}", elems[1]),
"sumti" =>
format!("x{}", elems[1]),
"ja-def" =>
format!("{}:", elems[1]),
"qualifier" =>
format!("({})", elems[1]),
"lb" =>
format!("({})", elems[2]),
"m" | "l" =>
elems[2].to_owned(),
"," => ",".to_owned(),
"ngd" | "unsupported" | "non-gloss definition" => elems[1].to_owned(),
"alternative form of" => format!("Alternative form of {}", elems[1]),
"ja-romanization of" => format!("Rōmaji transcription of {}", elems[1]),
"sumti" => format!("x{}", elems[1]),
"ja-def" => format!("{}:", elems[1]),
"qualifier" => format!("({})", elems[1]),
"lb" => format!("({})", elems[2]),
"m" | "l" => elems[2].to_owned(),
_ => caps.get(0).unwrap().as_str().to_owned(),
}
}
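To illustrate the hardcoded arms above (assuming elems is the '|'-separated body of the matched template, with elems[0] the template name):

// Hypothetical inputs and outputs for replace_template:
// "{{lb|en|informal}}"             -> "(informal)"
// "{{qualifier|dated}}"            -> "(dated)"
// "{{alternative form of|colour}}" -> "Alternative form of colour"
// Anything not listed falls through to the final arm and is returned unchanged.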
@@ -139,27 +136,27 @@ fn main() {
let conn = Connection::open(Path::new(&sqlite_path)).unwrap();

let all_langs = *get_defns_by_lang(&conn, &matches.free[0]);
let langs =
match matches.opt_str("l") {
None => all_langs,
Some(lang) => {
let mut result = BTreeMap::new();
for &result_for_lang in all_langs.get(&lang).iter() {
result.insert(lang.clone(), result_for_lang.clone());
}
result
},
let langs = match matches.opt_str("l") {
None => all_langs,
Some(lang) => {
let mut result = BTreeMap::new();
for &result_for_lang in all_langs.get(&lang).iter() {
result.insert(lang.clone(), result_for_lang.clone());
}
result
}
;
};
print_words(&langs, |s| {
let replace_template = |caps: &Captures| -> String { replace_template(&conn, caps) };
let mut result = s.to_owned();
if !matches.opt_present("r") {
loop {
let result_ = re_template.replace_all(&result, &replace_template).to_string();
let result_ = re_template
.replace_all(&result, &replace_template)
.to_string();
//println!("{}", result_);
if result == result_ {
break
break;
}
result = result_;
}
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -1,5 +1,5 @@
pub mod parse_xml;
pub mod parse_wikitext;
pub mod parse_xml;

#[derive(Debug)]
pub struct Meaning {
64 changes: 48 additions & 16 deletions src/parse_wikitext.rs
@@ -96,31 +96,63 @@ pub fn parse_wikitext(
let mut result: Vec<Meaning> = Vec::new();
let mut context_stack: ContextStack = ContextStack::new();

let stack_apply = |context_stack: &mut ContextStack, wiki_context: &dyn Fn(String) -> WikiContext, line: &str, slice: &Option<&str>| {
slice.map_or_else(|| {
println!("Could not parse line: {}", line);
}, |slice| {
context_stack.apply(
wiki_context(slice.to_owned()),
languages,
parts_of_speech,
);
});
let stack_apply = |context_stack: &mut ContextStack,
wiki_context: &dyn Fn(String) -> WikiContext,
line: &str,
slice: &Option<&str>| {
slice.map_or_else(
|| {
println!("Could not parse line: {}", line);
},
|slice| {
context_stack.apply(wiki_context(slice.to_owned()), languages, parts_of_speech);
},
);
};

for line in text.lines() {
if line.starts_with("======") && line.len() > 12 {
stack_apply(&mut context_stack, &|x| Heading6(x), line, &line.get(6..line.len()-6));
stack_apply(
&mut context_stack,
&|x| Heading6(x),
line,
&line.get(6..line.len() - 6),
);
} else if line.starts_with("=====") && line.len() > 10 {
stack_apply(&mut context_stack, &|x| Heading5(x), line, &line.get(5..line.len()-5));
stack_apply(
&mut context_stack,
&|x| Heading5(x),
line,
&line.get(5..line.len() - 5),
);
} else if line.starts_with("====") && line.len() > 8 {
stack_apply(&mut context_stack, &|x| Heading4(x), line, &line.get(4..line.len()-4));
stack_apply(
&mut context_stack,
&|x| Heading4(x),
line,
&line.get(4..line.len() - 4),
);
} else if line.starts_with("===") && line.len() > 6 {
stack_apply(&mut context_stack, &|x| Heading3(x), line, &line.get(3..line.len()-3));
stack_apply(
&mut context_stack,
&|x| Heading3(x),
line,
&line.get(3..line.len() - 3),
);
} else if line.starts_with("==") && line.len() > 4 {
stack_apply(&mut context_stack, &|x| Heading2(x), line, &line.get(2..line.len()-2));
stack_apply(
&mut context_stack,
&|x| Heading2(x),
line,
&line.get(2..line.len() - 2),
);
} else if line.starts_with("=") && line.len() > 2 {
stack_apply(&mut context_stack, &|x| Heading1(x), line, &line.get(1..line.len()-1));
stack_apply(
&mut context_stack,
&|x| Heading1(x),
line,
&line.get(1..line.len() - 1),
);
} else if line.starts_with("# ") {
context_stack.language.as_ref().and_then(|language| {
context_stack.part_of_speech.as_ref().map(|part_of_speech| {
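For illustration (not part of this diff): the chain above tries the longest run of '=' first and slices the heading text out with line.get(n..line.len() - n), so for example:

// Hypothetical inputs and the contexts they would produce:
// "==English==" has length 11 > 4, so get(2..9) yields "English"  -> Heading2("English")
// "===Noun==="  has length 10 > 6, so get(3..7) yields "Noun"     -> Heading3("Noun")
// If a slice boundary falls inside a multi-byte character, get() returns None and
// stack_apply reports it with "Could not parse line: ...".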