From 622e080b791f3fd8a47dc43d4f3ee0cfe5a21f2e Mon Sep 17 00:00:00 2001
From: tingyuan
Date: Mon, 19 Oct 2020 00:18:12 +0800
Subject: [PATCH] update requests decoding and corpus set

---
Review notes (text between the three-dash marker and the diff is not part
of the commit):

 * getNgrams: removed `assert(len(res)==1)`. It sat directly in front of
   `if res: ... else: df = DataFrame()`, so a page without data raised
   AssertionError instead of returning the empty DataFrame, and the check
   disappeared entirely under `python -O`.
 * getNgrams: the pattern is now a raw string with the dot escaped, and a
   capture group extracts the list literal. The former
   `.replace(";", "")` removed every semicolon in the payload, including
   any inside ngram text.
 * trimSpaceNearComma: the two find/replace loops are replaced by a single
   `re.sub` with the same result (all spaces adjacent to a comma removed).
 * The -/+ pair on `def runQuery` is carried over from the original patch;
   presumably a whitespace-only difference - confirm.
 * TODO(review): confirm the new 2019 corpus ids against the Ngram Viewer.
   eng_gb_2019=26 looks like it may be the id of the general English 2019
   corpus; eng_2019, eng_fiction_2019 and heb_2019 are not added here.
 * Whitespace in this patch was reconstructed; if context lines fail to
   match, apply with `git apply --ignore-whitespace`.

 getngrams.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/getngrams.py b/getngrams.py
index 83e7b8d..3894574 100644
--- a/getngrams.py
+++ b/getngrams.py
@@ -7,12 +7,18 @@
 import subprocess
 import sys
 
-corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_gb_2012=18, eng_gb_2009=6,
-               chi_sim_2012=23, chi_sim_2009=11, eng_2012=15, eng_2009=0,
+corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_us_2019=28,
+               eng_gb_2012=18, eng_gb_2009=6, eng_gb_2019=26,
+               chi_sim_2019=34, chi_sim_2012=23, chi_sim_2009=11,
+               eng_2012=15, eng_2009=0,
                eng_fiction_2012=16, eng_fiction_2009=4, eng_1m_2009=1,
-               fre_2012=19, fre_2009=7, ger_2012=20, ger_2009=8, heb_2012=24,
-               heb_2009=9, spa_2012=21, spa_2009=10, rus_2012=25, rus_2009=12,
-               ita_2012=22)
+               fre_2019=30, fre_2012=19, fre_2009=7,
+               ger_2019=31, ger_2012=20, ger_2009=8,
+               heb_2012=24,
+               heb_2009=9,
+               spa_2019=32, spa_2012=21, spa_2009=10,
+               rus_2019=36, rus_2012=25, rus_2009=12,
+               ita_2019=33, ita_2012=22)
 
 
 def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
@@ -25,20 +31,30 @@ def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
         params['content'] = params['content'].replace('?', '*')
     if '@' in params['content']:
         params['content'] = params['content'].replace('@', '=>')
+
     req = requests.get('http://books.google.com/ngrams/graph', params=params)
-    res = re.findall('var data = (.*?);\\n', req.text)
+    # Capture only the list literal assigned to `ngrams.data`, so the
+    # payload (including any ';' inside ngram text) is left untouched.
+    res = re.findall(r'ngrams\.data = (.*\]);', req.text)
+
     if res:
+        dataDict = literal_eval(res[0])
         data = {qry['ngram']: qry['timeseries']
-                for qry in literal_eval(res[0])}
+                for qry in dataDict}
         df = DataFrame(data)
         df.insert(0, 'year', list(range(startYear, endYear + 1)))
     else:
         df = DataFrame()
     return req.url, params['content'], df
 
 
+def trimSpaceNearComma(argumentString):
+    """Return argumentString with the spaces next to each comma removed."""
+    return re.sub(r' *, *', ',', argumentString)
+
+
-def runQuery(argumentString):
-    arguments = argumentString.split()
+def runQuery(argumentString):
+    arguments = trimSpaceNearComma(argumentString).split()
     query = ' '.join([arg for arg in arguments if not arg.startswith('-')])
     if '?' in query:
         query = query.replace('?', '*')