From 18bbd6de30ba9398e43ba1115ae2c560b5a93248 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 17:45:58 -0400
Subject: [PATCH 01/16] Use list comprehension in extractFeatures()

---
 hackmatch.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 6490f02..991cc3d 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -82,12 +82,13 @@ def extractFeatures(self, item, doc_words, fields=[]):
             if tokens:
                 s_tokens.extend(tokens)
 
-        s_features = []
-        for token in doc_words:
-            if token in s_tokens:
-                s_features.append(1)
-            else:
-                s_features.append(0)
+        #s_features = []
+        #for token in doc_words:
+        #    if token in s_tokens:
+        #        s_features.append(1)
+        #    else:
+        #        s_features.append(0)
+        s_features = [token in s_tokens for token in doc_words]
 
         if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
             return None

From a24e1990e27c52eab9589ed1afa51f4eb342b96e Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 17:58:41 -0400
Subject: [PATCH 02/16] Generators

defaultdict(int)
set(get_stopwords())
operator.itemgetter()
---
 hackmatch.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 991cc3d..420ecc5 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -16,6 +16,7 @@
 from nltk.tokenize import *
 from nltk.corpus import stopwords
 from hcluster import jaccard
+from operator import itemgetter
 
 # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc
 # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
@@ -26,7 +27,7 @@ class HackMatch(object):
     COMPLETENESS_THRESHOLD = 4 # num of words necessary to match
 
     def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
-        self.stopwords = self.get_stopwords()
+        self.stopwords = set(self.get_stopwords())
         self.distance = distance
 
         student_data = self.parseCSV(student_file)
@@ -42,8 +43,10 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
     def printMatches(self, matches, num_matches):
         for n, m in matches.items():
             print n
-            for item, score in sorted(m.items(), key=lambda(i,c):(-c, i))[:num_matches]:
-                print "\t%s :: %s" % (item, score)
+            all_matches = sorted(m.items(), key=itemgetter(1), reverse=True)
+            top_matches = all_matches[:num_matches]
+            for item, score in top_matches:
+                print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
         print '\n'
 
@@ -99,31 +102,28 @@ def defineFeatures(self, data, fields=[]):
         """
         define the global bag of words features
         """
-        ngram_freq = {}
+        ngram_freq = defaultdict(int)
 
-        for d in data:
-            for r in d:
-                for f in fields:
-                    tokens = None
-                    try:
-                        tokens = word_tokenize(r[f])
-                    except (KeyError, TypeError):
-                        pass
-
-                    if tokens:
-                        for t in [t.lower() for t in tokens if t.lower() not in self.stopwords]:
-                            t = t.strip('.')
-                            ngram_freq[t] = ngram_freq.get(t, 0) + 1
+        featureiter = (
+            (d, r, f)
+            for d in data
+            for r in d
+            for f in fields
+            if f in r
+        )
+        for d, r, f in featureiter:
+            tokeniter = (t.lower() for t in word_tokenize(r[f]))
+            legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords)
+            for t in legaliter:
+                ngram_freq[t] += 1
 
-        ngram_freq = dict([(w,c) for w,c in ngram_freq.items() if c > 1])
+        ngram_freq = dict((w,c) for w,c in ngram_freq.items() if c > 1)
 
         if self.DEBUG:
             print "Global vocabulary: %s" % len(ngram_freq)
         return ngram_freq
 
     def get_stopwords(self):
-        sw = stopwords.words('english')
-        sw.extend([',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'])
-        return sw
+        return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92']
 
     def parseCSV(self, filename):
         """

From c13cab2aa9a111884c1e77430f5c75539f10d666 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:01:04 -0400
Subject: [PATCH 03/16] x if cond else y

---
 hackmatch.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 420ecc5..29bc4f4 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -49,8 +49,7 @@ def printMatches(self, matches, num_matches):
                 print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
         print '\n'
-
-
+
     def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'):
         """
         do ranking
@@ -92,11 +91,7 @@ def extractFeatures(self, item, doc_words, fields=[]):
         #    else:
         #        s_features.append(0)
         s_features = [token in s_tokens for token in doc_words]
-
-        if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
-            return None
-
-        return s_features
+        return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None
 
     def defineFeatures(self, data, fields=[]):
         """

From 8689e6b0f6dcf94a26ae52b3e8807b6f76c2e7a7 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:02:59 -0400
Subject: [PATCH 04/16] generator + map in extractFeatures()

---
 hackmatch.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 29bc4f4..3cf4530 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -73,23 +73,8 @@ def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field
 
         return matches
 
     def extractFeatures(self, item, doc_words, fields=[]):
-        s_tokens = []
-        for f in fields:
-            tokens = None
-            try:
-                tokens = word_tokenize(item[f])
-            except (KeyError, TypeError):
-                pass
-
-            if tokens:
-                s_tokens.extend(tokens)
-
-        #s_features = []
-        #for token in doc_words:
-        #    if token in s_tokens:
-        #        s_features.append(1)
-        #    else:
-        #        s_features.append(0)
+        tokeniter = (item[f] for f in fields if f in item)
+        s_tokens = map(list.extend, tokeniter)
         s_features = [token in s_tokens for token in doc_words]
         return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None

From f0f6972f712a14a813122f844150c70c603dff1f Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:08:34 -0400
Subject: [PATCH 05/16] Hoist common expression out of loop

Create dict in one line
Change reverse-sort to sort
---
 hackmatch.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 3cf4530..312a6a6 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -43,8 +43,8 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
     def printMatches(self, matches, num_matches):
         for n, m in matches.items():
             print n
-            all_matches = sorted(m.items(), key=itemgetter(1), reverse=True)
-            top_matches = all_matches[:num_matches]
+            all_matches = sorted(m.items(), key=itemgetter(1))
+            top_matches = all_matches[-num_matches:]
             for item, score in top_matches:
                 print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
@@ -54,19 +54,17 @@ def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field
         """
         do ranking
         """
-        base = {}
-        for item in base_data:
-            base[item[base_name_field]] = self.extractFeatures(item, doc_words, fields)
-
+        base = dict((item[base_name_field], self.extractFeatures(item, doc_words, fields)) for item in base_data)
+
         matches = defaultdict(dict)
         for match_item in match_data:
             match_features = self.extractFeatures(match_item, doc_words, fields)
-
+            d = matches[match_item[match_name_field]]
             for base_item, base_item_features in base.items(): # actually do the comparison
                 if not base_item_features or not match_features:
-                    matches[match_item[match_name_field]][base_item] = 0.0
+                    d[base_item] = 0.0
                 else:
-                    matches[match_item[match_name_field]][base_item] = self.distance(base_item_features, match_features)
+                    d[base_item] = self.distance(base_item_features, match_features)
             if self.DEBUG:
                 print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features))
 

From d6d717b5539a7be246f218eea8f695f511155712 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:12:21 -0400
Subject: [PATCH 06/16] Eliminate unneeded fields in generator

---
 hackmatch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 312a6a6..cd5003e 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -83,14 +83,14 @@ def defineFeatures(self, data, fields=[]):
         ngram_freq = defaultdict(int)
 
         featureiter = (
-            (d, r, f)
+            r[f]
             for d in data
             for r in d
             for f in fields
             if f in r
         )
-        for d, r, f in featureiter:
-            tokeniter = (t.lower() for t in word_tokenize(r[f]))
+        for field in featureiter:
+            tokeniter = (t.lower() for t in word_tokenize(field))
             legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords)
             for t in legaliter:
                 ngram_freq[t] += 1

From a0b56b7f345f2a170942bf376caef1f00a9bdbad Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:13:43 -0400
Subject: [PATCH 07/16] Remove unneeded imports

---
 hackmatch.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index cd5003e..586a383 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -8,9 +8,6 @@
-import sys, os
-import csv
-import string
 from collections import defaultdict
 from optparse import OptionParser
 from nltk.tokenize import *

From 78a3ac1991aaa81ac63169ea830bf7091dfbacd4 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:18:20 -0400
Subject: [PATCH 08/16] Added .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dc84959
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+

From d5cccd3ba871a6220d2735645d9d7b55d8d6a13c Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 19:12:02 -0400
Subject: [PATCH 09/16] pylint code

---
 hackmatch.py | 119 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 586a383..dff41b7 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -7,76 +7,107 @@
 Created by Hilary Mason, Chris Wiggins, and Evan Korth.
 Copyright (c) 2010 hackNY. All rights reserved.
""" +# pylint: disable=W0614 +# pylint: disable=C0301 from collections import defaultdict from optparse import OptionParser -from nltk.tokenize import * from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize from hcluster import jaccard from operator import itemgetter +from csv import DictReader # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +def get_stopwords(): + """ + get_stopwords: generate a list of stop words + """ + return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] + +def parse_csv(filename): + """ + parse_csv: parses the CSV file to a dict + """ + csv_reader = DictReader(open(filename)) + return [r for r in csv_reader] + +def print_matches(matches, num_matches): + """ + print_matches: print the top 'num_matches' matches + """ + for key, value_dict in matches.items(): + print key + all_matches = sorted(value_dict.items(), key=itemgetter(1)) + top_matches = all_matches[-num_matches:] + # pylint: disable=W0612 + for item, score in top_matches: + print "\t%(item)s :: %(score)s" % locals() + # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) + # pylint: enable=W0612 + print '\n' + class HackMatch(object): + """ + HackMatch: class to encapsulate matching companies versus startups on selected fields + """ DEBUG = False BOW_FIELDS = ['Environment', 'Project', 'Skills', 'Misc'] COMPLETENESS_THRESHOLD = 4 # num of words necessary to match def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard): - self.stopwords = set(self.get_stopwords()) + self.stopwords = set(get_stopwords()) self.distance = distance - student_data = self.parseCSV(student_file) - startup_data = self.parseCSV(startup_file) + student_data = parse_csv(student_file) + startup_data = parse_csv(startup_file) - doc_words = self.defineFeatures([student_data, startup_data], self.BOW_FIELDS) + doc_words = self.define_features([student_data, startup_data], self.BOW_FIELDS) - # matches = self.doRanking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company') - matches = self.doRanking(startup_data, student_data, doc_words, self.BOW_FIELDS) + # matches = self.do_ranking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company') + matches = self.do_ranking(startup_data, student_data, doc_words, self.BOW_FIELDS) - self.printMatches(matches, num_matches) - - def printMatches(self, matches, num_matches): - for n, m in matches.items(): - print n - all_matches = sorted(m.items(), key=itemgetter(1)) - top_matched = all_matches[-num_matches:] - for item, score in top_matches: - print "\t%(item)s :: %(score)s" % locals() - # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) - print '\n' - - def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'): + print_matches(matches, num_matches) + + # pylint: disable=R0913 + def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'): """ do ranking """ - base = 
dict((item[base_name_field], self.extractFeatures(item, doc_words, fields)) for item in base_data) + fields = fields or [] + base = dict((item[base_name_field], self.extract_features(item, doc_words, fields)) for item in base_data) matches = defaultdict(dict) for match_item in match_data: - match_features = self.extractFeatures(match_item, doc_words, fields) - d = matches[match_item[match_name_field]] + match_features = self.extract_features(match_item, doc_words, fields) + temp_dict = matches[match_item[match_name_field]] for base_item, base_item_features in base.items(): # actually do the comparison if not base_item_features or not match_features: - d[base_item] = 0.0 + temp_dict[base_item] = 0.0 else: - d[base_item] = self.distance(base_item_features, match_features) + temp_dict[base_item] = self.distance(base_item_features, match_features) if self.DEBUG: print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features)) return matches - def extractFeatures(self, item, doc_words, fields=[]): - tokeniter = (item[f] for f in fields if f in item) - s_tokens = map(list.extend, tokeniter) + def extract_features(self, item_dict, doc_words, fields=None): + """ + extract_features: Determine whether features pass test + """ + fields = fields or [] + tokeniter = (item_dict[f] for f in fields if f in item_dict) + s_tokens = reduce(list.extend, tokeniter) s_features = [token in s_tokens for token in doc_words] return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None - def defineFeatures(self, data, fields=[]): + def define_features(self, data, fields=None): """ define the global bag of words features """ + fields = fields or [] ngram_freq = defaultdict(int) featureiter = ( @@ -87,32 +118,22 @@ def defineFeatures(self, data, fields=[]): if f in r ) for field in featureiter: - tokeniter = (t.lower() for t in word_tokenize(field)) - legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords) - for t in legaliter: - ngram_freq[t] += 1 + tokeniter = (word.lower() for word in word_tokenize(field)) + legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords) + for legal_word in legaliter: + ngram_freq[legal_word] += 1 - ngram_freq = dict((w,c) for w,c in ngram_freq.items() if c > 1) + ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1) if self.DEBUG: print "Global vocabulary: %s" % len(ngram_freq) return ngram_freq - def get_stopwords(self): - return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] - - def parseCSV(self, filename): - """ - parseCSV: parses the CSV file to a dict - """ - csv_reader = csv.DictReader(open(filename)) - return [r for r in csv_reader] - - + if __name__ == '__main__': parser = OptionParser() - parser.add_option("-n","--number", action="store", type="int", dest="num_matches",default=10,help="number of results to return") - parser.add_option("-s","--student", action="store", type="string", dest="student_file",default="unmatched_students.csv",help="csv of student data") - parser.add_option("-t","--startup", action="store", type="string", dest="startup_file",default="unmatched_top_startups.csv",help="csv of startup data") + parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return") + parser.add_option("-s", "--student", action="store", type="string", dest="student_file", 
default="unmatched_students.csv", help="csv of student data") + parser.add_option("-t", "--startup", action="store", type="string", dest="startup_file", default="unmatched_top_startups.csv", help="csv of startup data") (options, args) = parser.parse_args() - h = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file) \ No newline at end of file + hackmatch = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file) From b922eb9e6bdec5159e2374bca02debbb4ecdb6d2 Mon Sep 17 00:00:00 2001 From: hbrown Date: Tue, 26 Oct 2010 21:52:31 -0400 Subject: [PATCH 10/16] Added requirements file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fe8566b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +hcluster>=0.2.0 +nltk>=2.0b9 From f0337c764e72c89f8d65e5f25747c038ce832f5a Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 01:16:15 -0400 Subject: [PATCH 11/16] Added data files --- startups.txt | 2 ++ students.txt | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 startups.txt create mode 100644 students.txt diff --git a/startups.txt b/startups.txt new file mode 100644 index 0000000..6662658 --- /dev/null +++ b/startups.txt @@ -0,0 +1,2 @@ +"Name","E-mail","Company","In NYC","Funding","Site","Blog","Twitter","Num Employees","Environment","Project","Skills","Misc" +"Foobar Corp","x@foo.com","Foobar Corp","Y","Y",http://www.foo.com,"","",100,"linux windows oracle","risk-management finance","python java C#","linux windows python facebook" diff --git a/students.txt b/students.txt new file mode 100644 index 0000000..45fa9ce --- /dev/null +++ b/students.txt @@ -0,0 +1,2 @@ +Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +Hugh,hughdbrown@yahoo.com,Toronto,AI,AI,1990,iwebthereforeiam.com,"","","","risk-management windows","python oracle","finance" From b9fdb626bae7dc45518f1a845b736a61dcd549cb Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 11:56:34 -0400 Subject: [PATCH 12/16] Remove blank lines, pylint directives --- hackmatch.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hackmatch.py b/hackmatch.py index dff41b7..4bfddc3 100644 --- a/hackmatch.py +++ b/hackmatch.py @@ -26,7 +26,7 @@ def get_stopwords(): get_stopwords: generate a list of stop words """ return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] - + def parse_csv(filename): """ parse_csv: parses the CSV file to a dict @@ -42,11 +42,9 @@ def print_matches(matches, num_matches): print key all_matches = sorted(value_dict.items(), key=itemgetter(1)) top_matches = all_matches[-num_matches:] - # pylint: disable=W0612 for item, score in top_matches: print "\t%(item)s :: %(score)s" % locals() # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) - # pylint: enable=W0612 print '\n' class HackMatch(object): @@ -71,7 +69,6 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard): print_matches(matches, num_matches) - # pylint: disable=R0913 def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'): """ do ranking @@ -90,7 +87,6 @@ def 
do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_fi temp_dict[base_item] = self.distance(base_item_features, match_features) if self.DEBUG: print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features)) - return matches def extract_features(self, item_dict, doc_words, fields=None): @@ -122,12 +118,11 @@ def define_features(self, data, fields=None): legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords) for legal_word in legaliter: ngram_freq[legal_word] += 1 - ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1) if self.DEBUG: print "Global vocabulary: %s" % len(ngram_freq) return ngram_freq - + if __name__ == '__main__': parser = OptionParser() From 9af97d754253a244ff058feaaeda244478eab828 Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:54:13 -0400 Subject: [PATCH 13/16] Code to get the nltk stopwords corpus --- download_stopwords.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 download_stopwords.py diff --git a/download_stopwords.py b/download_stopwords.py new file mode 100644 index 0000000..8bafa70 --- /dev/null +++ b/download_stopwords.py @@ -0,0 +1,2 @@ +from nltk import download +download('stopwords') From fda38f2c8bb92fb2b4e9b87bceaf0a9c53af239c Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:57:23 -0400 Subject: [PATCH 14/16] Added list_reducer(), removed reduce() call --- hackmatch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hackmatch.py b/hackmatch.py index 4bfddc3..2578022 100644 --- a/hackmatch.py +++ b/hackmatch.py @@ -21,6 +21,16 @@ # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +# Hack +# I'd like to write this: +# return reduce(list.extend, list_of_lists) +# but it generates an error I don't get +def list_reducer(list_iter): + result = [] + for l in list_iter: + result.extend(l) + return result + def get_stopwords(): """ get_stopwords: generate a list of stop words @@ -95,7 +105,7 @@ def extract_features(self, item_dict, doc_words, fields=None): """ fields = fields or [] tokeniter = (item_dict[f] for f in fields if f in item_dict) - s_tokens = reduce(list.extend, tokeniter) + s_tokens = list_reducer(tokeniter) s_features = [token in s_tokens for token in doc_words] return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None From a34ec17286db96489423bf3a5f964948289efb4b Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:59:29 -0400 Subject: [PATCH 15/16] Added requirements.txt instructions Added download_stopwords.py instructions --- README | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README b/README index b2ab13d..501be1e 100644 --- a/README +++ b/README @@ -14,6 +14,17 @@ Dependencies hackMatch uses NLTK (nltk.org) and hcluster (for distance metrics). +To install the dependencies, run 'pip install -r requirements.txt'. + +After installing nltk, you need to get the stopwords corpus. Run download_stopwords.py at the command line: + + $ python download_stopwords.py + [nltk_data] Downloading package 'stopwords' to + [nltk_data] /home/hbrown/nltk_data... + [nltk_data] Unzipping corpora/stopwords.zip. 
+  True
+  $
+
 =======
 License
 =======

From 88cce70d9639b68ed18941c37b8fd5b25c039f42 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Wed, 27 Oct 2010 13:02:58 -0400
Subject: [PATCH 16/16] Remove blank line

---
 hackmatch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hackmatch.py b/hackmatch.py
index 2578022..8efa583 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -133,7 +133,6 @@ def define_features(self, data, fields=None):
             print "Global vocabulary: %s" % len(ngram_freq)
         return ngram_freq
 
-
 if __name__ == '__main__':
     parser = OptionParser()
     parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return")
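
=====
Notes
=====

The error that the patch 14 comment shrugs off ("it generates an error I
don't get") has a simple cause: list.extend mutates its list in place and
returns None, so on the second step reduce() passes None back in as the
accumulator and the call fails with a TypeError. The map(list.extend,
tokeniter) version from patch 04 breaks the same way, since list.extend
needs a list to mutate plus one argument to append from. Below is a
minimal sketch of the failure and of an idiomatic flattener; the sample
data is hypothetical, not from the repository:

  from itertools import chain

  list_of_lists = [['python', 'java'], ['linux']]

  # reduce(list.extend, list_of_lists) dies on the second step:
  # the first call mutates ['python', 'java'] in place and returns None,
  # so list.extend(None, ...) then raises a TypeError.

  # A one-expression equivalent of list_reducer():
  flat = list(chain.from_iterable(list_of_lists))
  # ['python', 'java', 'linux']

A related caveat: in extract_features(), tokeniter yields the raw field
strings, and extending a list with a string appends it character by
character, so list_reducer() returns single letters unless the fields go
through word_tokenize() first (that call was dropped in patch 04).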
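
A note on the sort change in patch 05: replacing reverse=True plus a head
slice with an ascending sort plus a tail slice selects the same n items,
but it yields them lowest score first, so the printed list comes out in
ascending order. heapq.nlargest does the top-n selection in a single call
and keeps the descending order. A sketch with hypothetical scores:

  from heapq import nlargest
  from operator import itemgetter

  scores = {'Foobar Corp': 0.25, 'Acme': 0.75, 'Initech': 0.5}

  # ascending sort + tail slice: the right items, worst-first order
  top = sorted(scores.items(), key=itemgetter(1))[-2:]
  # [('Initech', 0.5), ('Acme', 0.75)]

  # the top n, already in descending order
  top = nlargest(2, scores.items(), key=itemgetter(1))
  # [('Acme', 0.75), ('Initech', 0.5)]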
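
extract_features() tests 'token in s_tokens' once for every word in the
global vocabulary, and a membership test against a list rescans the list
each time. The reasoning behind set(get_stopwords()) in patch 02 applies
here too: building the tokens as a set turns each probe into a
constant-time hash lookup. A sketch with hypothetical data:

  doc_words = ['python', 'java', 'linux', 'finance']
  s_tokens = set(['python', 'risk-management', 'linux'])

  s_features = [token in s_tokens for token in doc_words]
  # [True, False, True, False]
  # sum(s_features) == 2, because bool is a subclass of int,
  # which is why the COMPLETENESS_THRESHOLD comparison still works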
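
The defaultdict(int) swap in patch 02 is the standard counting idiom: a
missing key springs into existence as 0, which replaces the older
ngram_freq.get(t, 0) + 1 pattern. A sketch:

  from collections import defaultdict

  ngram_freq = defaultdict(int)
  for token in ['python', 'java', 'python']:   # hypothetical stream
      ngram_freq[token] += 1                   # missing keys start at 0

  # dict(ngram_freq) == {'python': 2, 'java': 1}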
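
Finally, a hedged observation about what "top" means here. If hcluster's
jaccard follows the usual Jaccard-distance convention, as the scipy
function it later became does, it returns a dissimilarity: 0.0 for
identical feature vectors, 1.0 for completely disjoint ones. In that case
the highest scores kept by print_matches() belong to the least similar
pairs, which is worth double-checking against the intent. A sketch,
assuming that distance convention:

  from hcluster import jaccard

  a = [True, False, True]
  b = [True, False, False]
  print jaccard(a, b)   # 0.5: half the non-zero positions disagree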