From 18bbd6de30ba9398e43ba1115ae2c560b5a93248 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 17:45:58 -0400
Subject: [PATCH 01/16] Use list comprehension in extractFeatures()

---
 hackmatch.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 6490f02..991cc3d 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -82,12 +82,13 @@ def extractFeatures(self, item, doc_words, fields=[]):
             if tokens:
                 s_tokens.extend(tokens)
 
-        s_features = []
-        for token in doc_words:
-            if token in s_tokens:
-                s_features.append(1)
-            else:
-                s_features.append(0)
+        #s_features = []
+        #for token in doc_words:
+        #    if token in s_tokens:
+        #        s_features.append(1)
+        #    else:
+        #        s_features.append(0)
+        s_features = [token in s_tokens for token in doc_words]
 
         if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
             return None

From a24e1990e27c52eab9589ed1afa51f4eb342b96e Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 17:58:41 -0400
Subject: [PATCH 02/16] Generators

defaultdict(int)
set(get_stopwords())
operator.itemgetter()
---
 hackmatch.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 991cc3d..420ecc5 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -16,6 +16,7 @@
 from nltk.tokenize import *
 from nltk.corpus import stopwords
 from hcluster import jaccard
+from operator import itemgetter
 
 # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc
 # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
@@ -26,7 +27,7 @@ class HackMatch(object):
     COMPLETENESS_THRESHOLD = 4 # num of words necessary to match
 
     def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
-        self.stopwords = self.get_stopwords()
+        self.stopwords = set(self.get_stopwords())
         self.distance = distance
 
         student_data = self.parseCSV(student_file)
@@ -42,8 +43,10 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
     def printMatches(self, matches, num_matches):
         for n, m in matches.items():
             print n
-            for item, score in sorted(m.items(), key=lambda(i,c):(-c, i))[:num_matches]:
-                print "\t%s :: %s" % (item, score)
+            all_matches = sorted(m.items(), key=itemgetter(1), reverse=True)
+            top_matches = all_matches[:num_matches]
+            for item, score in top_matches:
+                print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
         print '\n'
 
@@ -99,31 +102,28 @@ def defineFeatures(self, data, fields=[]):
         """
         define the global bag of words features
         """
-        ngram_freq = {}
+        ngram_freq = defaultdict(int)
 
-        for d in data:
-            for r in d:
-                for f in fields:
-                    tokens = None
-                    try:
-                        tokens = word_tokenize(r[f])
-                    except (KeyError, TypeError):
-                        pass
-
-                    if tokens:
-                        for t in [t.lower() for t in tokens if t.lower() not in self.stopwords]:
-                            t = t.strip('.')
-                            ngram_freq[t] = ngram_freq.get(t, 0) + 1
+        featureiter = (
+            (d, r, f)
+            for d in data
+            for r in d
+            for f in fields
+            if f in r
+        )
+        for d, r, f in featureiter:
+            tokeniter = (t.lower() for t in word_tokenize(r[f]))
+            legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords)
+            for t in legaliter:
+                ngram_freq[t] += 1
 
-        ngram_freq = dict([(w,c) for w,c in ngram_freq.items() if c > 1])
+        ngram_freq = dict((w,c) for w,c in ngram_freq.items() if c > 1)
 
         if self.DEBUG:
             print "Global vocabulary: %s" % len(ngram_freq)
         return ngram_freq
 
     def get_stopwords(self):
-        sw = stopwords.words('english')
-        sw.extend([',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'])
-        return sw
+        return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92']
 
     def parseCSV(self, filename):
         """

From c13cab2aa9a111884c1e77430f5c75539f10d666 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:01:04 -0400
Subject: [PATCH 03/16] x if cond else y

---
 hackmatch.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 420ecc5..29bc4f4 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -49,8 +49,7 @@ def printMatches(self, matches, num_matches):
                 print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
         print '\n'
-
-
+
     def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'):
         """
         do ranking
@@ -92,11 +91,7 @@ def extractFeatures(self, item, doc_words, fields=[]):
         #    else:
         #        s_features.append(0)
         s_features = [token in s_tokens for token in doc_words]
-
-        if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
-            return None
-
-        return s_features
+        return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None
 
     def defineFeatures(self, data, fields=[]):
         """

From 8689e6b0f6dcf94a26ae52b3e8807b6f76c2e7a7 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:02:59 -0400
Subject: [PATCH 04/16] generator + map in extractFeatures()

---
 hackmatch.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 29bc4f4..3cf4530 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -73,23 +73,8 @@ def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field
 
         return matches
 
     def extractFeatures(self, item, doc_words, fields=[]):
-        s_tokens = []
-        for f in fields:
-            tokens = None
-            try:
-                tokens = word_tokenize(item[f])
-            except (KeyError, TypeError):
-                pass
-
-            if tokens:
-                s_tokens.extend(tokens)
-
-        #s_features = []
-        #for token in doc_words:
-        #    if token in s_tokens:
-        #        s_features.append(1)
-        #    else:
-        #        s_features.append(0)
+        tokeniter = (item[f] for f in fields if f in item)
+        s_tokens = map(list.extend, tokeniter)
         s_features = [token in s_tokens for token in doc_words]
         return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None

From f0f6972f712a14a813122f844150c70c603dff1f Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:08:34 -0400
Subject: [PATCH 05/16] Hoist common expression out of loop

Create dict in one line
Change reverse-sort to sort
---
 hackmatch.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 3cf4530..312a6a6 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -43,8 +43,8 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
     def printMatches(self, matches, num_matches):
         for n, m in matches.items():
             print n
-            all_matches = sorted(m.items(), key=itemgetter(1), reverse=True)
-            top_matches = all_matches[:num_matches]
+            all_matches = sorted(m.items(), key=itemgetter(1))
+            top_matches = all_matches[-num_matches:]
             for item, score in top_matches:
                 print "\t%(item)s :: %(score)s" % locals()
             # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
@@ -54,19 +54,17 @@ def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field
         """
         do ranking
         """
-        base = {}
-        for item in base_data:
-            base[item[base_name_field]] = self.extractFeatures(item, doc_words, fields)
-
+        base = dict((item[base_name_field], self.extractFeatures(item, doc_words, fields)) for item in base_data)
+
         matches = defaultdict(dict)
         for match_item in match_data:
             match_features = self.extractFeatures(match_item, doc_words, fields)
-
+            d = matches[match_item[match_name_field]]
             for base_item, base_item_features in base.items(): # actually do the comparison
                 if not base_item_features or not match_features:
-                    matches[match_item[match_name_field]][base_item] = 0.0
+                    d[base_item] = 0.0
                 else:
-                    matches[match_item[match_name_field]][base_item] = self.distance(base_item_features, match_features)
+                    d[base_item] = self.distance(base_item_features, match_features)
             if self.DEBUG:
                 print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features))
 

From d6d717b5539a7be246f218eea8f695f511155712 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:12:21 -0400
Subject: [PATCH 06/16] Eliminate unneeded fields in generator

---
 hackmatch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 312a6a6..cd5003e 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -83,14 +83,14 @@ def defineFeatures(self, data, fields=[]):
         ngram_freq = defaultdict(int)
 
         featureiter = (
-            (d, r, f)
+            r[f]
             for d in data
             for r in d
             for f in fields
             if f in r
         )
-        for d, r, f in featureiter:
-            tokeniter = (t.lower() for t in word_tokenize(r[f]))
+        for field in featureiter:
+            tokeniter = (t.lower() for t in word_tokenize(field))
             legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords)
             for t in legaliter:
                 ngram_freq[t] += 1

From a0b56b7f345f2a170942bf376caef1f00a9bdbad Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:13:43 -0400
Subject: [PATCH 07/16] Remove unneeded imports

---
 hackmatch.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index cd5003e..586a383 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -8,9 +8,6 @@
-import sys, os
-import csv
-import string
 from collections import defaultdict
 from optparse import OptionParser
 from nltk.tokenize import *

From 78a3ac1991aaa81ac63169ea830bf7091dfbacd4 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 18:18:20 -0400
Subject: [PATCH 08/16] Added .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dc84959
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+

From d5cccd3ba871a6220d2735645d9d7b55d8d6a13c Mon Sep 17 00:00:00 2001
From: hbrown
Date: Tue, 26 Oct 2010 19:12:02 -0400
Subject: [PATCH 09/16] pylint code

---
 hackmatch.py | 119 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/hackmatch.py b/hackmatch.py
index 586a383..dff41b7 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -7,76 +7,107 @@
 Created by Hilary Mason, Chris Wiggins, and Evan Korth.
 Copyright (c) 2010 hackNY. All rights reserved.
""" +# pylint: disable=W0614 +# pylint: disable=C0301 from collections import defaultdict from optparse import OptionParser -from nltk.tokenize import * from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize from hcluster import jaccard from operator import itemgetter +from csv import DictReader # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +def get_stopwords(): + """ + get_stopwords: generate a list of stop words + """ + return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] + +def parse_csv(filename): + """ + parse_csv: parses the CSV file to a dict + """ + csv_reader = DictReader(open(filename)) + return [r for r in csv_reader] + +def print_matches(matches, num_matches): + """ + print_matches: print the top 'num_matches' matches + """ + for key, value_dict in matches.items(): + print key + all_matches = sorted(value_dict.items(), key=itemgetter(1)) + top_matches = all_matches[-num_matches:] + # pylint: disable=W0612 + for item, score in top_matches: + print "\t%(item)s :: %(score)s" % locals() + # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) + # pylint: enable=W0612 + print '\n' + class HackMatch(object): + """ + HackMatch: class to encapsulate matching companies versus startups on selected fields + """ DEBUG = False BOW_FIELDS = ['Environment', 'Project', 'Skills', 'Misc'] COMPLETENESS_THRESHOLD = 4 # num of words necessary to match def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard): - self.stopwords = set(self.get_stopwords()) + self.stopwords = set(get_stopwords()) self.distance = distance - student_data = self.parseCSV(student_file) - startup_data = self.parseCSV(startup_file) + student_data = parse_csv(student_file) + startup_data = parse_csv(startup_file) - doc_words = self.defineFeatures([student_data, startup_data], self.BOW_FIELDS) + doc_words = self.define_features([student_data, startup_data], self.BOW_FIELDS) - # matches = self.doRanking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company') - matches = self.doRanking(startup_data, student_data, doc_words, self.BOW_FIELDS) + # matches = self.do_ranking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company') + matches = self.do_ranking(startup_data, student_data, doc_words, self.BOW_FIELDS) - self.printMatches(matches, num_matches) - - def printMatches(self, matches, num_matches): - for n, m in matches.items(): - print n - all_matches = sorted(m.items(), key=itemgetter(1)) - top_matched = all_matches[-num_matches:] - for item, score in top_matches: - print "\t%(item)s :: %(score)s" % locals() - # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) - print '\n' - - def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'): + print_matches(matches, num_matches) + + # pylint: disable=R0913 + def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'): """ do ranking """ - base = 
dict((item[base_name_field], self.extractFeatures(item, doc_words, fields)) for item in base_data) + fields = fields or [] + base = dict((item[base_name_field], self.extract_features(item, doc_words, fields)) for item in base_data) matches = defaultdict(dict) for match_item in match_data: - match_features = self.extractFeatures(match_item, doc_words, fields) - d = matches[match_item[match_name_field]] + match_features = self.extract_features(match_item, doc_words, fields) + temp_dict = matches[match_item[match_name_field]] for base_item, base_item_features in base.items(): # actually do the comparison if not base_item_features or not match_features: - d[base_item] = 0.0 + temp_dict[base_item] = 0.0 else: - d[base_item] = self.distance(base_item_features, match_features) + temp_dict[base_item] = self.distance(base_item_features, match_features) if self.DEBUG: print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features)) return matches - def extractFeatures(self, item, doc_words, fields=[]): - tokeniter = (item[f] for f in fields if f in item) - s_tokens = map(list.extend, tokeniter) + def extract_features(self, item_dict, doc_words, fields=None): + """ + extract_features: Determine whether features pass test + """ + fields = fields or [] + tokeniter = (item_dict[f] for f in fields if f in item_dict) + s_tokens = reduce(list.extend, tokeniter) s_features = [token in s_tokens for token in doc_words] return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None - def defineFeatures(self, data, fields=[]): + def define_features(self, data, fields=None): """ define the global bag of words features """ + fields = fields or [] ngram_freq = defaultdict(int) featureiter = ( @@ -87,32 +118,22 @@ def defineFeatures(self, data, fields=[]): if f in r ) for field in featureiter: - tokeniter = (t.lower() for t in word_tokenize(field)) - legaliter = (t.strip('.') for t in tokeniter if t not in self.stopwords) - for t in legaliter: - ngram_freq[t] += 1 + tokeniter = (word.lower() for word in word_tokenize(field)) + legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords) + for legal_word in legaliter: + ngram_freq[legal_word] += 1 - ngram_freq = dict((w,c) for w,c in ngram_freq.items() if c > 1) + ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1) if self.DEBUG: print "Global vocabulary: %s" % len(ngram_freq) return ngram_freq - def get_stopwords(self): - return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] - - def parseCSV(self, filename): - """ - parseCSV: parses the CSV file to a dict - """ - csv_reader = csv.DictReader(open(filename)) - return [r for r in csv_reader] - - + if __name__ == '__main__': parser = OptionParser() - parser.add_option("-n","--number", action="store", type="int", dest="num_matches",default=10,help="number of results to return") - parser.add_option("-s","--student", action="store", type="string", dest="student_file",default="unmatched_students.csv",help="csv of student data") - parser.add_option("-t","--startup", action="store", type="string", dest="startup_file",default="unmatched_top_startups.csv",help="csv of startup data") + parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return") + parser.add_option("-s", "--student", action="store", type="string", dest="student_file", 
default="unmatched_students.csv", help="csv of student data") + parser.add_option("-t", "--startup", action="store", type="string", dest="startup_file", default="unmatched_top_startups.csv", help="csv of startup data") (options, args) = parser.parse_args() - h = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file) \ No newline at end of file + hackmatch = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file) From b922eb9e6bdec5159e2374bca02debbb4ecdb6d2 Mon Sep 17 00:00:00 2001 From: hbrown Date: Tue, 26 Oct 2010 21:52:31 -0400 Subject: [PATCH 10/16] Added requirements file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fe8566b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +hcluster>=0.2.0 +nltk>=2.0b9 From f0337c764e72c89f8d65e5f25747c038ce832f5a Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 01:16:15 -0400 Subject: [PATCH 11/16] Added data files --- startups.txt | 2 ++ students.txt | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 startups.txt create mode 100644 students.txt diff --git a/startups.txt b/startups.txt new file mode 100644 index 0000000..6662658 --- /dev/null +++ b/startups.txt @@ -0,0 +1,2 @@ +"Name","E-mail","Company","In NYC","Funding","Site","Blog","Twitter","Num Employees","Environment","Project","Skills","Misc" +"Foobar Corp","x@foo.com","Foobar Corp","Y","Y",http://www.foo.com,"","",100,"linux windows oracle","risk-management finance","python java C#","linux windows python facebook" diff --git a/students.txt b/students.txt new file mode 100644 index 0000000..45fa9ce --- /dev/null +++ b/students.txt @@ -0,0 +1,2 @@ +Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +Hugh,hughdbrown@yahoo.com,Toronto,AI,AI,1990,iwebthereforeiam.com,"","","","risk-management windows","python oracle","finance" From b9fdb626bae7dc45518f1a845b736a61dcd549cb Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 11:56:34 -0400 Subject: [PATCH 12/16] Remove blank lines, pylint directives --- hackmatch.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hackmatch.py b/hackmatch.py index dff41b7..4bfddc3 100644 --- a/hackmatch.py +++ b/hackmatch.py @@ -26,7 +26,7 @@ def get_stopwords(): get_stopwords: generate a list of stop words """ return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'] - + def parse_csv(filename): """ parse_csv: parses the CSV file to a dict @@ -42,11 +42,9 @@ def print_matches(matches, num_matches): print key all_matches = sorted(value_dict.items(), key=itemgetter(1)) top_matches = all_matches[-num_matches:] - # pylint: disable=W0612 for item, score in top_matches: print "\t%(item)s :: %(score)s" % locals() # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score) - # pylint: enable=W0612 print '\n' class HackMatch(object): @@ -71,7 +69,6 @@ def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard): print_matches(matches, num_matches) - # pylint: disable=R0913 def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'): """ do ranking @@ -90,7 +87,6 @@ def 
do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_fi temp_dict[base_item] = self.distance(base_item_features, match_features) if self.DEBUG: print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features)) - return matches def extract_features(self, item_dict, doc_words, fields=None): @@ -122,12 +118,11 @@ def define_features(self, data, fields=None): legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords) for legal_word in legaliter: ngram_freq[legal_word] += 1 - ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1) if self.DEBUG: print "Global vocabulary: %s" % len(ngram_freq) return ngram_freq - + if __name__ == '__main__': parser = OptionParser() From 9af97d754253a244ff058feaaeda244478eab828 Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:54:13 -0400 Subject: [PATCH 13/16] Code to get the nltk stopwords corpus --- download_stopwords.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 download_stopwords.py diff --git a/download_stopwords.py b/download_stopwords.py new file mode 100644 index 0000000..8bafa70 --- /dev/null +++ b/download_stopwords.py @@ -0,0 +1,2 @@ +from nltk import download +download('stopwords') From fda38f2c8bb92fb2b4e9b87bceaf0a9c53af239c Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:57:23 -0400 Subject: [PATCH 14/16] Added list_reducer(), removed reduce() call --- hackmatch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hackmatch.py b/hackmatch.py index 4bfddc3..2578022 100644 --- a/hackmatch.py +++ b/hackmatch.py @@ -21,6 +21,16 @@ # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc +# Hack +# I'd like to write this: +# return reduce(list.extend, list_of_lists) +# but it generates an error I don't get +def list_reducer(list_iter): + result = [] + for l in list_iter: + result.extend(l) + return result + def get_stopwords(): """ get_stopwords: generate a list of stop words @@ -95,7 +105,7 @@ def extract_features(self, item_dict, doc_words, fields=None): """ fields = fields or [] tokeniter = (item_dict[f] for f in fields if f in item_dict) - s_tokens = reduce(list.extend, tokeniter) + s_tokens = list_reducer(tokeniter) s_features = [token in s_tokens for token in doc_words] return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None From a34ec17286db96489423bf3a5f964948289efb4b Mon Sep 17 00:00:00 2001 From: hbrown Date: Wed, 27 Oct 2010 12:59:29 -0400 Subject: [PATCH 15/16] Added requirements.txt instructions Added download_stopwords.py instructions --- README | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README b/README index b2ab13d..501be1e 100644 --- a/README +++ b/README @@ -14,6 +14,17 @@ Dependencies hackMatch uses NLTK (nltk.org) and hcluster (for distance metrics). +To install the dependencies, run 'pip install -r requirements.txt'. + +After installing nltk, you need to get the stopwords corpus. Run download_stopwords.py at the command line: + + $ python download_stopwords.py + [nltk_data] Downloading package 'stopwords' to + [nltk_data] /home/hbrown/nltk_data... + [nltk_data] Unzipping corpora/stopwords.zip. 
+  True
+  $
+
 =======
 License
 =======

From 88cce70d9639b68ed18941c37b8fd5b25c039f42 Mon Sep 17 00:00:00 2001
From: hbrown
Date: Wed, 27 Oct 2010 13:02:58 -0400
Subject: [PATCH 16/16] Remove blank line

---
 hackmatch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hackmatch.py b/hackmatch.py
index 2578022..8efa583 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -133,7 +133,6 @@ def define_features(self, data, fields=None):
             print "Global vocabulary: %s" % len(ngram_freq)
         return ngram_freq
 
-
 if __name__ == '__main__':
     parser = OptionParser()
     parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return")
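
=====
Notes
=====

The error that the patch 14 comment shrugs off ("it generates an error I
don't get") has a simple cause: list.extend mutates its list in place and
returns None, so on the second step reduce() passes None back in as the
accumulator and the call fails with a TypeError. The map(list.extend,
tokeniter) version from patch 04 breaks the same way, since list.extend
needs a list to mutate plus one argument to append from. Below is a
minimal sketch of the failure and of an idiomatic flattener; the sample
data is hypothetical, not from the repository:

  from itertools import chain

  list_of_lists = [['python', 'java'], ['linux']]

  # reduce(list.extend, list_of_lists) dies on the second step:
  # the first call mutates ['python', 'java'] in place and returns None,
  # so list.extend(None, ...) then raises a TypeError.

  # A one-expression equivalent of list_reducer():
  flat = list(chain.from_iterable(list_of_lists))
  # ['python', 'java', 'linux']

A related caveat: in extract_features(), tokeniter yields the raw field
strings, and extending a list with a string appends it character by
character, so list_reducer() returns single letters unless the fields go
through word_tokenize() first (that call was dropped in patch 04).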
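
A note on the sort change in patch 05: replacing reverse=True plus a head
slice with an ascending sort plus a tail slice selects the same n items,
but it yields them lowest score first, so the printed list comes out in
ascending order. heapq.nlargest does the top-n selection in a single call
and keeps the descending order. A sketch with hypothetical scores:

  from heapq import nlargest
  from operator import itemgetter

  scores = {'Foobar Corp': 0.25, 'Acme': 0.75, 'Initech': 0.5}

  # ascending sort + tail slice: the right items, worst-first order
  top = sorted(scores.items(), key=itemgetter(1))[-2:]
  # [('Initech', 0.5), ('Acme', 0.75)]

  # the top n, already in descending order
  top = nlargest(2, scores.items(), key=itemgetter(1))
  # [('Acme', 0.75), ('Initech', 0.5)]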
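
extract_features() tests 'token in s_tokens' once for every word in the
global vocabulary, and a membership test against a list rescans the list
each time. The reasoning behind set(get_stopwords()) in patch 02 applies
here too: building the tokens as a set turns each probe into a
constant-time hash lookup. A sketch with hypothetical data:

  doc_words = ['python', 'java', 'linux', 'finance']
  s_tokens = set(['python', 'risk-management', 'linux'])

  s_features = [token in s_tokens for token in doc_words]
  # [True, False, True, False]
  # sum(s_features) == 2, because bool is a subclass of int,
  # which is why the COMPLETENESS_THRESHOLD comparison still works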
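
The defaultdict(int) swap in patch 02 is the standard counting idiom: a
missing key springs into existence as 0, which replaces the older
ngram_freq.get(t, 0) + 1 pattern. A sketch:

  from collections import defaultdict

  ngram_freq = defaultdict(int)
  for token in ['python', 'java', 'python']:   # hypothetical stream
      ngram_freq[token] += 1                   # missing keys start at 0

  # dict(ngram_freq) == {'python': 2, 'java': 1}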
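
Finally, a hedged observation about what "top" means here. If hcluster's
jaccard follows the usual Jaccard-distance convention, as the scipy
function it later became does, it returns a dissimilarity: 0.0 for
identical feature vectors, 1.0 for completely disjoint ones. In that case
the highest scores kept by print_matches() belong to the least similar
pairs, which is worth double-checking against the intent. A sketch,
assuming that distance convention:

  from hcluster import jaccard

  a = [True, False, True]
  b = [True, False, False]
  print jaccard(a, b)   # 0.5: half the non-zero positions disagree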