2 changes: 0 additions & 2 deletions datahandlers/__init__.py

This file was deleted.

Binary file added nlpdatahandlers/.DS_Store
Binary file not shown.
Empty file added nlpdatahandlers/__init__.py
Empty file.
68 changes: 50 additions & 18 deletions datahandlers/base_handler.py → nlpdatahandlers/base.py
@@ -1,8 +1,21 @@
'''
base.py -- ABC for data handler.
'''


import abc

import numpy as np

from .util.parallel import parallel_run

class DataHandlerException(Exception):
    pass

class BaseDataHandler(object):

    __metaclass__ = abc.ABCMeta

    DATA_ALL = 1
    DATA_TRAIN = 2
    DATA_VALIDATION = 4
@@ -11,17 +24,24 @@ class BaseDataHandler(object):
    def __init__(self, source):
        self.source = source

    @abc.abstractmethod
    def get_data(self, type=DATA_ALL):
        """
        Processes the data from its source and returns two lists: texts and labels, ready to be used by a classifier
        """
        raise NotImplementedError()

    @staticmethod
    def shuffle_data(train_values, labels):
        combined_lists = zip(train_values, labels)
        np.random.shuffle(combined_lists)
        return zip(*combined_lists)

    @staticmethod
    def word_level_ix(texts_list, words_per_document, wv_container, prepend=False, needs_tokenizing=False):
        """
        Receives a list of texts. For each text, it converts the text into indices of a word
        vector container (GloVe, Word2Vec) for later use in the embedding of a neural network.

        Texts are padded (or reduced) up to words_per_document elements.
        If prepend = True, padding is added at the beginning.
        If needs_tokenizing = True, raw texts are tokenized first.
@@ -32,29 +52,41 @@ def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence,
        [ [[5, 24, 3, 223], [123, 25, 0, 0]], [[34, 25, 0, 0], [0, 0, 0, 0]] ]
        using sentences_per_paragraph = 2, words_per_sentence = 4
        """
        if needs_tokenizing:
            from .util.language import tokenize_text
            texts_list = parallel_run(tokenize_text, texts_list)

        text_with_normalized_documents = BaseDataHandler.__normalize(wv_container.get_indices(texts_list),
                                                                      size=words_per_document, prepend=prepend)
        return text_with_normalized_documents


    @staticmethod
    def sentence_level_ix(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
        """
        Receives a list of texts. For each text, it converts the text into sentences and converts the words into
        indices of a word vector container (GloVe, Word2Vec) for later use in the embedding of a neural network.

        Sentences are padded (or reduced) up to words_per_sentence elements.
        Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph elements.
        If prepend = True, padding is added at the beginning.

        Ex: [[This might be cumbersome. Hopefully not.], [Another text]]
        to
        [ [[5, 24, 3, 223], [123, 25, 0, 0]], [[34, 25, 0, 0], [0, 0, 0, 0]] ]
        using sentences_per_paragraph = 2, words_per_sentence = 4
        """

        from .util.language import parse_paragraph

        text_sentences = parallel_run(parse_paragraph, texts_list)

        text_with_normalized_sentences = [BaseDataHandler.__normalize(review, size=words_per_sentence, prepend=prepend)
                                          for review in wv_container.get_indices(text_sentences)]
        text_padded_paragraphs = BaseDataHandler.__normalize(text_with_normalized_sentences,
                                                             size=sentences_per_paragraph, filler=[0] * words_per_sentence)

        return text_padded_paragraphs

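For illustration, a minimal usage sketch of the new word-level helper (not part of this diff). It assumes the package is importable and fakes a word-vector container with a get_indices method; the real GloVe/Word2Vec containers referenced in the docstrings are not included in this PR, and the padded output shown assumes __normalize defaults to a filler of 0.

from nlpdatahandlers.base import BaseDataHandler

class ToyWordVectors(object):
    """Hypothetical stand-in for a GloVe/Word2Vec container: maps tokens to indices, 0 is padding."""
    def __init__(self, vocabulary):
        self.index = dict((word, i + 1) for i, word in enumerate(vocabulary))

    def get_indices(self, texts):
        # Recurse over nested lists so the same container works at word and sentence level
        if texts and isinstance(texts[0], list):
            return [self.get_indices(t) for t in texts]
        return [self.index.get(token, 0) for token in texts]

wv = ToyWordVectors(['this', 'might', 'be', 'cumbersome', 'hopefully', 'not'])

# Already-tokenized documents, so needs_tokenizing is left False
docs = [['this', 'might', 'be', 'cumbersome'], ['hopefully', 'not']]
print(BaseDataHandler.word_level_ix(docs, words_per_document=6, wv_container=wv))
# -> something like [[1, 2, 3, 4, 0, 0], [5, 6, 0, 0, 0, 0]]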
1 change: 1 addition & 0 deletions nlpdatahandlers/imdb/__init__.py
@@ -0,0 +1 @@
from .imdb_handler import ImdbDataHandler
datahandlers/imdb_handler.py → nlpdatahandlers/imdb/imdb_handler.py
@@ -1,4 +1,4 @@
from base_handler import BaseDataHandler, DataHandlerException
from ..base import BaseDataHandler, DataHandlerException

import glob
import os
@@ -9,16 +9,26 @@ class ImdbDataHandler(BaseDataHandler):
    http://ai.stanford.edu/~amaas/data/sentiment/

    source defines the folder where the data is downloaded

    Args:
    -----
    source: the path to the root aclImdb/ folder for the downloaded data

    Examples:
    ---------

    >>> imdb = ImdbDataHandler('./aclImdb')
    >>> train_data, train_labels = imdb.get_data()
    """

    def get_data(self, type=BaseDataHandler.DATA_TRAIN, shuffle=True):
        """
        Processes the data from its source and returns two lists: texts and labels, ready to be used by a classifier

        Data is shuffled by default; pass shuffle=False to preserve the original order
        """
        if type not in (BaseDataHandler.DATA_TRAIN, BaseDataHandler.DATA_TEST):
            raise DataHandlerException("Only train and test data supported for ImdbDataHandler")
        else:
            which_data = 'train' if type == BaseDataHandler.DATA_TRAIN else 'test'
@@ -34,5 +44,7 @@ def get_data(self, type=BaseDataHandler.DATA_TRAIN):
            data.append((open(f, 'rb').read().lower()).replace('<br /><br />', '\n'))
            labels.append(0)

        if shuffle:
            return self.shuffle_data(data, labels)
        return (data, labels)

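A hedged end-to-end sketch (not part of the diff): it assumes the ACL IMDB dataset has been downloaded and unpacked to ./aclImdb, and that some word-vector container (called glovebox below, hypothetical) exposes get_indices as in the base-class docstrings.

from nlpdatahandlers.base import BaseDataHandler
from nlpdatahandlers.imdb import ImdbDataHandler

imdb = ImdbDataHandler('./aclImdb')
train_texts, train_labels = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)             # shuffled by default
test_texts, test_labels = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=False)
print(len(train_texts), len(test_texts))

# The texts come back as raw strings, so they still need tokenizing before being
# mapped to embedding indices with the (hypothetical) glovebox container:
# train_ix = BaseDataHandler.word_level_ix(train_texts, words_per_document=300,
#                                          wv_container=glovebox, needs_tokenizing=True)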
Empty file added nlpdatahandlers/util/__init__.py
Empty file.
71 changes: 71 additions & 0 deletions nlpdatahandlers/util/language.py
@@ -0,0 +1,71 @@
try:
    from spacy.en import English
except ImportError:
    raise ImportError('[!] You need to install spaCy! Visit spacy.io/#install')

# spacy.en provides a faster tokenizer than nltk
nlp = English()

def parse_paragraph(txt):
    """
    Takes a text and returns a list of lists of tokens, where each sublist is a sentence
    """
    sentences = nlp(u'' + txt.decode('ascii', 'ignore')).sents
    return [[t.text for t in s] for s in sentences]

def tokenize_text(text):
    """
    Gets tokens from a text in English
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    tokens = [token.lower_ for token in nlp(text)]

    return tokens

def _calculate_languages_ratios(text):
    """
    Calculate the likelihood of the given text being written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and the number of unique stopwords seen in the analyzed text
    @rtype: dict
    """
    from nltk.corpus import stopwords

    languages_ratios = {}
    tokens = tokenize_text(text)

    # For each language included in nltk, count the unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokens)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios

def detect_language(text):
    """
    Calculate the likelihood of the given text being written in several languages and
    return the highest-scoring one.

    It uses a stopwords-based approach, counting how many unique stopwords
    are seen in the analyzed text.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Best-scoring language guess
    @rtype: str
    """
    ratios = _calculate_languages_ratios(text)
    most_rated_language = max(ratios, key=ratios.get)

    return most_rated_language
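A small usage sketch (not in the diff); it assumes spaCy's English model and the NLTK stopwords corpus are installed (e.g. via nltk.download('stopwords')), and the outputs shown in comments are indicative only.

from nlpdatahandlers.util.language import parse_paragraph, tokenize_text, detect_language

print(tokenize_text("This might be cumbersome."))
# -> ['this', 'might', 'be', 'cumbersome', '.']

print(parse_paragraph("This might be cumbersome. Hopefully not."))
# -> [['This', 'might', 'be', 'cumbersome', '.'], ['Hopefully', 'not', '.']]

print(detect_language("This is a short English sentence about movie reviews."))
# -> 'english'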
35 changes: 35 additions & 0 deletions nlpdatahandlers/util/misc.py
@@ -0,0 +1,35 @@
from .language import tokenize_text

def normalize_sos(sq, sz=30, filler=0, prepend=True):
    '''
    Take a list of lists and ensure that they are all of length `sz`

    Args:
    -----
    sq: a non-generator iterable of lists

    sz: integer, the size that each sublist should be normalized to

    filler: obj -- what should be added to fill out the size?

    prepend: should `filler` be added to the front or the back of the list?
    '''
    if not prepend:
        def _normalize(e, sz):
            return e[:sz] if len(e) >= sz else e + [filler] * (sz - len(e))
        return [_normalize(e, sz) for e in sq]
    else:
        def _normalize(e, sz):
            return e[-sz:] if len(e) >= sz else [filler] * (sz - len(e)) + e
        return [_normalize(e, sz) for e in sq]


def to_glove_vectors(text, glovebox):
    """
    Tokenizes a text and returns the list of word vectors looked up in `glovebox`
    """
    tokens = tokenize_text(text)

    wvs = []
    for token in tokens:
        wvs.append(glovebox[token])

    return wvs
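A short illustration of normalize_sos (not in the diff). Note that importing nlpdatahandlers.util.misc pulls in the spaCy-backed language module, so spaCy must be installed for this to run.

from nlpdatahandlers.util.misc import normalize_sos

seqs = [[5, 24, 3, 223], [123, 25], [1, 2, 3, 4, 5, 6]]

# Default prepend=True: filler goes at the front, truncation keeps the tail
print(normalize_sos(seqs, sz=4))
# -> [[5, 24, 3, 223], [0, 0, 123, 25], [3, 4, 5, 6]]

# prepend=False: filler goes at the back, truncation keeps the head
print(normalize_sos(seqs, sz=4, prepend=False))
# -> [[5, 24, 3, 223], [123, 25, 0, 0], [1, 2, 3, 4]]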
18 changes: 18 additions & 0 deletions nlpdatahandlers/util/parallel.py
@@ -0,0 +1,18 @@
'''
parallel.py -- utilities for data loading
'''

def parallel_run(f, params):
    '''
    Performs a multi-core map of the function `f`
    over the parameter space spanned by `params`.

    `f` MUST take only one argument.
    '''
    from multiprocessing import Pool

    pool = Pool()
    ret = pool.map(f, params)
    pool.close()
    pool.join()
    return ret
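A minimal sketch of parallel_run on its own; this mirrors how base.py maps tokenize_text and parse_paragraph over lists of texts. The mapped function must be picklable, so it is defined at module level, and the __main__ guard keeps multiprocessing well-behaved on platforms that spawn worker processes.

from nlpdatahandlers.util.parallel import parallel_run

def square(x):
    return x * x

if __name__ == '__main__':
    print(parallel_run(square, range(10)))
    # -> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]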
11 changes: 11 additions & 0 deletions setup.py
@@ -0,0 +1,11 @@
from setuptools import setup
from setuptools import find_packages

setup(name='NLPDataHandlers',
      version='0.0.1',
      description='Library for loading datasets for deep learning.',
      author='Luke de Oliveira, Alfredo Lainez',
      author_email='lukedeo@stanford.edu, alainez@stanford.edu',
      url='https://github.com/textclf/data-handler',
      # install_requires=['pandas'],
      packages=find_packages())
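As a quick smoke test after installing the package (for example with pip install -e . from the repository root), the new layout introduced in this PR should expose the following imports:

from nlpdatahandlers.base import BaseDataHandler, DataHandlerException
from nlpdatahandlers.imdb import ImdbDataHandler
from nlpdatahandlers.util.parallel import parallel_run

print(BaseDataHandler.DATA_TRAIN, ImdbDataHandler.__name__)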