2 changes: 0 additions & 2 deletions datahandlers/__init__.py

This file was deleted.

Binary file added nlpdatahandlers/.DS_Store
Binary file not shown.
Empty file added nlpdatahandlers/__init__.py
Empty file.
68 changes: 50 additions & 18 deletions datahandlers/base_handler.py → nlpdatahandlers/base.py
@@ -1,8 +1,21 @@
'''
base.py -- ABC for data handler.
'''


import abc

import numpy as np

from .util.parallel import parallel_run

class DataHandlerException(Exception):
    pass

class BaseDataHandler(object):

    __metaclass__ = abc.ABCMeta

    DATA_ALL = 1
    DATA_TRAIN = 2
    DATA_VALIDATION = 4
@@ -11,17 +24,24 @@ class BaseDataHandler(object):
    def __init__(self, source):
        self.source = source

    @abc.abstractmethod
    def get_data(self, type=DATA_ALL):
        """
        Processes the data from its source and returns two lists: texts and labels, ready to be used by a classifier
        """
        raise NotImplementedError()

    @staticmethod
    def shuffle_data(train_values, labels):
        combined_lists = zip(train_values, labels)
        np.random.shuffle(combined_lists)
        return zip(*combined_lists)

    @staticmethod
    def word_level_ix(texts_list, words_per_document, wv_container, prepend=False, needs_tokenizing=False):
        """
        Receives a list of texts. For each text, it converts the text into indices of a word
        vector container (GloVe, Word2Vec) for later use in the embedding of a neural network.

        Texts are padded (or reduced) up to words_per_document elements.
        If prepend = True, padding is added at the beginning.
        If needs_tokenizing = True, raw texts are tokenized first.
@@ -32,29 +52,41 @@ def to_sentence_vectors(texts_list, sentences_per_paragraph, words_per_sentence,
        [ [[5, 24, 3, 223], [123, 25, 0, 0]], [[34, 25, 0, 0], [0, 0, 0, 0]] ]
        using sentences_per_paragraph = 2, words_per_sentence = 4
        """
        if needs_tokenizing:
            from .util.language import tokenize_text
            texts_list = parallel_run(tokenize_text, texts_list)

        text_with_normalized_documents = BaseDataHandler.__normalize(wv_container.get_indices(texts_list),
                                                                      size=words_per_document, prepend=prepend)
        return text_with_normalized_documents


    @staticmethod
    def sentence_level_ix(texts_list, sentences_per_paragraph, words_per_sentence, wv_container, prepend=False):
        """
        Receives a list of texts. For each text, it converts the text into sentences and converts the words into
        indices of a word vector container (GloVe, Word2Vec) for later use in the embedding of a neural network.

        Sentences are padded (or reduced) up to words_per_sentence elements.
        Texts ("paragraphs") are padded (or reduced) up to sentences_per_paragraph elements.
        If prepend = True, padding is added at the beginning.

        Ex: [[This might be cumbersome. Hopefully not.], [Another text]]
        to
        [ [[5, 24, 3, 223], [123, 25, 0, 0]], [[34, 25, 0, 0], [0, 0, 0, 0]] ]
        using sentences_per_paragraph = 2, words_per_sentence = 4
        """

        from .util.language import parse_paragraph

        text_sentences = parallel_run(parse_paragraph, texts_list)

        text_with_normalized_sentences = [BaseDataHandler.__normalize(review, size=words_per_sentence, prepend=prepend)
                                          for review in wv_container.get_indices(text_sentences)]
        text_padded_paragraphs = BaseDataHandler.__normalize(text_with_normalized_sentences,
                                                             size=sentences_per_paragraph, filler=[0] * words_per_sentence)

        return text_padded_paragraphs

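For illustration, a minimal usage sketch of the new word-level helper (not part of this diff). It assumes the package is importable and fakes a word-vector container with a get_indices method; the real GloVe/Word2Vec containers referenced in the docstrings are not included in this PR, and the padded output shown assumes __normalize defaults to a filler of 0.

from nlpdatahandlers.base import BaseDataHandler

class ToyWordVectors(object):
    """Hypothetical stand-in for a GloVe/Word2Vec container: maps tokens to indices, 0 is padding."""
    def __init__(self, vocabulary):
        self.index = dict((word, i + 1) for i, word in enumerate(vocabulary))

    def get_indices(self, texts):
        # Recurse over nested lists so the same container works at word and sentence level
        if texts and isinstance(texts[0], list):
            return [self.get_indices(t) for t in texts]
        return [self.index.get(token, 0) for token in texts]

wv = ToyWordVectors(['this', 'might', 'be', 'cumbersome', 'hopefully', 'not'])

# Already-tokenized documents, so needs_tokenizing is left False
docs = [['this', 'might', 'be', 'cumbersome'], ['hopefully', 'not']]
print(BaseDataHandler.word_level_ix(docs, words_per_document=6, wv_container=wv))
# -> something like [[1, 2, 3, 4, 0, 0], [5, 6, 0, 0, 0, 0]]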
1 change: 1 addition & 0 deletions nlpdatahandlers/imdb/__init__.py
@@ -0,0 +1 @@
from .imdb_handler import ImdbDataHandler
datahandlers/imdb_handler.py → nlpdatahandlers/imdb/imdb_handler.py
@@ -1,4 +1,4 @@
from base_handler import BaseDataHandler, DataHandlerException
from ..base import BaseDataHandler, DataHandlerException

import glob
import os
@@ -9,16 +9,26 @@ class ImdbDataHandler(BaseDataHandler):
    http://ai.stanford.edu/~amaas/data/sentiment/

    source defines the folder where the data is downloaded

    Args:
    -----
    source: the path to the root aclImdb/ folder for the downloaded data

    Examples:
    ---------

    >>> imdb = ImdbDataHandler('./aclImdb')
    >>> train_data, train_labels = imdb.get_data()
    """

    def get_data(self, type=BaseDataHandler.DATA_TRAIN, shuffle=True):
        """
        Processes the data from its source and returns two lists: texts and labels, ready to be used by a classifier

        Data is shuffled by default; pass shuffle=False to preserve the original order
        """
        if type not in (BaseDataHandler.DATA_TRAIN, BaseDataHandler.DATA_TEST):
            raise DataHandlerException("Only train and test data supported for ImdbDataHandler")
        else:
            which_data = 'train' if type == BaseDataHandler.DATA_TRAIN else 'test'
@@ -34,5 +44,7 @@ def get_data(self, type=BaseDataHandler.DATA_TRAIN):
            data.append((open(f, 'rb').read().lower()).replace('<br /><br />', '\n'))
            labels.append(0)

        if shuffle:
            return self.shuffle_data(data, labels)
        return (data, labels)

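A hedged end-to-end sketch (not part of the diff): it assumes the ACL IMDB dataset has been downloaded and unpacked to ./aclImdb, and that some word-vector container (called glovebox below, hypothetical) exposes get_indices as in the base-class docstrings.

from nlpdatahandlers.base import BaseDataHandler
from nlpdatahandlers.imdb import ImdbDataHandler

imdb = ImdbDataHandler('./aclImdb')
train_texts, train_labels = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)             # shuffled by default
test_texts, test_labels = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=False)
print(len(train_texts), len(test_texts))

# The texts come back as raw strings, so they still need tokenizing before being
# mapped to embedding indices with the (hypothetical) glovebox container:
# train_ix = BaseDataHandler.word_level_ix(train_texts, words_per_document=300,
#                                          wv_container=glovebox, needs_tokenizing=True)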
Empty file added nlpdatahandlers/util/__init__.py
Empty file.
71 changes: 71 additions & 0 deletions nlpdatahandlers/util/language.py
@@ -0,0 +1,71 @@
try:
    from spacy.en import English
except ImportError:
    raise ImportError('[!] You need to install spaCy! Visit spacy.io/#install')

# spacy.en provides a faster tokenizer than nltk
nlp = English()

def parse_paragraph(txt):
    """
    Takes a text and returns a list of lists of tokens, where each sublist is a sentence
    """
    sentences = nlp(u'' + txt.decode('ascii', 'ignore')).sents
    return [[t.text for t in s] for s in sentences]

def tokenize_text(text):
    """
    Gets tokens from a text in English
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    tokens = [token.lower_ for token in nlp(text)]

    return tokens

def _calculate_languages_ratios(text):
    """
    Calculate the likelihood of the given text being written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and the number of unique stopwords seen in the analyzed text
    @rtype: dict
    """
    from nltk.corpus import stopwords

    languages_ratios = {}
    tokens = tokenize_text(text)

    # For each language included in nltk, count the unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokens)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios

def detect_language(text):
    """
    Calculate the likelihood of the given text being written in several languages and
    return the highest-scoring one.

    It uses a stopwords-based approach, counting how many unique stopwords
    are seen in the analyzed text.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Best-scoring language guess
    @rtype: str
    """
    ratios = _calculate_languages_ratios(text)
    most_rated_language = max(ratios, key=ratios.get)

    return most_rated_language
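A small usage sketch (not in the diff); it assumes spaCy's English model and the NLTK stopwords corpus are installed (e.g. via nltk.download('stopwords')), and the outputs shown in comments are indicative only.

from nlpdatahandlers.util.language import parse_paragraph, tokenize_text, detect_language

print(tokenize_text("This might be cumbersome."))
# -> ['this', 'might', 'be', 'cumbersome', '.']

print(parse_paragraph("This might be cumbersome. Hopefully not."))
# -> [['This', 'might', 'be', 'cumbersome', '.'], ['Hopefully', 'not', '.']]

print(detect_language("This is a short English sentence about movie reviews."))
# -> 'english'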
35 changes: 35 additions & 0 deletions nlpdatahandlers/util/misc.py
@@ -0,0 +1,35 @@
from .language import tokenize_text

def normalize_sos(sq, sz=30, filler=0, prepend=True):
    '''
    Take a list of lists and ensure that they are all of length `sz`

    Args:
    -----
    sq: a non-generator iterable of lists

    sz: integer, the size that each sublist should be normalized to

    filler: obj -- what should be added to fill out the size?

    prepend: should `filler` be added to the front or the back of the list?
    '''
    if not prepend:
        def _normalize(e, sz):
            return e[:sz] if len(e) >= sz else e + [filler] * (sz - len(e))
        return [_normalize(e, sz) for e in sq]
    else:
        def _normalize(e, sz):
            return e[-sz:] if len(e) >= sz else [filler] * (sz - len(e)) + e
        return [_normalize(e, sz) for e in sq]


def to_glove_vectors(text, glovebox):
    """
    Tokenizes a text and returns the list of word vectors looked up in `glovebox`
    """
    tokens = tokenize_text(text)

    wvs = []
    for token in tokens:
        wvs.append(glovebox[token])

    return wvs
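A short illustration of normalize_sos (not in the diff). Note that importing nlpdatahandlers.util.misc pulls in the spaCy-backed language module, so spaCy must be installed for this to run.

from nlpdatahandlers.util.misc import normalize_sos

seqs = [[5, 24, 3, 223], [123, 25], [1, 2, 3, 4, 5, 6]]

# Default prepend=True: filler goes at the front, truncation keeps the tail
print(normalize_sos(seqs, sz=4))
# -> [[5, 24, 3, 223], [0, 0, 123, 25], [3, 4, 5, 6]]

# prepend=False: filler goes at the back, truncation keeps the head
print(normalize_sos(seqs, sz=4, prepend=False))
# -> [[5, 24, 3, 223], [123, 25, 0, 0], [1, 2, 3, 4]]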
18 changes: 18 additions & 0 deletions nlpdatahandlers/util/parallel.py
@@ -0,0 +1,18 @@
'''
parallel.py -- utilities for data loading
'''

def parallel_run(f, params):
    '''
    Performs a multi-core map of the function `f`
    over the parameter space spanned by `params`.

    `f` MUST take only one argument.
    '''
    from multiprocessing import Pool

    pool = Pool()
    ret = pool.map(f, params)
    pool.close()
    pool.join()
    return ret
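A minimal sketch of parallel_run on its own; this mirrors how base.py maps tokenize_text and parse_paragraph over lists of texts. The mapped function must be picklable, so it is defined at module level, and the __main__ guard keeps multiprocessing well-behaved on platforms that spawn worker processes.

from nlpdatahandlers.util.parallel import parallel_run

def square(x):
    return x * x

if __name__ == '__main__':
    print(parallel_run(square, range(10)))
    # -> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]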
11 changes: 11 additions & 0 deletions setup.py
@@ -0,0 +1,11 @@
from setuptools import setup
from setuptools import find_packages

setup(name='NLPDataHandlers',
      version='0.0.1',
      description='Library for loading datasets for deep learning.',
      author='Luke de Oliveira, Alfredo Lainez',
      author_email='lukedeo@stanford.edu, alainez@stanford.edu',
      url='https://github.com/textclf/data-handler',
      # install_requires=['pandas'],
      packages=find_packages())
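As a quick smoke test after installing the package (for example with pip install -e . from the repository root), the new layout introduced in this PR should expose the following imports:

from nlpdatahandlers.base import BaseDataHandler, DataHandlerException
from nlpdatahandlers.imdb import ImdbDataHandler
from nlpdatahandlers.util.parallel import parallel_run

print(BaseDataHandler.DATA_TRAIN, ImdbDataHandler.__name__)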