Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,18 @@ that respects these characters.
keyword.extract('Lörem Ipsüm Lörem Ipsüm.', {alternativeTokenizer: true})
```

#### Option: ignoreStartWordOnlyPhrases

Setting `ignoreStartWordOnlyPhrases` to `true` will exclude phrases that are made up
entirely of words in the `startWords` list.

```js
keyword.extract('foo but bar and not with foo but not with bar',
{startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true})
```

Returns `['foo but', 'bar']`

--------------------------------------------------------
<a name="stream"></a>
### gramophone.stream([options])
Expand Down
36 changes: 25 additions & 11 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ exports.extract = function(text, options){
options.ngrams = [1, 2, 3];
}else if (typeof options.ngrams === 'number'){
options.ngrams = [options.ngrams];
}else{
options.ngrams = _.map(options.ngrams, Number);
}
if (!options.cutoff) options.cutoff = 0.5;
if (!options.min) options.min = 2;
Expand Down Expand Up @@ -80,7 +82,7 @@ exports.extract = function(text, options){
});
tf.addDocument(tokenized);
keywordsForNgram = tf.listMostFrequestTerms(0);
keywordsForNgram = _.select(keywordsForNgram, function(item){
keywordsForNgram = _.filter(keywordsForNgram, function(item){
return usePhrase(item.term, options);
});
results = results.concat(keywordsForNgram);
Expand All @@ -92,19 +94,19 @@ exports.extract = function(text, options){
});

// Combine results from each ngram to remove redundant phrases
combined = exports.combine(combinedResults, options.cutoff);
combined = exports.combine(combinedResults, options.cutoff, options.min);

// Convert to a list of objects sorted by tf (term frequency)
combined = _.chain(combined)
.pairs()
.toPairs()
.sortBy(_.last)
.reverse()
.map(function(combination){ return {term: combination[0], tf: combination[1] }; })
.value();

// Only return results at or above a given frequency (default minimum is 2)
if (options.min){
combined = _.select(combined, function(result){
combined = _.filter(combined, function(result){
return result.tf >= options.min;
});
}
Expand All @@ -130,7 +132,7 @@ exports.extract = function(text, options){
);
}else{
// Return results with scores or without depending on options
combined = options.score ? combined : _.pluck(combined, 'term');
combined = options.score ? combined : _.map(combined, 'term');
}


Expand Down Expand Up @@ -197,11 +199,16 @@ exports.transformStream = function(options){
// was used 22 times (within the cutoff of 20 * 0.2), then it would be removed
// from the results. If "national broadband" was used more than the cutoff,
// e.g. 30 times, it would be left in the results.
exports.combine = function(phrases, cutoff){
exports.combine = function(phrases, cutoff, min){
var combined = _.clone(phrases);

_.each(_.keys(phrases), function(phrase){
var ngramToTry, subPhrases;
var ngramToTry;

// Skip this check if the composite phrase doesn't meet the minimum
// requirements for the results.
if (phrases[phrase] < min) return;

ngramToTry = phrase.split(' ').length - 1;

if (ngramToTry < 1) return;
Expand Down Expand Up @@ -234,7 +241,7 @@ Tf.prototype.listMostFrequestTerms = function(d) {
};

function whitelisted(term, startWords){
return startWords.indexOf(term) !== -1;
return _.indexOf(startWords, term) !== -1;
}

function blacklisted(term, extraStopWords){
Expand All @@ -246,8 +253,15 @@ function blacklisted(term, extraStopWords){
}

function usePhrase(phrase, options){
return whitelisted(phrase, options.startWords) ||
!_.detect(phrase.split(' '), function(term){
return blacklisted(term, options.stopWords);
// A phrase is usable unless at least one of its terms is blacklisted and not whitelisted.
var doUsePhrase = !_.find(phrase.split(' '), function(term){
return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords);
});
// if options specify, exclude phrases that are only startWords
if (doUsePhrase && options.ignoreStartWordOnlyPhrases) {
doUsePhrase = !_.every(phrase.split(' '), function(term){
return whitelisted(term, options.startWords);
});
}
return doUsePhrase;
}
12 changes: 6 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "gramophone",
"version": "0.0.3",
"version": "0.0.5",
"main": "index.js",
"description": "extracts most frequently used keywords and phrases from text",
"keywords": [
Expand All @@ -14,12 +14,12 @@
},
"license": "MIT",
"dependencies": {
"lodash": "~0.8.2",
"natural": "~0.1.18",
"event-stream": "~3.0.8",
"underscore.string": "~2.3.1"
"event-stream": "^3.3.2",
"lodash": "^4.11.1",
"natural": "^0.4.0",
"underscore.string": "^3.3.4"
},
"devDependencies": {
"tap": "~0.3.3"
"tap": "^5.7.1"
}
}
23 changes: 23 additions & 0 deletions test/extract.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ test('with {startWords: [word]} as option', function(t){
t.end();
});

test('that start words aren\'t only matched when they\'re the entire ngram', function(t){
var options = { startWords: ['is'] };
var results = k.extract(text, options);
t.ok(results.indexOf('beep beep is') !== -1, 'treat start words as words');
t.end();
});

test('with {ngram: number} as option', function(t){
var text = "test node code. And test and node and code and node. And test node code";
var options = { ngrams: [3] };
Expand Down Expand Up @@ -107,6 +114,22 @@ test('with {cutoff: float} as option', function(t){
t.end();
});

test('with {ignoreStartWordOnlyPhrases: true} as option', function(t){
var text = "foo but bar and not with foo but not with bar";
var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true};
var results = k.extract(text, options);
t.ok(t.deepEqual(results, ['foo but', 'bar']), "should not include phrases of only start words");
t.end();
});

test('default cutoff should not exclude component phrases when composite is below minimum', function(t){
var text = "foo but bar and not with foo but not with bar";
var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true};
var results = k.extract(text, options);
t.ok(results.indexOf('bar') > -1, "should not exclude component phrases when composite is below minimum");
t.end();
});

test('extract apostrophe', function (t){
var text = "Today is 15 July - St Swithin's Day. Legend has it that if it rains on St Swithin's Day then the wet weather will continue for 40 days.";
var options = {alternativeTokenizer: true};
Expand Down