From 19250509a4f996de7e16649252c6837f7dd68f60 Mon Sep 17 00:00:00 2001 From: Elliot Vos Date: Fri, 29 Apr 2016 11:36:39 -0400 Subject: [PATCH 1/5] startWords should be treated as words to whitelist, not entire phrases --- index.js | 7 +++---- test/extract.js | 7 +++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/index.js b/index.js index 28e29f8..15570ea 100644 --- a/index.js +++ b/index.js @@ -234,7 +234,7 @@ Tf.prototype.listMostFrequestTerms = function(d) { }; function whitelisted(term, startWords){ - return startWords.indexOf(term) !== -1; + return _.indexOf(startWords, term) !== -1; } function blacklisted(term, extraStopWords){ @@ -246,8 +246,7 @@ function blacklisted(term, extraStopWords){ } function usePhrase(phrase, options){ - return whitelisted(phrase, options.startWords) || - !_.detect(phrase.split(' '), function(term){ - return blacklisted(term, options.stopWords); + return !_.detect(phrase.split(' '), function(term){ + return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords); }); } diff --git a/test/extract.js b/test/extract.js index a5d2232..2f1c50b 100644 --- a/test/extract.js +++ b/test/extract.js @@ -77,6 +77,13 @@ test('with {startWords: [word]} as option', function(t){ t.end(); }); +test('that start words aren\'t only matched when they\'re the entire ngram', function(t){ + var options = { startWords: ['is'] }; + var results = k.extract(text, options); + t.ok(results.indexOf('beep beep is') !== -1, 'treat start words as words'); + t.end(); +}); + test('with {ngram: number} as option', function(t){ var text = "test node code. And test and node and code and node. And test node code"; var options = { ngrams: [3] }; From 617b996ca7ab5fc762cea5976617fe20489ca847 Mon Sep 17 00:00:00 2001 From: Elliot Vos Date: Fri, 29 Apr 2016 14:26:20 -0400 Subject: [PATCH 2/5] update all packages to current versions --- index.js | 10 +++++----- package.json | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/index.js b/index.js index 15570ea..85139bf 100644 --- a/index.js +++ b/index.js @@ -80,7 +80,7 @@ exports.extract = function(text, options){ }); tf.addDocument(tokenized); keywordsForNgram = tf.listMostFrequestTerms(0); - keywordsForNgram = _.select(keywordsForNgram, function(item){ + keywordsForNgram = _.filter(keywordsForNgram, function(item){ return usePhrase(item.term, options); }); results = results.concat(keywordsForNgram); @@ -96,7 +96,7 @@ exports.extract = function(text, options){ // Convert to a list of objects sorted by tf (term frequency) combined = _.chain(combined) - .pairs() + .toPairs() .sortBy(_.last) .reverse() .map(function(combination){ return {term: combination[0], tf: combination[1] }; }) @@ -104,7 +104,7 @@ exports.extract = function(text, options){ // Only return results over a given frequency (default is 2 or more) if (options.min){ - combined = _.select(combined, function(result){ + combined = _.filter(combined, function(result){ return result.tf >= options.min; }); } @@ -130,7 +130,7 @@ exports.extract = function(text, options){ ); }else{ // Return results with scores or without depending on options - combined = options.score ? combined : _.pluck(combined, 'term'); + combined = options.score ? combined : _.map(combined, 'term'); } @@ -246,7 +246,7 @@ function blacklisted(term, extraStopWords){ } function usePhrase(phrase, options){ - return !_.detect(phrase.split(' '), function(term){ + return !_.find(phrase.split(' '), function(term){ return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords); }); } diff --git a/package.json b/package.json index d960899..7baa960 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gramophone", - "version": "0.0.3", + "version": "0.0.4", "main": "index.js", "description": "extracts most frequently used keywords and phrases from text", "keywords": [ @@ -14,12 +14,12 @@ }, "license": "MIT", "dependencies": { - "lodash": "~0.8.2", - "natural": "~0.1.18", - "event-stream": "~3.0.8", - "underscore.string": "~2.3.1" + "event-stream": "^3.3.2", + "lodash": "^4.11.1", + "natural": "^0.4.0", + "underscore.string": "^3.3.4" }, "devDependencies": { - "tap": "~0.3.3" + "tap": "^5.7.1" } } From 6f670e21c7ba7ea20cb8cae535b6903eaa19b593 Mon Sep 17 00:00:00 2001 From: Elliot Vos Date: Thu, 12 May 2016 15:40:16 -0400 Subject: [PATCH 3/5] add setting to ignore phrases that are just added start words --- index.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 85139bf..b887a37 100644 --- a/index.js +++ b/index.js @@ -246,7 +246,15 @@ function blacklisted(term, extraStopWords){ } function usePhrase(phrase, options){ - return !_.find(phrase.split(' '), function(term){ + // check if any of the terms in the phrase are blacklisted and not whitelisted. If not, it's usable + var doUsePhrase = !_.find(phrase.split(' '), function(term){ return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords); }); + // if options specify, exclude phrases that are only startWords + if (doUsePhrase && options.ignoreStartWordOnlyPhrases) { + doUsePhrase = !_.every(phrase.split(' '), function(term){ + return whitelisted(term, options.startWords); + }); + } + return doUsePhrase; } From 6c7cf313752a63ae3b538f9b4dfbaa78b2a4e5cf Mon Sep 17 00:00:00 2001 From: Elliot Vos Date: Fri, 13 May 2016 10:59:04 -0400 Subject: [PATCH 4/5] Add documentation for new option. Fix bug with cutoff excluding component subphrases when composite phrase is below minimum threshold for results. Fix bug with string numbers being passed for ngrams option causing memory crash. --- README.md | 12 ++++++++++++ index.js | 13 ++++++++++--- test/extract.js | 16 ++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 57ca1f7..6a42dc2 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,18 @@ that respects these characters. keyword.extract('Lörem Ipsüm Lörem Ipsüm.', {alternativeTokenizer: true}) ``` +#### Option: ignoreStartWordOnlyPhrases + +Setting `ignoreStartWordOnlyPhrases` to `true` will exclude phrases that are made up +entirely of words in the `startWords` list. + +```js +keyword.extract('foo but bar and not with foo but not with bar', + {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}) +``` + +Returns `['foo but', 'bar']` + -------------------------------------------------------- ### gramophone.stream([options]) diff --git a/index.js b/index.js index b887a37..4e5f3c7 100644 --- a/index.js +++ b/index.js @@ -41,6 +41,8 @@ exports.extract = function(text, options){ options.ngrams = [1, 2, 3]; }else if (typeof options.ngrams === 'number'){ options.ngrams = [options.ngrams]; + }else{ + options.ngrams = _.map(options.ngrams, Number); } if (!options.cutoff) options.cutoff = 0.5; if (!options.min) options.min = 2; @@ -92,7 +94,7 @@ exports.extract = function(text, options){ }); // Combine results from each ngram to remove redundancy phrases - combined = exports.combine(combinedResults, options.cutoff); + combined = exports.combine(combinedResults, options.cutoff, options.min); // Convert to a list of objects sorted by tf (term frequency) combined = _.chain(combined) @@ -197,11 +199,16 @@ exports.transformStream = function(options){ // was used 22 times (within the cutoff of 20 * 0.2), then it would be removed // from the results. If "national broadband" was used more than the cutoff, // e.g. 30 times, it would be left in the results. -exports.combine = function(phrases, cutoff){ +exports.combine = function(phrases, cutoff, min){ var combined = _.clone(phrases); _.each(_.keys(phrases), function(phrase){ - var ngramToTry, subPhrases; + var ngramToTry; + + // Skip this check if the composite phrase doesn't meet the minimum + // requirements for the results. + if (phrases[phrase] < min) return; + ngramToTry = phrase.split(' ').length - 1; if (ngramToTry < 1) return; diff --git a/test/extract.js b/test/extract.js index 2f1c50b..97e4894 100644 --- a/test/extract.js +++ b/test/extract.js @@ -114,6 +114,22 @@ test('with {cutoff: float} as option', function(t){ t.end(); }); +test('with {ignoreStartWordOnlyPhrases: true} as option', function(t){ + var text = "foo but bar and not with foo but not with bar"; + var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}; + var results = k.extract(text, options); + t.ok(t.deepEqual(results, ['foo but', 'bar']), "should not include phrases of only start words"); + t.end(); +}); + +test('default cutoff should not exclude component phrases when composite is below minimum', function(t){ + var text = "foo but bar and not with foo but not with bar"; + var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}; + var results = k.extract(text, options); + t.ok(results.indexOf('bar') > -1, "should not exclude component phrases when composite is below minimum"); + t.end(); +}); + test('extract apostrophe', function (t){ var text = "Today is 15 July - St Swithin's Day. Legend has it that if it rains on St Swithin's Day then the wet weather will continue for 40 days."; var options = {alternativeTokenizer: true}; From eb3d0ba6efef854bd6a328ca43251d674d81d1e1 Mon Sep 17 00:00:00 2001 From: Elliot Vos Date: Fri, 13 May 2016 10:59:27 -0400 Subject: [PATCH 5/5] bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7baa960..83d9709 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gramophone", - "version": "0.0.4", + "version": "0.0.5", "main": "index.js", "description": "extracts most frequently used keywords and phrases from text", "keywords": [