diff --git a/README.md b/README.md index 57ca1f7..6a42dc2 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,18 @@ that respects these characters. keyword.extract('Lörem Ipsüm Lörem Ipsüm.', {alternativeTokenizer: true}) ``` +#### Option: ignoreStartWordOnlyPhrases + +Setting `ignoreStartWordOnlyPhrases` to `true` will exclude phrases that are made up +entirely of words in the `startWords` list. + +```js +keyword.extract('foo but bar and not with foo but not with bar', + {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}) +``` + +Returns `['foo but', 'bar']` + -------------------------------------------------------- ### gramophone.stream([options]) diff --git a/index.js b/index.js index 28e29f8..4e5f3c7 100644 --- a/index.js +++ b/index.js @@ -41,6 +41,8 @@ exports.extract = function(text, options){ options.ngrams = [1, 2, 3]; }else if (typeof options.ngrams === 'number'){ options.ngrams = [options.ngrams]; + }else{ + options.ngrams = _.map(options.ngrams, Number); } if (!options.cutoff) options.cutoff = 0.5; if (!options.min) options.min = 2; @@ -80,7 +82,7 @@ exports.extract = function(text, options){ }); tf.addDocument(tokenized); keywordsForNgram = tf.listMostFrequestTerms(0); - keywordsForNgram = _.select(keywordsForNgram, function(item){ + keywordsForNgram = _.filter(keywordsForNgram, function(item){ return usePhrase(item.term, options); }); results = results.concat(keywordsForNgram); @@ -92,11 +94,11 @@ exports.extract = function(text, options){ }); // Combine results from each ngram to remove redundancy phrases - combined = exports.combine(combinedResults, options.cutoff); + combined = exports.combine(combinedResults, options.cutoff, options.min); // Convert to a list of objects sorted by tf (term frequency) combined = _.chain(combined) - .pairs() + .toPairs() .sortBy(_.last) .reverse() .map(function(combination){ return {term: combination[0], tf: combination[1] }; }) @@ -104,7 +106,7 @@ exports.extract = function(text, options){ // Only return results over a given frequency (default is 2 or more) if (options.min){ - combined = _.select(combined, function(result){ + combined = _.filter(combined, function(result){ return result.tf >= options.min; }); } @@ -130,7 +132,7 @@ exports.extract = function(text, options){ ); }else{ // Return results with scores or without depending on options - combined = options.score ? combined : _.pluck(combined, 'term'); + combined = options.score ? combined : _.map(combined, 'term'); } @@ -197,11 +199,16 @@ exports.transformStream = function(options){ // was used 22 times (within the cutoff of 20 * 0.2), then it would be removed // from the results. If "national broadband" was used more than the cutoff, // e.g. 30 times, it would be left in the results. -exports.combine = function(phrases, cutoff){ +exports.combine = function(phrases, cutoff, min){ var combined = _.clone(phrases); _.each(_.keys(phrases), function(phrase){ - var ngramToTry, subPhrases; + var ngramToTry; + + // Skip this check if the composite phrase doesn't meet the minimum + // requirements for the results. + if (phrases[phrase] < min) return; + ngramToTry = phrase.split(' ').length - 1; if (ngramToTry < 1) return; @@ -234,7 +241,7 @@ Tf.prototype.listMostFrequestTerms = function(d) { }; function whitelisted(term, startWords){ - return startWords.indexOf(term) !== -1; + return _.indexOf(startWords, term) !== -1; } function blacklisted(term, extraStopWords){ @@ -246,8 +253,15 @@ function blacklisted(term, extraStopWords){ } function usePhrase(phrase, options){ - return whitelisted(phrase, options.startWords) || - !_.detect(phrase.split(' '), function(term){ - return blacklisted(term, options.stopWords); + // check if any of the terms in the phrase are blacklisted and not whitelisted. If not, it's usable + var doUsePhrase = !_.find(phrase.split(' '), function(term){ + return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords); }); + // if options specify, exclude phrases that are only startWords + if (doUsePhrase && options.ignoreStartWordOnlyPhrases) { + doUsePhrase = !_.every(phrase.split(' '), function(term){ + return whitelisted(term, options.startWords); + }); + } + return doUsePhrase; } diff --git a/package.json b/package.json index d960899..83d9709 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gramophone", - "version": "0.0.3", + "version": "0.0.5", "main": "index.js", "description": "extracts most frequently used keywords and phrases from text", "keywords": [ @@ -14,12 +14,12 @@ }, "license": "MIT", "dependencies": { - "lodash": "~0.8.2", - "natural": "~0.1.18", - "event-stream": "~3.0.8", - "underscore.string": "~2.3.1" + "event-stream": "^3.3.2", + "lodash": "^4.11.1", + "natural": "^0.4.0", + "underscore.string": "^3.3.4" }, "devDependencies": { - "tap": "~0.3.3" + "tap": "^5.7.1" } } diff --git a/test/extract.js b/test/extract.js index a5d2232..97e4894 100644 --- a/test/extract.js +++ b/test/extract.js @@ -77,6 +77,13 @@ test('with {startWords: [word]} as option', function(t){ t.end(); }); +test('that start words aren\'t only matched when they\'re the entire ngram', function(t){ + var options = { startWords: ['is'] }; + var results = k.extract(text, options); + t.ok(results.indexOf('beep beep is') !== -1, 'treat start words as words'); + t.end(); +}); + test('with {ngram: number} as option', function(t){ var text = "test node code. And test and node and code and node. And test node code"; var options = { ngrams: [3] }; @@ -107,6 +114,22 @@ test('with {cutoff: float} as option', function(t){ t.end(); }); +test('with {ignoreStartWordOnlyPhrases: true} as option', function(t){ + var text = "foo but bar and not with foo but not with bar"; + var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}; + var results = k.extract(text, options); + t.ok(t.deepEqual(results, ['foo but', 'bar']), "should not include phrases of only start words"); + t.end(); +}); + +test('default cutoff should not exclude component phrases when composite is below minimum', function(t){ + var text = "foo but bar and not with foo but not with bar"; + var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true}; + var results = k.extract(text, options); + t.ok(results.indexOf('bar') > -1, "should not exclude component phrases when composite is below minimum"); + t.end(); +}); + test('extract apostrophe', function (t){ var text = "Today is 15 July - St Swithin's Day. Legend has it that if it rains on St Swithin's Day then the wet weather will continue for 40 days."; var options = {alternativeTokenizer: true};