edlea · elvo86 · Apr 29, 2016 · Apr 29, 2016 · May 12, 2016 · May 12, 2016
diff --git a/README.md b/README.md
@@ -182,6 +182,18 @@ that respects these characters.
 keyword.extract('Lörem Ipsüm Lörem Ipsüm.', {alternativeTokenizer: true})
 ```
 
+#### Option: ignoreStartWordOnlyPhrases
+
+Setting `ignoreStartWordOnlyPhrases` to `true` will exclude phrases that are made up
+entirely of words in the `startWords` list.
+
+```js
+keyword.extract('foo but bar and not with foo but not with bar',
+  {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true})
+```
+
+Returns `['foo but', 'bar']`
+
 --------------------------------------------------------
 <a name="stream"></a>
 ### gramophone.stream([options])

diff --git a/index.js b/index.js
@@ -41,6 +41,8 @@ exports.extract = function(text, options){
     options.ngrams = [1, 2, 3];
   }else if (typeof options.ngrams === 'number'){
     options.ngrams = [options.ngrams];
+  }else{
+    options.ngrams = _.map(options.ngrams, Number);
   }
   if (!options.cutoff) options.cutoff = 0.5;
   if (!options.min) options.min = 2;
@@ -80,7 +82,7 @@ exports.extract = function(text, options){
     });
     tf.addDocument(tokenized);
     keywordsForNgram = tf.listMostFrequestTerms(0);
-    keywordsForNgram = _.select(keywordsForNgram, function(item){
+    keywordsForNgram = _.filter(keywordsForNgram, function(item){
       return usePhrase(item.term, options);
     });
     results = results.concat(keywordsForNgram);
@@ -92,19 +94,19 @@ exports.extract = function(text, options){
   });
 
   // Combine results from each ngram to remove redundancy phrases
-  combined = exports.combine(combinedResults, options.cutoff);
+  combined = exports.combine(combinedResults, options.cutoff, options.min);
 
   // Convert to a list of objects sorted by tf (term frequency)
   combined = _.chain(combined)
-    .pairs()
+    .toPairs()
     .sortBy(_.last)
     .reverse()
     .map(function(combination){ return {term: combination[0], tf: combination[1] }; })
     .value();
 
   // Only return results over a given frequency (default is 2 or more)
   if (options.min){
-    combined = _.select(combined, function(result){
+    combined = _.filter(combined, function(result){
       return result.tf >= options.min;
     });
   }
@@ -130,7 +132,7 @@ exports.extract = function(text, options){
     );
   }else{
     // Return results with scores or without depending on options
-    combined =  options.score ? combined : _.pluck(combined, 'term');
+    combined =  options.score ? combined : _.map(combined, 'term');
   }
 
 
@@ -197,11 +199,16 @@ exports.transformStream = function(options){
 // was used 22 times (within the cutoff of 20 * 0.2), then it would be removed
 // from the results. If "national broadband" was used more than the cutoff,
 // e.g. 30 times, it would be left in the results.
-exports.combine = function(phrases, cutoff){
+exports.combine = function(phrases, cutoff, min){
   var combined = _.clone(phrases);
 
   _.each(_.keys(phrases), function(phrase){
-    var ngramToTry, subPhrases;
+    var ngramToTry;
+
+    // Skip composite phrases whose own frequency is below `min`: they would be
+    // filtered out of the results anyway, so they must not suppress sub-phrases.
+    if (phrases[phrase] < min) return;
+
     ngramToTry = phrase.split(' ').length - 1;
 
     if (ngramToTry < 1) return;
@@ -234,7 +241,7 @@ Tf.prototype.listMostFrequestTerms = function(d) {
 };
 
 function whitelisted(term, startWords){
-  return startWords.indexOf(term) !== -1;
+  return _.indexOf(startWords, term) !== -1;
 }
 
 function blacklisted(term, extraStopWords){
@@ -246,8 +253,15 @@ function blacklisted(term, extraStopWords){
 }
 
 function usePhrase(phrase, options){
-  return whitelisted(phrase, options.startWords) ||
-    !_.detect(phrase.split(' '), function(term){
-      return blacklisted(term, options.stopWords);
+  // Usable unless some term of the phrase is blacklisted without also being whitelisted.
+  var doUsePhrase = !_.find(phrase.split(' '), function(term){
+      return blacklisted(term, options.stopWords) && !whitelisted(term, options.startWords);
     });
+  // With options.ignoreStartWordOnlyPhrases set, also reject phrases whose terms are all whitelisted.
+  if (doUsePhrase && options.ignoreStartWordOnlyPhrases) {
+    doUsePhrase = !_.every(phrase.split(' '), function(term){
+      return whitelisted(term, options.startWords);
+    });
+  }
+  return doUsePhrase;
 }
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "gramophone",
-  "version": "0.0.3",
+  "version": "0.0.5",
   "main": "index.js",
   "description": "extracts most frequently used keywords and phrases from text",
   "keywords": [
@@ -14,12 +14,12 @@
   },
   "license": "MIT",
   "dependencies": {
-    "lodash": "~0.8.2",
-    "natural": "~0.1.18",
-    "event-stream": "~3.0.8",
-    "underscore.string": "~2.3.1"
+    "event-stream": "^3.3.2",
+    "lodash": "^4.11.1",
+    "natural": "^0.4.0",
+    "underscore.string": "^3.3.4"
   },
   "devDependencies": {
-    "tap": "~0.3.3"
+    "tap": "^5.7.1"
   }
 }
diff --git a/test/extract.js b/test/extract.js
@@ -77,6 +77,13 @@ test('with {startWords: [word]} as option', function(t){
   t.end();
 });
 
+test('that start words aren\'t only matched when they\'re the entire ngram', function(t){
+  var options = { startWords: ['is'] };
+  var results = k.extract(text, options);
+  t.ok(results.indexOf('beep beep is') !== -1, 'treat start words as words');
+  t.end();
+});
+
 test('with {ngram: number} as option', function(t){
   var text = "test node code. And test and node and code and node. And test node code";
   var options = { ngrams: [3] };
@@ -107,6 +114,22 @@ test('with {cutoff: float} as option', function(t){
   t.end();
 });
 
+test('with {ignoreStartWordOnlyPhrases: true} as option', function(t){ // start-word-only phrases must be dropped
+  var text = "foo but bar and not with foo but not with bar";
+  var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true};
+  var results = k.extract(text, options);
+  t.deepEqual(results, ['foo but', 'bar'], "should not include phrases of only start words"); // deepEqual is itself the assertion
+  t.end();
+});
+
+test('default cutoff should not exclude component phrases when composite is below minimum', function(t){
+  var text = "foo but bar and not with foo but not with bar";
+  var options = {startWords: ['but', 'not', 'with'], ignoreStartWordOnlyPhrases: true};
+  var results = k.extract(text, options);
+  t.ok(results.indexOf('bar') > -1, "should not exclude component phrases when composite is below minimum");
+  t.end();
+});
+
 test('extract apostrophe', function (t){
   var text = "Today is 15 July - St Swithin's Day. Legend has it that if it rains on St Swithin's Day then the wet weather will continue for 40 days.";
   var options = {alternativeTokenizer: true};