fossology · Kaushl2208 · Aug 11, 2020
diff --git a/.travis.yml b/.travis.yml
@@ -9,6 +9,7 @@ python:
   - 3.5
   - 3.6
   - 3.7
+  - 3.8
 
 before_install:
   - pip install --upgrade pip
@@ -23,3 +24,6 @@ script:
   - atarashi -a tfidf -s CosineSim ./atarashi/atarashii.py
   - atarashi -a DLD ./atarashi/atarashii.py
   - atarashi -a wordFrequencySimilarity ./atarashi/atarashii.py
+  - atarashi -a lr_classifier ./atarashi/atarashii.py
+  - atarashi -a svc_classifier ./atarashi/atarashii.py
+  - atarashi -a nb_classifier ./atarashi/atarashii.py
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -11,6 +11,7 @@ include requirements*.*
 include pyproject.toml
 include atarashi/data/licenses/processedLicenses.csv
 include atarashi/data/Ngram_keywords.json
+include atarashi/data/models/*
 
 prune .git
 prune venv

diff --git a/README.md b/README.md
@@ -81,6 +81,22 @@ Get the help by running `atarashi -h` or `atarashi --help`
     - With **Bigram Cosine similarity**
 
         `atarashi -a Ngram -s BigramCosineSim /path/to/file.c`
+- **Classification models**
+    - **Training** (optional)
+
+        `python3 atarashi/agents/models/train.py`
+    - Running **Classification Models**
+
+
+        - **Logistic Regression**
+
+            `atarashi -a lr_classifier /path/to/file.c`
+        - **Multinomial Naive Bayes**
+
+            `atarashi -a nb_classifier /path/to/file.c`
+        - **Linear SVC** 
+
+            `atarashi -a svc_classifier /path/to/file.c`
 - Running in **verbose** mode
 
     `atarashi -a DLD -v /path/to/file.c`
@@ -89,6 +105,14 @@ Get the help by running `atarashi -h` or `atarashi --help`
     understandable by atarashi.
     - `atarashi -a DLD -l /path/to/processedList.csv /path/to/file.c`
     - `atarashi -a Ngram -l /path/to/processedList.csv -j /path/to/ngram.json /path/to/file.c`
+- Running with a custom folder containing all the binary files
+    - Use the `-m`/`--models` input parameter to provide the location of the model folder containing all the required binary files.
+     E.g. 
+
+        ` atarashi -m path/to/custom/model/folder/ -a classifier_name /path/to/file.c`
+
+    *Note: This parameter is optional; when it is omitted, the default folder with the bundled binary files is used.*
+
 
 ### Running Docker image
 1. Pull Docker image

diff --git a/atarashi/agents/models/__init__.py b/atarashi/agents/models/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from .test import Model as Model
diff --git a/atarashi/agents/models/test.py b/atarashi/agents/models/test.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 Kaushlendra Pratap (kaushlendrapratap.9837@gmail.com)
+
+SPDX-License-Identifier: GPL-2.0
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+import joblib
+import os
+import argparse
+from atarashi.agents.atarashiAgent import AtarashiAgent
+from atarashi.libs.initialmatch import spdx_identifer
+
+
+class Model(AtarashiAgent):
+
+  '''
+  Atarashi agent that predicts a license with a pre-trained classification model.
+
+  Model inherits from AtarashiAgent in order to follow the same interface as
+  the other agents. The parent constructor receives the license list and the
+  parent loadFile() provides the text that is handed to the classifier.
+
+  Supported algorithm names: lr_classifier, nb_classifier and svc_classifier.
+
+  :Method_1(__init__): Store the absolute path of the models directory.
+  :Method_2(similarity_calc): Predict the license name for a processed comment.
+  :Method_3(model_predict): Build the result list for an input file.
+  :Method_4(getSimAlgo): Getter for the selected algorithm.
+  :Method_5(setSimAlgo): Setter for the algorithm to use.
+  :Method_6(scan): Run the prediction if a supported algorithm is selected.
+
+  '''
+
+  lr_classifier = "lr_classifier"
+  nb_classifier = "nb_classifier"
+  svc_classifier = "svc_classifier"
+
+  # Serialized model file (inside the models folder) for every algorithm name.
+  _model_files = {
+    lr_classifier: 'lr_model.pkl',
+    nb_classifier: 'nb_model.pkl',
+    svc_classifier: 'svc_model.pkl',
+  }
+
+  # Class-level default so that getSimAlgo() and scan() do not raise
+  # AttributeError when setSimAlgo() was never called or rejected the name.
+  algo = None
+
+  def __init__(self, licenseList, modelsLoc=None):
+    '''
+    :param licenseList: License list handed to the AtarashiAgent constructor.
+    :param modelsLoc: Folder containing vectorizer.pkl and the model .pkl
+      files. If None, the data/models folder of the package is used.
+    '''
+    super().__init__(licenseList)
+    if modelsLoc is None:
+      # This file lives in atarashi/agents/models/, the bundled binaries in
+      # atarashi/data/models/ (two levels up, then data/models).
+      modelsLoc = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                               os.pardir, os.pardir, 'data', 'models')
+    self.models_folder = os.path.abspath(modelsLoc)
+
+  def similarity_calc(self, processed_comment):
+
+    '''
+    Predict the license of a processed comment with the selected model.
+
+    The vectorizer and the model matching the selected algorithm are loaded
+    from the models folder on every call.
+
+    :param processed_comment: Pre-processed string derived from the input extracted license.
+    :return: The output of the classifier's predict() for the single input
+      text; its first element is used as the predicted license name.
+    :raises ValueError: If no supported algorithm has been selected.
+
+    '''
+
+    model_file = self._model_files.get(self.getSimAlgo())
+    if model_file is None:
+      raise ValueError("Unsupported classifier: {}".format(self.getSimAlgo()))
+
+    # NOTE(security): joblib.load unpickles the files and can therefore run
+    # arbitrary code; only use model folders from a trusted source.
+    loaded_vect = joblib.load(os.path.join(self.models_folder, 'vectorizer.pkl'))
+    classifier = joblib.load(os.path.join(self.models_folder, model_file))
+
+    return classifier.predict(loaded_vect.transform([processed_comment]))
+
+  def model_predict(self, filePath):
+
+    '''
+    Scan a file and return the SPDX identifier matches followed by the
+    license predicted by the selected model.
+
+    :param filePath: Input file path to scan
+    :return: Result with license shortname, sim_score, sim_type and description.
+      The entry of the model always has a sim_score of 1.
+    :rtype: list(JSON Format)
+    '''
+
+    match = []
+
+    with open(filePath) as file:
+      raw_data = file.read()
+
+    # Match SPDX identifiers
+    spdx_identifiers = spdx_identifer(raw_data, self.licenseList['shortname'])
+    match.extend(spdx_identifiers)
+
+    processed_comment = super().loadFile(filePath)
+    license_name = self.similarity_calc(processed_comment)
+
+    match.append({
+      'shortname': str(license_name[0]),
+      'sim_score': 1,
+      'sim_type': self.getSimAlgo(),
+      'description': "Shortname: is the predicted license by the model"
+    })
+    return match
+
+  def getSimAlgo(self):
+    '''
+    :return: Name of the selected algorithm, or None if none has been set.
+    '''
+    return self.algo
+
+  def setSimAlgo(self, newAlgo):
+    '''
+    Select the classifier to use. Unsupported names are ignored and leave the
+    current selection unchanged.
+
+    :param newAlgo: One of lr_classifier, nb_classifier or svc_classifier.
+    '''
+    if newAlgo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier):
+      self.algo = newAlgo
+
+  def scan(self, filePath):
+    '''
+    Run the prediction for a file if a supported algorithm is selected.
+
+    :param filePath: Input file path to scan
+    :return: Result list of model_predict(), or -1 if no supported algorithm
+      is selected.
+    '''
+    if self.algo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier):
+      return self.model_predict(filePath)
+    else:
+      return -1
+
+
+# Command line entry point: scan one file with the chosen classifier and
+# print the result.
+if __name__ == "__main__":
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument("processedLicenseList", help="Specify the processed license list file")
+  parser.add_argument("modelFolder", help="Specify the location of folder with models")
+  parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
+  parser.add_argument("-m","--modelname",default="lr_classifier",choices=["lr_classifier","nb_classifier","svc_classifier"], help = "Specify the model name")
+  args = parser.parse_args()
+
+  licenseList = args.processedLicenseList
+  filename = args.inputFile
+  model = args.modelname
+  modelFolder = args.modelFolder
+
+  scanner = Model(licenseList, modelFolder)
+  scanner.setSimAlgo(model)
+  # Print the scan result so that a command line run actually produces output.
+  print("License detected using " + model + ": " + str(scanner.scan(filename)))
diff --git a/atarashi/agents/models/train.py b/atarashi/agents/models/train.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 Kaushlendra Pratap (kaushlendrapratap.9837@gmail.com)
+
+SPDX-License-Identifier: GPL-2.0
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+import pandas as pd
+import os
+import joblib
+from atarashi.libs.commentPreprocessor import CommentPreprocessor
+from sklearn.svm import LinearSVC
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+
+
+
+def model_train():
+
+  '''
+  Train the three license classifiers and store them in the data/models folder.
+
+  The license list CSV is loaded into a Pandas DataFrame, the columns that are
+  not needed are dropped, rows with missing values are removed and every
+  license text is cleaned with CommentPreprocessor.preprocess. A TF-IDF
+  vectorizer is fitted on the cleaned texts and a Multinomial Naive Bayes, a
+  Logistic Regression and a Linear SVC model are trained on the resulting
+  features with the license shortname as label.
+
+  The fitted vectorizer is stored as vectorizer.pkl together with the three
+  models. It has to produce the same TF-IDF features the models were trained
+  on, because at prediction time only transform() of the loaded vectorizer is
+  applied to the text before predict() is called.
+
+  '''
+
+  # Function-scope import: TfidfVectorizer is only needed for training. It is
+  # equivalent to CountVectorizer followed by TfidfTransformer, but keeps both
+  # steps in one object that can be stored as a single file.
+  from sklearn.feature_extraction.text import TfidfVectorizer
+
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  # Two levels above the folder of this file, i.e. the folder that contains
+  # the data directory.
+  package_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
+
+  licensepath = os.path.join(package_dir, "data/licenses/licenseList.csv")
+  models_dir = os.path.join(package_dir, 'data/models')
+  lr_path = os.path.join(models_dir, 'lr_model.pkl')
+  nb_path = os.path.join(models_dir, 'nb_model.pkl')
+  svc_path = os.path.join(models_dir, 'svc_model.pkl')
+  vectorizer_path = os.path.join(models_dir, 'vectorizer.pkl')
+
+  data = pd.read_csv(licensepath)
+  data.drop(['parent_shortname', 'report_shortname', 'url', 'notes', 'source', 'risk','fullname'], axis = 1, inplace = True)
+  data.dropna(inplace=True)
+  data['text'] = data['text'].astype(str)
+  data['cleaned'] = data['text'].apply(CommentPreprocessor.preprocess)
+
+  X_train, y_train = data['cleaned'], data['shortname']
+  vectorizer = TfidfVectorizer()
+  X_train_tfidf = vectorizer.fit_transform(X_train)
+
+  ##Initialisation of Models and creating
+  naive_bayes = MultinomialNB()
+  l_regress = LogisticRegression()
+  svc_classifier = LinearSVC()
+
+  print("Model training is going on")
+  naive_bayes.fit(X_train_tfidf, y_train)
+  print("First training completed")
+  l_regress.fit(X_train_tfidf, y_train)
+  print("Second training completed")
+  svc_classifier.fit(X_train_tfidf, y_train)
+  print("Third training completed")
+
+  print("All the models have been trained perfectly!!")
+  print("Saving the models into data folder....")
+  # Make sure the output folder exists before the binaries are written.
+  os.makedirs(models_dir, exist_ok=True)
+  joblib.dump(naive_bayes, nb_path)
+  joblib.dump(l_regress, lr_path)
+  joblib.dump(svc_classifier, svc_path)
+  joblib.dump(vectorizer, vectorizer_path)
+  print("Done")
+
+
+
+# Run the training only when this file is executed as a script.
+if __name__ == "__main__":
+  model_train()
diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py
@@ -27,19 +27,23 @@
 from atarashi.agents.dameruLevenDist import DameruLevenDist
 from atarashi.agents.tfidf import TFIDF
 from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
+from atarashi.agents.models import Model
 
 __author__ = "Aman Jain"
 __email__ = "amanjain5221@gmail.com"
 __version__ = "0.0.10"
 
 
-def atarashii_runner(inputFile, processedLicense, agent_name, similarity="CosineSim", ngramJsonLoc=None, verbose=None):
+def atarashii_runner(inputFile, processedLicense, agent_name,
+                     similarity="CosineSim", ngramJsonLoc=None, modelsLoc=None,
+                     verbose=None):
   '''
   :param inputFile: Input File for scanning of license
   :param processedLicense: Processed License List (CSV) path (Default path already provided)
   :param agent_name: Specify the agent that you want to use for scanning
   :param similarity: Specify the similarity type to be used for the particular agent
   :param ngramJsonLoc: Specify N-Gram Json File location
+  :param modelsLoc: Specify folder location of trained models
   :param verbose: Specify if verbose mode is on or not (Default is Off/ None)
   :return: Returns the array of JSON with scan results
 
@@ -56,6 +60,9 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine
   scanner = ""
   if agent_name == "wordFrequencySimilarity":
     scanner = WordFrequencySimilarity(processedLicense)
+  elif agent_name in ("lr_classifier", "svc_classifier", "nb_classifier"):
+    scanner = Model(processedLicense, modelsLoc)
+    scanner.setSimAlgo(agent_name)
   elif agent_name == "DLD":
     scanner = DameruLevenDist(processedLicense)
   elif agent_name == "tfidf":
@@ -91,19 +98,23 @@ def main():
   '''
   defaultProcessed = resource_filename("atarashi", "data/licenses/processedLicenses.csv")
   defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
+  defaultModels = os.path.dirname(resource_filename("atarashi", "data/models/vectorizer.pkl"))
   parser = argparse.ArgumentParser()
   parser.add_argument("inputFile", help="Specify the input file path to scan")
   parser.add_argument("-l", "--processedLicenseList", required=False,
                       help="Specify the location of processed license list file")
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity','lr_classifier','svc_classifier','nb_classifier' ,'DLD', 'tfidf', 'Ngram'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
                       help="Specify the similarity algorithm that you want."
                            " First 2 are for TFIDF and last 3 are for Ngram")
   parser.add_argument("-j", "--ngram_json", required=False,
                       help="Specify the location of Ngram JSON (for Ngram agent only)")
+  parser.add_argument("-m", "--models", required=False,
+                      help="Specify the location of models folder (for "
+                           "classifier agents only)", default=defaultModels)
   parser.add_argument("-v", "--verbose", help="increase output verbosity",
                       action="count", default=0)
   parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__)
@@ -114,13 +125,17 @@ def main():
   verbose = args.verbose
   processedLicense = args.processedLicenseList
   ngram_json = args.ngram_json
+  models = args.models
 
   if processedLicense is None:
     processedLicense = defaultProcessed
   if ngram_json is None:
     ngram_json = defaultJSON
+  if models is None:
+    models = defaultModels
 
-  result = atarashii_runner(inputFile, processedLicense, agent_name, similarity, ngram_json, verbose)
+  result = atarashii_runner(inputFile, processedLicense, agent_name, similarity,
+                            ngram_json, models, verbose)
   if agent_name == "wordFrequencySimilarity":
     result = [{
             "shortname": str(result),

diff --git a/atarashi/data/models/lr_model.pkl b/atarashi/data/models/lr_model.pkl
diff --git a/atarashi/data/models/nb_model.pkl b/atarashi/data/models/nb_model.pkl
diff --git a/atarashi/data/models/svc_model.pkl b/atarashi/data/models/svc_model.pkl
diff --git a/atarashi/data/models/vectorizer.pkl b/atarashi/data/models/vectorizer.pkl