diff --git a/.dockerignore b/.dockerignore
index e69de29..0bd0f5b 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+src/data/*
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index e68495e..7865546 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,14 @@
+.idea/
outputs/
-data/
+data/*
venv/
test/
runs/
scikit_model
cache_dir/
__pycache__/
+wandb/
+src.iml
+/src/tb_logs/
+/src/tenderclass/
+/src/tenderclass-backend-src/
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 0e40fe8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-
-# Default ignored files
-/workspace.xml
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index ba24381..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 84c84bf..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/tenderclass-backend.iml b/.idea/tenderclass-backend.iml
deleted file mode 100644
index 7c9d48f..0000000
--- a/.idea/tenderclass-backend.iml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 80ec5ac..4d200f4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,11 +9,15 @@ WORKDIR /
#RUN apk add make gcc musl-dev g++
+RUN pip install --upgrade pip
+RUN pip install cython
+
+
+RUN pip install git+https://github.com/huggingface/transformers.git
+
# we need to install further python packages which are listed in requirements.txt
COPY requirements.txt ./
-RUN pip install --upgrade pip
-RUN pip install cython
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
diff --git a/README.md b/README.md
index 30371c8..2dacb70 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,12 @@
# tuwien-inso-bachelorthesis-tenderclass-backend
-tenderclass is an automated screening system for public procurement notices using state-of-the-art Machine Learning and Natural Language Processing (NLP) frameworks. This git repository holds the Python-based backend of tenderclass. It is responsible for downloading, parsing and classifying tenders from Tenders Electronic Daily (TED). For this reason, this prototype implements two Machine Learning approaches:
+tenderclass is an automated screening system for public procurement notices using state-of-the-art Machine Learning and Natural Language Processing (NLP) frameworks. This git repository holds the Python-based backend of tenderclass. It is responsible for downloading, parsing and classifying tenders from Tenders Electronic Daily (TED). For this reason, this prototype implements the following Machine Learning approaches:
-- SpacyScikitModel: Machine Learning Model based on [spaCy](https://spacy.io/) and [scikit-learn](https://scikit-learn.org/stable/)
-- TransformerModel: Machine Learning Model based on [Hugging Face](https://github.com/huggingface/transformers) and [simpletransformers](https://github.com/ThilinaRajapakse/simpletransformers)
+- SpacyScikitModel: Machine Learning Model based on [spaCy](https://spacy.io/) and [scikit-learn](https://scikit-learn.org/stable/) (titles only)
+- TransformerModel: Machine Learning Model based on [Hugging Face](https://github.com/huggingface/transformers) and [simpletransformers](https://github.com/ThilinaRajapakse/simpletransformers) (titles only)
+- FullTextSvmModel: Machine Learning Model based on [spaCy](https://spacy.io/) and [scikit-learn](https://scikit-learn.org/stable/)
+- FastTextModel: Machine Learning Model based on [FastText](https://fasttext.cc/)
+- FullTextTransformerModel: Machine Learning Model based on [Hugging Face](https://github.com/huggingface/transformers) and [PyTorch](https://pytorch.org/) with [PytorchLightning](https://www.pytorchlightning.ai/)
## Getting Started
@@ -14,7 +17,7 @@ These instructions will get you a copy of the project up and running on your loc
What things you need to install the software and how to install them
- [Python 3.7/3.8](https://www.python.org/downloads/)
-- OPTIONAL: If you want to train the TransformerModel on a Nvidia GPU (much faster!): [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads)
+- OPTIONAL: Some models can be trained on a Nvidia GPU (much faster!): [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads)
- OPTIONAL: If you want to deploy it as a Docker container: [Docker](https://www.docker.com/) runtime environment
### Installing
@@ -39,14 +42,17 @@ Run on Linux: `$ source venv/bin/activate`
3. Install all the required dependecies using Python packet manager `pip`.
`$ pip install -r requirements.txt`
-4. Install the spaCy german language model.
+4. The Transformers package has to be installed from source.
+`pip install git+https://github.com/huggingface/transformers.git`
+
+5. Install the spaCy german language model.
`$ python -m spacy download de`
-5. Navigate to the `src` directory and start the web server by running `main.py`.
+6. Navigate to the `src` directory and start the web server by running `main.py`.
`$ cd src`
`$ python main.py`
-6. OPTIONAL: Deactive the virtual environment:
+7. OPTIONAL: Deactivate the virtual environment:
Run on Windows: `$ venv\Scripts\deactivate.bat`
Run on Linux: `$ source venv/bin/deactivate`
@@ -75,6 +81,7 @@ You can deploy the backend of tenderclass by using a Docker container.
## API Endpoints
Documentation for the API Endpoints is available in Swagger UI. After starting the web server, enter the following web site into your browser:
+
[API Documenation](http://localhost:5000/swagger)
## Architecture
@@ -83,34 +90,54 @@ The back end incorporates the business logic and Machine Learning services inclu
- Trainer: This module trains the model by fetching two tender sets (positive tenders and negative tenders), labeling them and then feeding them to the model. It also allows to reset the model.
- Fetcher: This class is responsible for downloading tenders from the internet. Although it only delegates the request to the TedFetcher, there is the possibility that tenderclass could also address other public procurement data sources next to TED. This component would decide which data source should be used.
- TedFetcher: Given a number and a search query, this component automatically downloads the raw XML tender data using the TedDownloader. Afterwards, it delegates the XML document to TedExtractor, which builds up the Tender entity by extracting the relevant fields from the XML document.
+

### Data model
-The prototype tenderclass implements two types of entities. A Tender represents one single public procurement notice. It holds the unique identifier of type string, which is assigned externally by TED, the hyperlink to the official TED website document of type string as well as the
-list of CPV codes of type list of strings. Moreover, each tender consists of an array of at least one LanguageEntity. This entity holds the language-specific information such as title of type string and description of type string. Although requirement analysis only dictates to support German public procurement notices, the data model already supports multiple languages in case of extending the prototype with additional features such as multi-language or translation support. The following figure shows the corresponding class diagram of the data model.
+The prototype tenderclass implements two types of entities. A Tender represents one single public procurement notice. It holds the unique identifier of type string, which is assigned externally by TED, the list of CPV codes of type list of strings and the language of the original submission of the procurement notice.
+Moreover, each tender consists of an array of at least one LanguageEntity which is the language entity in the language of the original publication.
+This entity holds the language-specific information such as title of type string and description of type string and additionally a link to the procurement on the TED website.
+Although requirement analysis only dictates to support German public procurement notices, the data model already supports multiple languages in case of extending the prototype with additional features such as multi-language or translation support. The following figure shows the corresponding class diagram of the data model.
+

## Endpoints and Program Flow
### Get recommendations
This endpoint downloads all tenders published on the current date, classifies them and then only returns the positive tenders as recommendation. The following figure depicts the communication flow. After receiving the request, firstly the Flask web server delegates it to the Recommender module. This component uses the Fetcher module for downloading and parsing the tenders. The Fetch Model section displays its sequence communication in more detail. Subsequently, the Recommender module only returns those tenders the model has classified to be interesting.
+

### Create new model
This endpoint creates a new model and trains it with two distinct sets of tenders. For this purpose, the JSON body requires four different properties. The pos_number attribute indicates how many positive tenders the application should download from TED and feed to the model. Thereby, the pos_search_criteria specifies the constraints each positive tender must fulfill. In this case, at least one CPV code must start with 72. Analogously, the same procedure applies for the negative tenders. The following figure illustrates the communication flow. After receiving the request from the Flask web server, the Trainer component firstly creates a new model. Secondly, it fetches both sets of positive and negative tenders respectively. With the tenders wrapped to tuples together with their corresponding labels (1 for positive, 0 for negative), the Trainer module randomly shuffles the tuples. The reason is that otherwise, the model would firstly be trained with the series of positive tenders and afterwards with the series of negative tenders. To counteract this imbalance, the following train call receives the shuffled list of labeled tenders.
+

### Train from web
This endpoint updates the model with additional labeled training data. As the user should have the possibility to either confirm or reject the recommendations, this endpoint fits the model with feedback data. For that reason, the Flask web server accepts a JSON body with two properties. The ids property is a JSON list of tender identifiers which tenderclass automatically downloads. The labels property is an integer list which gives the corresponding labels for these tenders. The i-th label belongs to the i-th id. This is why both lists must be of the same length. Similar to the get recommendations endpoint in the previous section, Flask delegates the request to another component, but this time to the Trainer. The following figure outlines the communication flow. As this module only knows the ids, but not the actual tender data, it first of all needs to download the entire tender metadata. This is why it builds up a search criteria query in the way that the tender id must equal to at least one id in the list. After passing this search criteria to the Fetcher, it receives all tender entities that have been found. As a second step, the Trainer module maps the downloaded tenders to the given labels before wrapping them to tuples. By passing them to the train method, the Model component feeds those labeled tenders to its internal classification model. Finally, the flask web server returns with ok.
+

### Fetch models
Although there is no designated endpoint for fetching specific, query-based tenders, each core function requires downloading, parsing and extracting tenders which the Fetcher module is responsible for. This is why this subsection explains in detail the communication flow of fetching a tender, as seen in the following figure. As soon as the Fetcher module gets a request, it immediately delegates the request to the TedFetcher. In defiance of this extra delegation, this pattern allows developers to add additional data sources such as national public procurement platforms. As TED API supports pagination with up to 100 tender documents per API call, the TedFetcher needs to enter a loop. In each iteration, it calls get_xml_contracts with i as the page number. Subsequently, the triggered TedDownloader issues a REST call to the TED API as described in the Fetch Tender section. Once it has parsed the response and returned the list of XML documents of the i-th page, the TedFetcher module calls the extract method from the TedExtractor. This second step instantiates and initializes a new Tender entity by extracting CPV code, id, title and description out of the XML document. As soon as the component either reaches the requested number c of tenders or exceeds the maximum number of pages (which implies that fewer tenders than intended are returned), the module returns the list of Tender entities.
+

+## Logging
+The FullTextTransformer model has support for wandb to log statistics of a training run of the model. The logs can be accessed on the [Weights & Biases](https://wandb.ai/home) (wandb) portal.
+
+To write logs to the platform, the user has to be logged in. Provided that the corresponding `wandb` package has been installed via pip, which is the case
+if all requirements have been installed, the user can log in via the following command:
+
+`wandb login`
+
+Now the logged metrics of runs of the FullTextTransformerModel can be viewed on the wandb account associated with the used
+credentials on [https://www.wandb.com/](https://www.wandb.com/).
+
## Authors
* **Nicolas Griebenow** - *Initial work* - [ngriebenow](https://github.com/ngriebenow)
+* **Lukas Arnhold** - *Further development of classification models* - [larnhold](https://github.com/larnhold)
diff --git a/doc/datamodel.png b/doc/datamodel.png
index d0e93da..32361ee 100644
Binary files a/doc/datamodel.png and b/doc/datamodel.png differ
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..b2b6137
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,9 @@
+version: "3"
+
+services:
+ backend:
+ build: .
+ ports:
+ - "5000:5000"
+ volumes:
+ - "./data:/src/data:rw"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index ae393a3..74e6f66 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,21 @@
-scikit-learn
-joblib
+pytorch-lightning~=1.1.4
+scikit-learn~=0.23.1
+joblib~=0.16.0
simpletransformers
-flask
-beautifulsoup4
+flask~=1.1.2
+beautifulsoup4~=4.9.1
flask_swagger_ui
-spaCy
+spaCy~=2.3.2
lxml
flask_cors
+requests~=2.24.0
+pandas~=1.0.4
+marshmallow~=3.7.1
+nltk~=3.5
+sklearn~=0.0
+torch~=1.7.1
+torchvision
+wandb
+fasttext
+matplotlib~=3.3.2
+memory_profiler
\ No newline at end of file
diff --git a/src/Models/FromDatasetsModelModel.py b/src/Models/FromDatasetsModelModel.py
new file mode 100644
index 0000000..4e92aeb
--- /dev/null
+++ b/src/Models/FromDatasetsModelModel.py
@@ -0,0 +1,4 @@
+class FromDatasetsModel:
+ def __init__(self, pos_filename, neg_filename):
+ self.pos_filename = pos_filename
+ self.neg_filename = neg_filename
\ No newline at end of file
diff --git a/src/Models/ModelNameModel.py b/src/Models/ModelNameModel.py
new file mode 100644
index 0000000..4d8293e
--- /dev/null
+++ b/src/Models/ModelNameModel.py
@@ -0,0 +1,3 @@
+class ModelNameModel:
+ def __init__(self, name):
+ self.name = name
\ No newline at end of file
diff --git a/src/Models/NewModelModel.py b/src/Models/NewModelModel.py
new file mode 100644
index 0000000..61f2f70
--- /dev/null
+++ b/src/Models/NewModelModel.py
@@ -0,0 +1,6 @@
+class NewModelModel:
+ def __init__(self, pos_number, neg_number, pos_search_criteria, neg_search_criteria):
+ self.pos_number = pos_number
+ self.neg_number = neg_number
+ self.pos_search_criteria = pos_search_criteria
+ self.neg_search_criteria = neg_search_criteria
\ No newline at end of file
diff --git a/src/Models/TedSaveModel.py b/src/Models/TedSaveModel.py
new file mode 100644
index 0000000..36c8f07
--- /dev/null
+++ b/src/Models/TedSaveModel.py
@@ -0,0 +1,7 @@
+class TedSaveModel:
+ def __init__(self, amount, search_criteria, dataset_name, original_languages=None, languages=None):
+ self.amount: int = amount
+ self.search_criteria: str = search_criteria
+ self.original_languages: list[str] = original_languages
+ self.languages: list[str] = languages
+ self.dataset_name: str = dataset_name
diff --git a/src/Models/__init__.py b/src/Models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextFastTextModel/FullTextFastTextModel.py b/src/classifier/FullTextFastTextModel/FullTextFastTextModel.py
new file mode 100644
index 0000000..fed83ed
--- /dev/null
+++ b/src/classifier/FullTextFastTextModel/FullTextFastTextModel.py
@@ -0,0 +1,130 @@
+import csv
+import logging
+import string
+from typing import List
+
+import pandas as pd
+import fasttext.util
+from sklearn import svm
+from spacy.lang.de.stop_words import STOP_WORDS
+
+from spacy.lang.de import German
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+from src.entity.Tender import Tender
+
+punctuations = string.punctuation
+logger = logging.getLogger(__name__)
+
+
+class FullTextFastTextModel(TenderClassClassifier):
+
+ def __init__(self):
+ self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
+ self.parser = German()
+ self.stopwords = list(STOP_WORDS)
+ self.stopwords.extend(self.domain_stopwords)
+ self.fast_text_model = None
+ self.svm_average_model = None
+ self.create_new_model()
+
+ def predict(self, df):
+ val_svm = pd.DataFrame({
+ "title_pos_prob": [self.extract_pos_probability(self.fast_text_model.predict(x)) for x in
+ df["title"].values.tolist()],
+ "desc_pos_prob": [self.extract_pos_probability(self.fast_text_model.predict(x)) for x in
+ df["description"].values.tolist()]
+ })
+
+ return self.svm_average_model.predict(val_svm[["title_pos_prob", "desc_pos_prob"]])
+
+ def classify(self, tenders: List[Tender]):
+ pass
+
+ def spacy_tokenizer(self, sentence):
+ sentence_tokens = self.parser(sentence)
+ sentence_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in
+ sentence_tokens]
+ sentence_tokens = [word for word in sentence_tokens if word not in self.stopwords and word not in punctuations]
+ return sentence_tokens
+
+ def prepare_data(self, labelled_tenders):
+ labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)
+
+ complete_df = pd.DataFrame({"title": labelled_tenders_collection.get_titles("DE"),
+ "description": labelled_tenders_collection.get_descriptions("DE"),
+ "label": labelled_tenders_collection.get_labels()})
+ complete_df = complete_df.dropna()
+
+ # tokenize
+ logger.info("Tokenize")
+ complete_df.iloc[:, 0] = complete_df.iloc[:, 0].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+ complete_df.iloc[:, 1] = complete_df.iloc[:, 1].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+
+ # prefix label to FastText format
+ logger.info("Prefixing")
+ complete_df.iloc[:, 2] = complete_df.iloc[:, 2].apply(lambda x: "__label__" + str(x))
+
+ return complete_df
+
+ def save_dataset(self, dataset, name):
+ dataset.to_csv(name,
+ index=False,
+ sep=" ",
+ header=None,
+ quoting=csv.QUOTE_NONE,
+ quotechar="",
+ escapechar=" ")
+
+ def train(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+
+ title_df = complete_df[["title", "label"]]
+ title_df.columns = ['value', 'label']
+
+ desc_df = complete_df[["description", "label"]]
+ desc_df.columns = ['value', 'label']
+
+ fasttext_training_df = pd.concat([title_df, desc_df])
+
+ self.save_dataset(fasttext_training_df, "fasttext_train.csv")
+
+ logger.info("Training model")
+ self.fast_text_model = fasttext.train_supervised("fasttext_train.csv", wordNgrams=2)
+
+ # train the linear classifier
+ svm_train = pd.DataFrame({
+ "title_pos_prob": [self.extract_pos_probability(self.fast_text_model.predict(x)) for x in
+ complete_df["title"].values.tolist()],
+ "desc_pos_prob": [self.extract_pos_probability(self.fast_text_model.predict(x)) for x in
+ complete_df["description"].values.tolist()],
+ "label": [0 if x == "__label__0" else 1 for x in complete_df["label"].values.tolist()]
+ })
+
+ X, y = (svm_train[["title_pos_prob", "desc_pos_prob"]], svm_train["label"])
+
+ self.svm_average_model = svm.SVC(kernel='linear')
+ self.svm_average_model.fit(X, y)
+
+ def validate(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+ y_pred = self.predict(complete_df)
+ y_labels = [0 if x == "__label__0" else 1 for x in complete_df["label"].values.tolist()]
+
+ return ValidationResult(y_labels, y_pred)
+
+ def extract_pos_probability(self, fast_text_prediction):
+ (label, probability) = fast_text_prediction
+ label = 0 if label[0] == "__label__0" else 1
+ probability = probability[0]
+
+ return probability if label == 1 else (1 - probability)
+
+ def save_model(self):
+ pass
+
+ def create_new_model(self):
+ self.fast_text_model = None
+ self.svm_average_model = None
diff --git a/src/classifier/FullTextFastTextModel/__init__.py b/src/classifier/FullTextFastTextModel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelDescOnly.py b/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelDescOnly.py
new file mode 100644
index 0000000..dd2e73f
--- /dev/null
+++ b/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelDescOnly.py
@@ -0,0 +1,100 @@
+import csv
+import logging
+import string
+from typing import List
+
+import pandas as pd
+import fasttext.util
+from spacy.lang.de.stop_words import STOP_WORDS
+
+from spacy.lang.de import German
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+from src.entity.Tender import Tender
+
+punctuations = string.punctuation
+logger = logging.getLogger(__name__)
+
+
+class FullTextFastTextModelDescOnly(TenderClassClassifier):
+
+ def __init__(self):
+ self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
+ self.parser = German()
+ self.stopwords = list(STOP_WORDS)
+ self.stopwords.extend(self.domain_stopwords)
+ self.fast_text_model = None
+ self.svm_average_model = None
+ self.create_new_model()
+
+ def predict(self, df):
+ return [self.extract_label(self.fast_text_model.predict(x)) for x in df["description"].values.tolist()]
+
+ def classify(self, tenders: List[Tender]):
+ pass
+
+ def spacy_tokenizer(self, sentence):
+ sentence_tokens = self.parser(sentence)
+ sentence_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in
+ sentence_tokens]
+ sentence_tokens = [word for word in sentence_tokens if word not in self.stopwords and word not in punctuations]
+ return sentence_tokens
+
+ def prepare_data(self, labelled_tenders):
+ labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)
+
+ complete_df = pd.DataFrame({"title": labelled_tenders_collection.get_titles("DE"),
+ "description": labelled_tenders_collection.get_descriptions("DE"),
+ "label": labelled_tenders_collection.get_labels()})
+ complete_df = complete_df.dropna()
+
+ # tokenize
+ logger.info("Tokenize")
+ complete_df.iloc[:, 0] = complete_df.iloc[:, 0].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+ complete_df.iloc[:, 1] = complete_df.iloc[:, 1].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+
+ # prefix label to FastText format
+ logger.info("Prefixing")
+ complete_df.iloc[:, 2] = complete_df.iloc[:, 2].apply(lambda x: "__label__" + str(x))
+
+ return complete_df
+
+ def save_dataset(self, dataset, name):
+ dataset.to_csv(name,
+ index=False,
+ sep=" ",
+ header=None,
+ quoting=csv.QUOTE_NONE,
+ quotechar="",
+ escapechar=" ")
+
+ def train(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+
+ fasttext_training_df = complete_df[["description", "label"]]
+ fasttext_training_df.columns = ['value', 'label']
+
+ self.save_dataset(fasttext_training_df, "fasttext_train.csv")
+
+ self.fast_text_model = fasttext.train_supervised("fasttext_train.csv", wordNgrams=2)
+
+ def validate(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+ y_pred = self.predict(complete_df)
+ y_labels = [0 if x == "__label__0" else 1 for x in complete_df["label"].values.tolist()]
+
+ return ValidationResult(y_labels, y_pred)
+
+ def extract_label(self, fast_text_prediction):
+ (label, probability) = fast_text_prediction
+ label = 0 if label[0] == "__label__0" else 1
+ return label
+
+ def save_model(self):
+ pass
+
+ def create_new_model(self):
+ self.fast_text_model = None
+ self.svm_average_model = None
diff --git a/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelTitleOnlyl.py b/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelTitleOnlyl.py
new file mode 100644
index 0000000..74be106
--- /dev/null
+++ b/src/classifier/FullTextFastTextModel/validation/FullTextFastTextModelTitleOnlyl.py
@@ -0,0 +1,100 @@
+import csv
+import logging
+import string
+from typing import List
+
+import pandas as pd
+import fasttext.util
+from spacy.lang.de.stop_words import STOP_WORDS
+
+from spacy.lang.de import German
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+from src.entity.Tender import Tender
+
+punctuations = string.punctuation
+logger = logging.getLogger(__name__)
+
+
+class FullTextFastTextModelTitleOnly(TenderClassClassifier):
+
+ def __init__(self):
+ self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
+ self.parser = German()
+ self.stopwords = list(STOP_WORDS)
+ self.stopwords.extend(self.domain_stopwords)
+ self.fast_text_model = None
+ self.svm_average_model = None
+ self.create_new_model()
+
+ def predict(self, df):
+ return [self.extract_label(self.fast_text_model.predict(x)) for x in df["title"].values.tolist()]
+
+ def classify(self, tenders: List[Tender]):
+ pass
+
+ def spacy_tokenizer(self, sentence):
+ sentence_tokens = self.parser(sentence)
+ sentence_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in
+ sentence_tokens]
+ sentence_tokens = [word for word in sentence_tokens if word not in self.stopwords and word not in punctuations]
+ return sentence_tokens
+
+ def prepare_data(self, labelled_tenders):
+ labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)
+
+ complete_df = pd.DataFrame({"title": labelled_tenders_collection.get_titles("DE"),
+ "description": labelled_tenders_collection.get_descriptions("DE"),
+ "label": labelled_tenders_collection.get_labels()})
+ complete_df = complete_df.dropna()
+
+ # tokenize
+ logger.info("Tokenize")
+ complete_df.iloc[:, 0] = complete_df.iloc[:, 0].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+ complete_df.iloc[:, 1] = complete_df.iloc[:, 1].apply(lambda x: " ".join(self.spacy_tokenizer(x)))
+
+ # prefix label to FastText format
+ logger.info("Prefixing")
+ complete_df.iloc[:, 2] = complete_df.iloc[:, 2].apply(lambda x: "__label__" + str(x))
+
+ return complete_df
+
+ def save_dataset(self, dataset, name):
+ dataset.to_csv(name,
+ index=False,
+ sep=" ",
+ header=None,
+ quoting=csv.QUOTE_NONE,
+ quotechar="",
+ escapechar=" ")
+
+ def train(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+
+ fasttext_training_df = complete_df[["title", "label"]]
+ fasttext_training_df.columns = ['value', 'label']
+
+ self.save_dataset(fasttext_training_df, "fasttext_train.csv")
+
+ self.fast_text_model = fasttext.train_supervised("fasttext_train.csv", wordNgrams=2)
+
+ def validate(self, labelled_tenders):
+ complete_df = self.prepare_data(labelled_tenders)
+ y_pred = self.predict(complete_df)
+ y_labels = [0 if x == "__label__0" else 1 for x in complete_df["label"].values.tolist()]
+
+ return ValidationResult(y_labels, y_pred)
+
+ def extract_label(self, fast_text_prediction):
+ (label, probability) = fast_text_prediction
+ label = 0 if label[0] == "__label__0" else 1
+ return label
+
+ def save_model(self):
+ pass
+
+ def create_new_model(self):
+ self.fast_text_model = None
+ self.svm_average_model = None
diff --git a/src/classifier/FullTextSvmModel/FullTextSvmModel.py b/src/classifier/FullTextSvmModel/FullTextSvmModel.py
new file mode 100644
index 0000000..e3e584d
--- /dev/null
+++ b/src/classifier/FullTextSvmModel/FullTextSvmModel.py
@@ -0,0 +1,126 @@
+import logging
+from typing import List
+
+import joblib
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.svm import SVC
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from spacy.lang.de import German
+from spacy.lang.de.stop_words import STOP_WORDS
+import string
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.Tender import Tender
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+
+punctuations = string.punctuation
+
+logger = logging.getLogger(__name__)
+
+
+class FullTextSvmModel(TenderClassClassifier):
+
+ def __init__(self):
+ self.stopwords = list(STOP_WORDS)
+ self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
+ self.stopwords.extend(self.domain_stopwords)
+ self.parser = German()
+ self.punctuations = string.punctuation
+ self.domain_stopwords = ["contract", "system", "service", "tender", "company", "notice", "procurement",
+ "work", "include", "support", "approximately", "management", "agreement",
+ "office", "solution", "manage", "product", "design", "program", "project",
+ "supply", "trust", "equipment"]
+
+ self.stopwords = list(STOP_WORDS)
+ self.stopwords.extend(self.domain_stopwords)
+ self.model = None
+ self.create_new_model()
+
+ def tokenize(self, sentence):
+ sentence_tokens = self.parser(sentence)
+ sentence_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in
+ sentence_tokens]
+ sentence_tokens = [word for word in sentence_tokens if word not in self.stopwords and word not in punctuations]
+ return sentence_tokens
+
+ def classify(self, tenders: List[Tender]):
+
+        if len(tenders) == 0:
+ return tenders
+
+ titles = list(map(lambda x: x.get_original_language_entity().title, tenders))
+ descriptions = list(map(lambda x: x.get_original_language_entity().description, tenders))
+
+ df = pd.DataFrame({"titles": titles, "descriptions": descriptions})
+ df = df.dropna()
+
+ predictions = self.model.predict(df)
+ tuples = zip(tenders, predictions)
+ selected_tenders = [t for t, p in tuples if p == 1]
+ return selected_tenders
+
+ def prepare_data(self, labelled_tenders):
+ labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)
+
+ # create the pandas df
+ training_df = pd.DataFrame({"titles": labelled_tenders_collection.get_titles("DE"),
+ "descriptions": labelled_tenders_collection.get_descriptions("DE"),
+ "label": labelled_tenders_collection.get_labels()})
+
+ # remove null values (description is not alway set)
+ training_df = training_df
+ training_df.loc[training_df["descriptions"].isnull(), 'descriptions'] = training_df["titles"]
+ X = training_df[['titles', 'descriptions']]
+ ylabels = training_df['label']
+
+ return X, ylabels
+
+ def train(self, labelled_tenders):
+ X, ylabels = self.prepare_data(labelled_tenders)
+ self.model.fit(X, ylabels)
+
+ def validate(self, labelled_tenders):
+ X, ylabels = self.prepare_data(labelled_tenders)
+ y_pred = self.model.predict(X)
+ return ValidationResult(ylabels, y_pred)
+
+ def load(self, name):
+ self.model = joblib.load("./data/" + name)
+
+ def save(self, name):
+ joblib.dump(self.model, "./data/" + name)
+
+ class Extractor(BaseEstimator, TransformerMixin):
+ def __init__(self, column):
+ self.column = column
+ pass
+
+ def transform(self, df, y=None):
+ return df[self.column]
+
+ def fit(self, df, y=None):
+ return self
+
+ def create_new_model(self):
+ pipeline = Pipeline([
+ ('union', FeatureUnion(
+ transformer_list=[
+ ('titles', Pipeline([
+ ('selector', self.Extractor(column="titles")),
+ ('vect', CountVectorizer(max_features=1000, tokenizer=self.tokenize, ngram_range=(1, 2))),
+ ('tfidf', TfidfTransformer())
+ ])),
+ ('descriptions', Pipeline([
+ ('selector', self.Extractor(column="descriptions")),
+ ('vect', CountVectorizer(max_features=1000, tokenizer=self.tokenize, ngram_range=(1, 2))),
+ ('tfidf', TfidfTransformer())
+ ])),
+ ],
+ )),
+ ('svc', SVC(kernel="linear", random_state=0)),
+ ])
+
+ self.model = pipeline
diff --git a/src/classifier/FullTextSvmModel/__init__.py b/src/classifier/FullTextSvmModel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextSvmModel/validation/FullTextSvmModelDescOnly.py b/src/classifier/FullTextSvmModel/validation/FullTextSvmModelDescOnly.py
new file mode 100644
index 0000000..4faebf8
--- /dev/null
+++ b/src/classifier/FullTextSvmModel/validation/FullTextSvmModelDescOnly.py
@@ -0,0 +1,103 @@
+import logging
+
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.svm import SVC
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from spacy.lang.de import German
+from spacy.lang.de.stop_words import STOP_WORDS
+import string
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+
+punctuations = string.punctuation
+
+logger = logging.getLogger(__name__)
+
+
class FullTextSvmModelDescOnly(TenderClassClassifier):
    """Validation variant of the SVM model that uses only tender descriptions.

    The feature union contains a single TF-IDF branch over the "descriptions"
    column, to measure how much descriptions alone contribute.
    """

    def __init__(self):
        self.parser = German()
        self.punctuations = string.punctuation
        # Procurement-domain terms that carry no discriminative signal.
        # NOTE(review): the original also built a German list
        # ("Ausschreibung", "Bekanntmachung") that was immediately overwritten;
        # the effective stop-word set (STOP_WORDS + English domain terms) is
        # preserved here.
        self.domain_stopwords = ["contract", "system", "service", "tender", "company", "notice", "procurement",
                                 "work", "include", "support", "approximately", "management", "agreement",
                                 "office", "solution", "manage", "product", "design", "program", "project",
                                 "supply", "trust", "equipment"]
        self.stopwords = list(STOP_WORDS)
        self.stopwords.extend(self.domain_stopwords)
        self.create_new_model()

    def tokenize(self, sentence):
        """Lemmatise, lower-case and strip stop words/punctuation from a sentence."""
        tokens = self.parser(sentence)
        tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
        return [word for word in tokens if word not in self.stopwords and word not in punctuations]

    def classify(self, tenders):
        """Return the subset of tenders predicted as relevant (label == 1).

        Fix: the fitted pipeline selects the "descriptions" column of a
        DataFrame, so passing a plain list of titles (as before) would crash
        inside the column extractor. Missing descriptions fall back to the
        title, mirroring prepare_data().
        """
        titles = [t.get_title("DE") for t in tenders]
        # assumes Tender exposes get_description() as used by FullTextTransformerModel -- TODO confirm
        descriptions = [t.get_description("DE") for t in tenders]
        df = pd.DataFrame({"titles": titles, "descriptions": descriptions})
        df.loc[df["descriptions"].isnull(), "descriptions"] = df["titles"]
        predictions = self.model.predict(df)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def prepare_data(self, labelled_tenders):
        """Build (X, y): a titles/descriptions DataFrame and the label series."""
        labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)

        training_df = pd.DataFrame({"titles": labelled_tenders_collection.get_titles("DE"),
                                    "descriptions": labelled_tenders_collection.get_descriptions("DE"),
                                    "label": labelled_tenders_collection.get_labels()})

        # descriptions are not always set; fall back to the title
        training_df.loc[training_df["descriptions"].isnull(), 'descriptions'] = training_df["titles"]
        X = training_df[['titles', 'descriptions']]
        ylabels = training_df['label']

        return X, ylabels

    def train(self, labelled_tenders):
        """Fit the SVM pipeline on the labelled tenders."""
        X, ylabels = self.prepare_data(labelled_tenders)
        self.model.fit(X, ylabels)

    def validate(self, labelled_tenders):
        """Predict on the labelled tenders and wrap the outcome."""
        X, ylabels = self.prepare_data(labelled_tenders)
        y_pred = self.model.predict(X)
        return ValidationResult(ylabels, y_pred)

    def load(self, name):
        """Restore a persisted pipeline (required by the TenderClassClassifier ABC;
        previously missing, which made this class un-instantiable)."""
        import joblib  # local import: joblib is not in this module's imports
        self.model = joblib.load("./data/" + name)

    def save(self, name):
        """Persist the pipeline (required by the TenderClassClassifier ABC)."""
        import joblib  # local import: joblib is not in this module's imports
        joblib.dump(self.model, "./data/" + name)

    def create_new_model(self):
        """Assemble the untrained descriptions-only SVM pipeline."""

        class Extractor(BaseEstimator, TransformerMixin):
            """Selects a single DataFrame column for the downstream vectorizer."""

            def __init__(self, column):
                self.column = column

            def transform(self, df, y=None):
                return df[self.column]

            def fit(self, df, y=None):
                return self

        self.model = Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('descriptions', Pipeline([
                        ('selector', Extractor(column="descriptions")),
                        ('vect', CountVectorizer(max_features=1000, tokenizer=self.tokenize, ngram_range=(1, 2))),
                        ('tfidf', TfidfTransformer())
                    ])),
                ],
            )),
            ('svc', SVC(kernel="linear", random_state=0)),
        ])
diff --git a/src/classifier/FullTextSvmModel/validation/FullTextSvmModelTitleOnly.py b/src/classifier/FullTextSvmModel/validation/FullTextSvmModelTitleOnly.py
new file mode 100644
index 0000000..2795a59
--- /dev/null
+++ b/src/classifier/FullTextSvmModel/validation/FullTextSvmModelTitleOnly.py
@@ -0,0 +1,103 @@
+import logging
+
+import pandas as pd
+from sklearn.base import TransformerMixin, BaseEstimator
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.svm import SVC
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from spacy.lang.de import German
+from spacy.lang.de.stop_words import STOP_WORDS
+import string
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+
+punctuations = string.punctuation
+
+logger = logging.getLogger(__name__)
+
+
class FullTextSvmModelTitleOnly(TenderClassClassifier):
    """Validation variant of the SVM model that uses only tender titles.

    The feature union contains a single TF-IDF branch over the "titles"
    column, to measure how much titles alone contribute.
    """

    def __init__(self):
        self.parser = German()
        self.punctuations = string.punctuation
        # Procurement-domain terms that carry no discriminative signal.
        # NOTE(review): the original also built a German list
        # ("Ausschreibung", "Bekanntmachung") that was immediately overwritten;
        # the effective stop-word set (STOP_WORDS + English domain terms) is
        # preserved here.
        self.domain_stopwords = ["contract", "system", "service", "tender", "company", "notice", "procurement",
                                 "work", "include", "support", "approximately", "management", "agreement",
                                 "office", "solution", "manage", "product", "design", "program", "project",
                                 "supply", "trust", "equipment"]
        self.stopwords = list(STOP_WORDS)
        self.stopwords.extend(self.domain_stopwords)
        self.create_new_model()

    def tokenize(self, sentence):
        """Lemmatise, lower-case and strip stop words/punctuation from a sentence."""
        tokens = self.parser(sentence)
        tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
        return [word for word in tokens if word not in self.stopwords and word not in punctuations]

    def classify(self, tenders):
        """Return the subset of tenders predicted as relevant (label == 1).

        Fix: the fitted pipeline selects the "titles" column of a DataFrame,
        so passing a plain list (as before) would crash inside the column
        extractor.
        """
        titles = [t.get_title("DE") for t in tenders]
        df = pd.DataFrame({"titles": titles})
        predictions = self.model.predict(df)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def prepare_data(self, labelled_tenders):
        """Build (X, y): a titles/descriptions DataFrame and the label series."""
        labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)

        training_df = pd.DataFrame({"titles": labelled_tenders_collection.get_titles("DE"),
                                    "descriptions": labelled_tenders_collection.get_descriptions("DE"),
                                    "label": labelled_tenders_collection.get_labels()})

        # descriptions are not always set; fall back to the title
        training_df.loc[training_df["descriptions"].isnull(), 'descriptions'] = training_df["titles"]
        X = training_df[['titles', 'descriptions']]
        ylabels = training_df['label']

        return X, ylabels

    def train(self, labelled_tenders):
        """Fit the SVM pipeline on the labelled tenders."""
        X, ylabels = self.prepare_data(labelled_tenders)
        self.model.fit(X, ylabels)

    def validate(self, labelled_tenders):
        """Predict on the labelled tenders and wrap the outcome."""
        X, ylabels = self.prepare_data(labelled_tenders)
        y_pred = self.model.predict(X)
        return ValidationResult(ylabels, y_pred)

    def load(self, name):
        """Restore a persisted pipeline (required by the TenderClassClassifier ABC;
        previously missing, which made this class un-instantiable)."""
        import joblib  # local import: joblib is not in this module's imports
        self.model = joblib.load("./data/" + name)

    def save(self, name):
        """Persist the pipeline (required by the TenderClassClassifier ABC)."""
        import joblib  # local import: joblib is not in this module's imports
        joblib.dump(self.model, "./data/" + name)

    def create_new_model(self):
        """Assemble the untrained titles-only SVM pipeline."""

        class Extractor(BaseEstimator, TransformerMixin):
            """Selects a single DataFrame column for the downstream vectorizer."""

            def __init__(self, column):
                self.column = column

            def transform(self, df, y=None):
                return df[self.column]

            def fit(self, df, y=None):
                return self

        self.model = Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('titles', Pipeline([
                        ('selector', Extractor(column="titles")),
                        ('vect', CountVectorizer(max_features=1000, tokenizer=self.tokenize, ngram_range=(1, 2))),
                        ('tfidf', TfidfTransformer())
                    ]))
                ],
            )),
            ('svc', SVC(kernel="linear", random_state=0)),
        ])
diff --git a/src/classifier/FullTextSvmModel/validation/__init__.py b/src/classifier/FullTextSvmModel/validation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextTransformerModel/FullTextTransformerModel.py b/src/classifier/FullTextTransformerModel/FullTextTransformerModel.py
new file mode 100644
index 0000000..15b8ce0
--- /dev/null
+++ b/src/classifier/FullTextTransformerModel/FullTextTransformerModel.py
@@ -0,0 +1,110 @@
+import gc
+from typing import List
+
+import pandas as pd
+import torch
+from pytorch_lightning import Trainer
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader
+from pytorch_lightning.loggers import WandbLogger
+
+from src.classifier.FullTextTransformerModel.PytorchTransformer.PyTorchLighningTransformer import \
+ PyTorchTransformerLightning
+from src.classifier.FullTextTransformerModel.config.TransformerModelConfig import \
+ PytorchTransformerConfig
+from src.classifier.FullTextTransformerModel.PytorchTransformer.data_processing.BertDataSet import BertDataSet
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.ValidationResult import ValidationResult
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+from src.entity.Tender import Tender
+
+
class FullTextTransformerModel(TenderClassClassifier):
    """Transformer-based tender classifier over title and description text.

    Model and config are built lazily via create_new_model() (or restored via
    load()); the config class chooses tokenizer/model and hyper-parameters.
    """

    def __init__(self, config_class=PytorchTransformerConfig.bert_german_full):
        # config_class: zero-argument factory returning a PytorchTransformerConfig.
        self.configClass = config_class
        self.config = None
        self.model = None

    def predict(self, df):
        """Predict an int class label per row of a 'title'/'description'/'label' DataFrame.

        Runs the model row-by-row (batch_size=1) and takes the argmax over logits.
        :raises RuntimeError: if neither train(), load() nor create_new_model() was called.
        """
        if self.model is None or self.config is None:
            # Fail fast with a clear message instead of an AttributeError deep
            # inside BertDataSet/DataLoader.
            raise RuntimeError("no model available - call train(), load() or create_new_model() first")

        data = BertDataSet(df, self.config)
        dataloader = DataLoader(data, batch_size=1)
        predictions = []

        for batch_ndx, sample in enumerate(dataloader):
            title_input_ids = sample["title_input_ids"]
            title_attention_masks = sample["title_attention_mask"]
            description_input_ids = sample["description_input_ids"]
            description_attention_masks = sample["description_attention_mask"]
            logits = self.model(title_input_ids, title_attention_masks, description_input_ids,
                                description_attention_masks)
            _, predicted = torch.max(logits, 1)
            predictions.append(predicted.data[0].item())

        return predictions

    def load(self, name):
        """Rebuild the architecture and restore weights from ./data/<name>."""
        self.create_new_model()
        self.model.load_state_dict(torch.load("./data/" + name))

    def save(self, name):
        """Persist the model weights to ./data/<name>."""
        torch.save(self.model.state_dict(), "./data/" + name)

    def classify(self, tenders: List[Tender]):
        """Return the subset of tenders predicted as relevant (label == 1).

        Rows with a missing title or description are dropped before prediction.
        Fix: predictions are zipped against the tenders that SURVIVED dropna();
        the original zipped against the full list, mis-aligning every tender
        after the first dropped row.
        """
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        descriptions = list(map(lambda x: x.get_description("DE"), tenders))
        labels = list(map(lambda x: -1, tenders))  # dummy labels; BertDataSet expects the column

        df = pd.DataFrame({"title": titles, "description": descriptions, "label": labels})
        df = df.dropna()

        predictions = self.predict(df)

        # df was built with a fresh RangeIndex, so the surviving index labels
        # are the original positions in `tenders`.
        kept_tenders = [tenders[i] for i in df.index]
        return [t for t, p in zip(kept_tenders, predictions) if p == 1]

    def prepare_data(self, labelled_tenders):
        """Build the training DataFrame (title, description, label), dropping incomplete rows."""
        labelled_tenders_collection = LabelledTenderCollection(labelled_tenders)

        training_df = pd.DataFrame({"title": labelled_tenders_collection.get_titles(),
                                    "description": labelled_tenders_collection.get_original_language_entity_description(),
                                    "label": labelled_tenders_collection.get_labels()})
        return training_df.dropna()

    def train(self, labelled_tenders):
        """Fine-tune on the labelled tenders, holding out 10% for validation."""
        if self.model is None:
            # Fix: train() previously dereferenced self.config/self.model, which
            # are None until create_new_model()/load() is called explicitly.
            self.create_new_model()

        training_df = self.prepare_data(labelled_tenders)

        train_df, val_df = train_test_split(training_df, test_size=0.1)

        # Create the DataLoader for our training set
        train_data = BertDataSet(train_df, self.config)
        train_dataloader = DataLoader(train_data, batch_size=self.config.batch_size, num_workers=6)

        # Create the DataLoader for our validation set
        val_data = BertDataSet(val_df, self.config)
        val_dataloader = DataLoader(val_data, batch_size=self.config.batch_size, num_workers=6)

        # NOTE(review): batch_size * len(dataloader) approximates per-epoch
        # steps for the LR scheduler -- confirm this matches the intended schedule.
        self.model.set_total_training_steps(self.config.batch_size * len(train_dataloader))
        wandb_logger = WandbLogger()
        trainer = Trainer(gradient_clip_val=1.0, gpus=1, max_epochs=self.config.epochs, logger=wandb_logger)
        trainer.fit(self.model, train_dataloader, val_dataloader)

    def validate(self, labelled_tenders):
        """Predict on the labelled tenders and wrap truth/prediction in a ValidationResult."""
        df = self.prepare_data(labelled_tenders)
        X = df[['title', 'description']]
        ylabels = df['label']
        y_pred = self.predict(X)
        return ValidationResult(ylabels, y_pred)

    def save_model(self):
        # Convenience wrapper: persists under the config's own name (cf. save()).
        torch.save(self.model.state_dict(), "./data/" + self.config.name)

    def create_new_model(self):
        # Drop any previous model/config first so GPU memory is released
        # before the replacement is allocated.
        del self.model
        del self.config
        torch.cuda.empty_cache()
        gc.collect()

        self.config = self.configClass()
        self.model = PyTorchTransformerLightning(self.config)
diff --git a/src/classifier/FullTextTransformerModel/PytorchTransformer/PyTorchLighningTransformer.py b/src/classifier/FullTextTransformerModel/PytorchTransformer/PyTorchLighningTransformer.py
new file mode 100644
index 0000000..ea6f70a
--- /dev/null
+++ b/src/classifier/FullTextTransformerModel/PytorchTransformer/PyTorchLighningTransformer.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn as nn
+from pytorch_lightning import LightningModule
+from transformers import AdamW, get_linear_schedule_with_warmup
+
+from src.classifier.FullTextTransformerModel.config import TransformerModelConfig
+
+
class PyTorchTransformerLightning(LightningModule):
    """LightningModule wrapping a pretrained transformer plus a small classifier head.

    The pretrained encoder embeds title and/or description (per config flags);
    the first-token ([CLS]) vectors are concatenated and fed through a
    feed-forward head producing 2-class logits.
    """

    def __init__(self, config: TransformerModelConfig, total_steps=0):
        """
        :param config: PytorchTransformerConfig carrying the pretrained model and
            flags (use_title/use_desc, freeze_pretrained_layers, num_models).
        :param total_steps: total optimizer steps for the LR scheduler; may be
            set later via set_total_training_steps().
        """
        super(PyTorchTransformerLightning, self).__init__()

        self.config = config

        # Number of input features, hidden size and number of output classes.
        # NOTE(review): D_in=768 assumes the encoder's hidden size is 768 --
        # confirm for every config (e.g. DistilBERT vs BERT variants).
        D_in, H, D_out = 768, 50, 2

        # Instantiate pretrained model
        self.model = config.model

        # One-hidden-layer feed-forward classifier over the concatenated embeddings.
        # NOTE(review): Sigmoid before CrossEntropyLoss is unusual (the loss
        # applies log-softmax internally) -- verify this is intended.
        self.classifier = nn.Sequential(
            nn.Linear(D_in * self.config.num_models, H),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(H, D_out),
            nn.Sigmoid()
        )

        # Freeze the pretrained encoder so only the classifier head trains.
        if config.freeze_pretrained_layers:
            for param in self.model.parameters():
                param.requires_grad = False

        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.total_steps = total_steps

    def empty_model(self, input_ids, attention_mask):
        # Placeholder encoder that produces no output; not referenced in this file.
        return None

    def set_total_training_steps(self, training_steps):
        # Must be called before configure_optimizers() for a meaningful LR schedule.
        self.total_steps = training_steps

    def configure_optimizers(self):
        """Create AdamW plus a linear warm-up/decay schedule over total_steps."""
        optimizer = AdamW(self.parameters(),
                          lr=5e-5,  # Default learning rate
                          eps=1e-8  # Default epsilon value
                          )

        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,  # Default value
                                                    num_training_steps=self.total_steps)

        return [optimizer], [scheduler]

    def forward(self, title_input_ids, title_attention_masks, description_input_ids, description_attention_masks):
        """Return 2-class logits for a batch, encoding title and/or description per config."""

        if self.config.use_title is False:
            # description only
            outputs_descs = self.model(input_ids=description_input_ids, attention_mask=description_attention_masks)
            output = outputs_descs[0][:, 0, :]
        elif self.config.use_desc is False:
            # title only
            outputs_titles = self.model(input_ids=title_input_ids, attention_mask=title_attention_masks)
            output = outputs_titles[0][:, 0, :]
        else:
            # both: concatenate the two first-token embeddings feature-wise
            outputs_titles = self.model(input_ids=title_input_ids, attention_mask=title_attention_masks)
            outputs_descs = self.model(input_ids=description_input_ids, attention_mask=description_attention_masks)
            output = torch.cat((outputs_titles[0][:, 0, :], outputs_descs[0][:, 0, :]), dim=1)

        logits = self.classifier(output)

        return logits

    def training_step(self, batch, batch_idx):
        """One optimization step: forward, loss, per-batch accuracy logging."""
        title_input_ids = batch["title_input_ids"]
        title_attention_masks = batch["title_attention_mask"]
        description_input_ids = batch["description_input_ids"]
        description_attention_masks = batch["description_attention_mask"]
        labels = batch["label"]

        logits = self.forward(title_input_ids, title_attention_masks, description_input_ids,
                              description_attention_masks)

        correct = logits.argmax(dim=1).eq(labels).sum().item()

        total = len(labels)

        loss = self.loss_fn(logits, labels)
        logs = {'train_loss': loss}

        # NOTE(review): these are TRAINING-batch metrics but are logged under
        # "Test ..." labels -- likely a mislabel; confirm before renaming
        # (dashboards may depend on the current keys).
        self.logger.log_metrics({"Test Accuracy": correct / total, "Test Loss": loss})

        return {'loss': loss,
                'logs': logs,
                "correct": correct,
                "total": total
                }

    def validation_step(self, batch, batch_idx):
        """One validation step: forward, loss and accuracy on a held-out batch."""
        title_input_ids = batch["title_input_ids"]
        title_attention_masks = batch["title_attention_mask"]
        description_input_ids = batch["description_input_ids"]
        description_attention_masks = batch["description_attention_mask"]
        labels = batch["label"]

        logits = self.forward(title_input_ids, title_attention_masks, description_input_ids,
                              description_attention_masks)

        loss = self.loss_fn(logits, labels)

        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = torch.tensor((preds == labels).cpu().numpy().mean() * 100)

        self.logger.log_metrics({"Val Accuracy": accuracy, "Val Loss": loss})

        return {'loss': loss, 'acc': accuracy}

    def validation_end(self, outputs):
        """Aggregate per-batch validation metrics at epoch end.

        NOTE(review): newer PyTorch Lightning versions call this hook
        `validation_epoch_end`; with them `validation_end` is never invoked --
        confirm the pinned Lightning version.
        """
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        avg_val_acc = torch.stack([x['acc'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss, 'avg_val_acc': avg_val_acc}

        print(f'Accuracy: {avg_val_acc}')
        self.logger.log_metrics({"Epoch Val Accuracy": avg_val_acc, "Epoch Val Loss": avg_loss})

        return {'avg_val_loss': avg_loss, 'logs': tensorboard_logs}
diff --git a/src/classifier/FullTextTransformerModel/PytorchTransformer/__init__.py b/src/classifier/FullTextTransformerModel/PytorchTransformer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertDataSet.py b/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertDataSet.py
new file mode 100644
index 0000000..dce2538
--- /dev/null
+++ b/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertDataSet.py
@@ -0,0 +1,42 @@
+import torch
+from torch.utils.data import Dataset
+
+from src.classifier.FullTextTransformerModel.config.TransformerModelConfig import \
+ PytorchTransformerConfig
+from src.classifier.FullTextTransformerModel.PytorchTransformer.data_processing.BertPreprocessor import BertPreprocessor
+
+
class BertDataSet(Dataset):
    """Torch Dataset producing BERT-ready samples from a tenders DataFrame.

    Any of the columns "title", "description" and "label" may be absent;
    missing values are substituted with "" / -1 in the emitted samples.
    """

    def __init__(self, df, config: PytorchTransformerConfig):
        self.titles = df["title"].values if "title" in df.columns else None
        self.descriptions = df["description"].values if "description" in df.columns else None
        self.labels = torch.tensor(df["label"].values) if "label" in df.columns else None

        # Kept for parity with the original; samples are built via the static
        # BertPreprocessor.get_sample in __getitem__.
        self.processor = BertPreprocessor
        self.config = config

    def __len__(self):
        # All present columns have equal length; use the first available one.
        if self.titles is not None:
            return len(self.titles)
        if self.descriptions is not None:
            return len(self.descriptions)
        return len(self.labels)

    def __getitem__(self, idx):
        title = self.titles[idx] if self.titles is not None else ""
        desc = self.descriptions[idx] if self.descriptions is not None else ""
        label = self.labels[idx] if self.labels is not None else -1
        return BertPreprocessor.get_sample(self.config, title, desc, label)
diff --git a/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertPreprocessor.py b/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertPreprocessor.py
new file mode 100644
index 0000000..151b4ce
--- /dev/null
+++ b/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/BertPreprocessor.py
@@ -0,0 +1,69 @@
+import string
+
+import torch
+
+from src.classifier.FullTextTransformerModel.config.TransformerModelConfig import \
+ PytorchTransformerConfig
+from src.entity.Tender import Tender
+
+
class BertPreprocessor:
    """Static helpers that turn raw (title, description) text into BERT model inputs."""

    @staticmethod
    def get_sample_from_tender(config: "PytorchTransformerConfig", tender: "Tender"):
        """Build a model sample from a Tender (English title, original-language description)."""
        return BertPreprocessor.get_sample(config, tender.get_title("EN"), tender.original_lang_entity.description)

    @staticmethod
    def get_sample(config: "PytorchTransformerConfig", title: str, desc: str, label=-1):
        """Tokenize title/description according to the config flags.

        :returns: dict with input ids / attention masks for title and
            description (empty lists for disabled parts) plus the label.
        """
        if config.truncate:
            title = BertPreprocessor.truncate(title, config.max_len_title)
            desc = BertPreprocessor.truncate(desc, config.max_len_desc)

        title_ids, title_mask, desc_ids, desc_mask = ([], [], [], [])

        if config.use_title:
            title_ids, title_mask = BertPreprocessor.preprocessing_for_bert(title, config.tokenizer,
                                                                            config.max_len_title)

        if config.use_desc:
            desc_ids, desc_mask = BertPreprocessor.preprocessing_for_bert(desc, config.tokenizer,
                                                                          config.max_len_desc)

        sample = {'title_input_ids': title_ids, 'title_attention_mask': title_mask,
                  'description_input_ids': desc_ids, 'description_attention_mask': desc_mask,
                  "label": label}

        return sample

    @staticmethod
    def preprocessing_for_bert(data, tokenizer, max_len):
        """Encode one text into padded/truncated input-id and attention-mask tensors."""
        # NOTE(review): pad_to_max_length is deprecated in recent transformers
        # releases; padding="max_length" is the modern equivalent -- confirm
        # against the pinned library version before switching.
        encoded_sent = tokenizer.encode_plus(
            truncation=True,
            text=data,  # Preprocess sentence
            add_special_tokens=True,  # Add `[CLS]` and `[SEP]`
            max_length=max_len,  # Max length to truncate/pad
            pad_to_max_length=True,  # Pad sentence to max length
            return_attention_mask=True  # Return attention mask
        )

        # Convert the id/mask lists to tensors
        input_ids = torch.tensor(encoded_sent.get('input_ids'))
        attention_masks = torch.tensor(encoded_sent.get('attention_mask'))

        return input_ids, attention_masks

    @staticmethod
    def truncate(text, max_len):
        """Shorten *text* to at most max_len words, keeping its head and tail.

        Fixes two defects of the original implementation: the head/tail sizes
        were hard-coded (129/383, i.e. tuned only for a 512-token budget and
        ignoring max_len), and the head words were joined WITHOUT spaces,
        corrupting the text.
        """
        words = text.split()

        if len(words) <= max_len:
            return text

        # Keep roughly a quarter from the start and the remainder from the end
        # (generalises the original 129/383 split used for max_len=512).
        head_len = max(1, max_len // 4)
        tail_len = max_len - head_len
        return " ".join(words[:head_len]) + " " + " ".join(words[-tail_len:])
\ No newline at end of file
diff --git a/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/__init__.py b/src/classifier/FullTextTransformerModel/PytorchTransformer/data_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextTransformerModel/__init__.py b/src/classifier/FullTextTransformerModel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/FullTextTransformerModel/config/TransformerModelConfig.py b/src/classifier/FullTextTransformerModel/config/TransformerModelConfig.py
new file mode 100644
index 0000000..5624a07
--- /dev/null
+++ b/src/classifier/FullTextTransformerModel/config/TransformerModelConfig.py
@@ -0,0 +1,71 @@
+from transformers import AutoTokenizer, AutoModel
+
+
class PytorchTransformerConfig:
    """Bundles tokenizer, pretrained model and hyper-parameters for the
    transformer classifier; the classmethods are ready-made presets."""

    def __init__(self, tokenizer, model, max_len_title=512, max_len_desc=512,
                 batch_size=32, epochs=10, truncate=False, use_title=True, use_description=True,
                 freeze_pretrained_layers=True, name="Transformer-Model"):
        """
        :param name: preset identifier; also used as the file name when
            persisting model weights, so it must be unique per preset.
        """
        self.name = name
        self.tokenizer = tokenizer
        self.model = model
        self.max_len_title = max_len_title
        self.max_len_desc = max_len_desc
        self.batch_size = batch_size
        self.epochs = epochs
        self.truncate = truncate
        self.use_title = use_title
        self.use_desc = use_description
        self.freeze_pretrained_layers = freeze_pretrained_layers

        # NOTE(review): these attributes only exist when the corresponding
        # flag is disabled and are never read in this file -- confirm they
        # are still needed by callers.
        if self.use_desc is False:
            self.model_desc = None
            self.tokenizer_desc = None

        if self.use_title is False:
            self.model_title = None
            self.tokenizer_title = None

        # Number of encoder passes per sample (title and/or description).
        self.num_models = 2 if (self.use_title and self.use_desc) else 1

    @classmethod
    def bert_german_full_lite(cls):
        """DistilBERT preset (title + description), all layers trainable."""
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
        model = AutoModel.from_pretrained("distilbert-base-german-cased")
        max_len_title = 70
        max_len_desc = 300
        batch_size = 16
        epochs = 4

        # Fix: previously shared the name "TrainBertLayers" with
        # bert_german_full, so both presets persisted to the same file.
        return PytorchTransformerConfig(tokenizer, model, max_len_title,
                                        max_len_desc, batch_size, epochs, name="TrainBertLayersLite",
                                        freeze_pretrained_layers=False)

    @classmethod
    def bert_german_full(cls):
        """BERT preset (title + description), all layers trainable."""
        tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
        model = AutoModel.from_pretrained("bert-base-german-cased")
        max_len_title = 70
        max_len_desc = 300
        batch_size = 16
        epochs = 4

        return PytorchTransformerConfig(tokenizer, model, max_len_title,
                                        max_len_desc, batch_size, epochs, name="TrainBertLayers",
                                        freeze_pretrained_layers=False)

    @classmethod
    def bert_german_description_only(cls):
        """BERT preset that encodes only the description."""
        tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
        model = AutoModel.from_pretrained("bert-base-german-cased")

        return PytorchTransformerConfig(tokenizer, model, use_title=False, max_len_desc=300, batch_size=16,
                                        epochs=4, name="bert_german_description_only", freeze_pretrained_layers=False)

    @classmethod
    def bert_german_title_only(cls):
        """BERT preset that encodes only the title."""
        tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
        model = AutoModel.from_pretrained("bert-base-german-cased")

        # Fix: copy-paste bug -- this preset was named
        # "bert_german_description_only", colliding with the description-only
        # preset when saving weights.
        return PytorchTransformerConfig(tokenizer, model, use_description=False, max_len_title=70, epochs=4,
                                        batch_size=16, name="bert_german_title_only", freeze_pretrained_layers=False)
\ No newline at end of file
diff --git a/src/classifier/FullTextTransformerModel/config/__init__.py b/src/classifier/FullTextTransformerModel/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/SpacyScikitModel.py b/src/classifier/SpacyScikitModel/SpacyScikitModel.py
similarity index 73%
rename from src/classifier/SpacyScikitModel.py
rename to src/classifier/SpacyScikitModel/SpacyScikitModel.py
index f501c0d..ef71f40 100644
--- a/src/classifier/SpacyScikitModel.py
+++ b/src/classifier/SpacyScikitModel/SpacyScikitModel.py
@@ -1,17 +1,16 @@
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.linear_model import SGDClassifier
-from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import confusion_matrix
import spacy
import string
import logging
-import numpy as np
+
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+from src.entity.LabeledTenderCollection import LabelledTenderCollection
+from src.entity.ValidationResult import ValidationResult
logger = logging.getLogger(__name__)
@@ -20,18 +19,16 @@
punctuations = string.punctuation
-class SpacyScikitModel:
+class SpacyScikitModel(TenderClassClassifier):
def __init__(self):
if LANGUAGE == "DE":
from spacy.lang.de.stop_words import STOP_WORDS
- self.nlp = spacy.load('de_core_news_sm')
self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
from spacy.lang.de import German
self.parser = German()
elif LANGUAGE == "EN":
from spacy.lang.en.stop_words import STOP_WORDS
- self.nlp = spacy.load('en')
self.domain_stopwords = ["contract", "system", "service", "tender", "company", "notice", "procurement",
"work", "include", "support", "approximately", "management", "agreement",
"office", "solution", "manage", "product", "design", "program", "project",
@@ -44,6 +41,7 @@ def __init__(self):
self.stopwords = list(STOP_WORDS)
self.stopwords.extend(self.domain_stopwords)
self.pipe = None
+ self.create_new_model()
def spacy_tokenizer(self, sentence):
sentence_tokens = self.parser(sentence)
@@ -67,26 +65,24 @@ def fit(self, X, y=None, **fit_params):
def get_params(self, deep=True):
return {}
- def __load_model(self):
- if not self.pipe:
- self.pipe = joblib.load(MODEL_NAME)
-
def __convert_to_input(self, tenders):
- titles = list(map(lambda x: x.get_title("DE"), tenders))
+ titles = list(map(lambda x: x.get_title(LANGUAGE), tenders))
return titles
def classify(self, tenders):
- self.__load_model()
-
titles = self.__convert_to_input(tenders)
predictions = self.pipe.predict(titles)
tuples = zip(tenders, predictions)
selected_tenders = [t for t, p in tuples if p == 1]
return selected_tenders
- def train(self, labelled_tenders):
- self.__load_model()
+ def load(self, name):
+ self.pipe = joblib.load("./data/" + name)
+
+ def save(self, name):
+ joblib.dump(self.pipe, "./data/" + name)
+ def prepare_data(self, labelled_tenders):
tenders = [i for i, j in labelled_tenders]
labels = [j for i, j in labelled_tenders]
titles = self.__convert_to_input(tenders)
@@ -95,16 +91,16 @@ def train(self, labelled_tenders):
T = training_df["title"]
y = training_df["label"]
- T_train, T_test, y_train, y_test = train_test_split(T, y, test_size=0.1, random_state=42)
- logger.info("start training")
- self.pipe.fit(T_train, y_train)
- logger.info("start testing")
- y_pred = self.pipe.predict(T_test)
- logger.info(f"accuracy: {self.pipe.score(T_test, y_test)} , tested with {len(T_test)} instances")
- tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
- logger.info(f"tn: {tn} fp: {fp}")
- logger.info(f"fn: {fn} tp:{tp}")
- joblib.dump(self.pipe, MODEL_NAME)
+ return T, y
+
+ def train(self, labelled_tenders):
+ X, ylabels = self.prepare_data(labelled_tenders)
+ self.pipe.fit(X, ylabels)
+
+ def validate(self, labelled_tenders):
+ X, ylabels = self.prepare_data(labelled_tenders)
+ y_pred = self.pipe.predict(X)
+ return ValidationResult(ylabels, y_pred)
def create_new_model(self):
vectorizer = CountVectorizer(tokenizer=self.spacy_tokenizer, ngram_range=(1, 2))
@@ -115,5 +111,3 @@ def create_new_model(self):
self.pipe = Pipeline([("cleaner", predictor),
('vectorizer', vectorizer),
('classifier', classifier)])
-
- joblib.dump(self.pipe, MODEL_NAME)
diff --git a/src/classifier/SpacyScikitModel/__init__.py b/src/classifier/SpacyScikitModel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/TenderClassClassifier.py b/src/classifier/TenderClassClassifier.py
new file mode 100644
index 0000000..a5aec88
--- /dev/null
+++ b/src/classifier/TenderClassClassifier.py
@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+
+
class TenderClassClassifier(ABC):
    """Common interface every tender-classification model implements.

    Concrete models (SVM pipelines, transformer models, ...) plug into the
    recommender/trainer services through this contract.
    """

    @abstractmethod
    def create_new_model(self, *args):
        """Build a fresh, untrained model instance."""

    @abstractmethod
    def train(self, labelled_tenders):
        """Fit the model on an iterable of (tender, label) pairs."""

    @abstractmethod
    def validate(self, labelled_tenders):
        """Evaluate on labelled tenders and return a ValidationResult."""

    @abstractmethod
    def classify(self, tenders):
        """Return the subset of tenders the model deems relevant."""

    @abstractmethod
    def save(self, name):
        """Persist the model under the given name."""

    @abstractmethod
    def load(self, name):
        """Restore a previously persisted model by name."""
diff --git a/src/classifier/TransformerModel.py b/src/classifier/TransformerModel/TransformerModel.py
similarity index 61%
rename from src/classifier/TransformerModel.py
rename to src/classifier/TransformerModel/TransformerModel.py
index 6c3b98b..90e0739 100644
--- a/src/classifier/TransformerModel.py
+++ b/src/classifier/TransformerModel/TransformerModel.py
@@ -1,72 +1,75 @@
-from typing import List
-from sklearn.metrics import confusion_matrix
-import numpy as np
+from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
import logging
+import torch
+import time
+from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
+from src.classifier.TenderClassClassifier import TenderClassClassifier
+
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
args = {
'learning_rate': 1e-4,
- 'overwrite_output_dir': True
+ 'overwrite_output_dir': True,
+ 'fp16': False
}
+cuda_available = torch.cuda.is_available()
+
-class TransformerModel:
+class TransformerModel(TenderClassClassifier):
"""
This class provides the Machine Learning model and classifies tenders based on previous training data.
"""
- def load_model(self):
- if not self.model:
- from simpletransformers.classification import ClassificationModel
- try:
- self.model = ClassificationModel('bert', './outputs/', use_cuda=False, args=args)
- except Exception as ex:
- logger.error(f"could not load model from /outputs due to {str(ex)}, creating new model")
- self.create_new_model()
-
def __init__(self):
self.model = None
+ def load(self, name):
+ self.model = ClassificationModel('bert', './outputs/', use_cuda=cuda_available, args=args)
+
+ def save(self, name):
+ pass
+
def __convert_to_input(self, tenders):
titles = list(map(lambda x: x.get_title("DE"), tenders))
return titles
def classify(self, tenders):
- self.load_model()
-
titles = self.__convert_to_input(tenders)
predictions, raw_output = self.model.predict(titles)
tuples = zip(tenders, predictions)
- selected_tenders = [t for t,p in tuples if p == 1]
+ selected_tenders = [t for t, p in tuples if p == 1]
return selected_tenders
def train(self, labelled_tenders):
- self.load_model()
-
tenders = [i for i, j in labelled_tenders]
tenders = self.__convert_to_input(tenders)
labels = [j for i, j in labelled_tenders]
-
- tenders_train, tenders_test, labels_train, labels_test = train_test_split(tenders, labels, test_size=0.1, random_state=42)
+ tenders_train, tenders_test, labels_train, labels_test = train_test_split(tenders, labels, test_size=0.1,
+ random_state=42)
data_input = pd.DataFrame(zip(tenders_train, labels_train))
+ start = time.time()
self.model.train_model(data_input)
+ end = time.time()
+
+ print(end - start)
labels_pred, raw_output = self.model.predict(tenders_test)
tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
logger.info(f"tn: {tn} fp: {fp}")
logger.info(f"fn: {fn} tp:{tp}")
-
+ logger.info(f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")
def create_new_model(self):
from simpletransformers.classification import ClassificationModel
- self.model = ClassificationModel('bert', 'bert-base-german-cased', use_cuda=False, args=args)
+ self.model = ClassificationModel('bert', 'bert-base-german-cased', use_cuda=cuda_available, args=args)
diff --git a/src/classifier/TransformerModel/__init__.py b/src/classifier/TransformerModel/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier/__init__.py b/src/classifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..f8fe39c
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,20 @@
+from src.classifier.FullTextFastTextModel.FullTextFastTextModel import FullTextFastTextModel
+from src.classifier.FullTextSvmModel.FullTextSvmModel import FullTextSvmModel
+from src.classifier.FullTextTransformerModel.FullTextTransformerModel import FullTextTransformerModel
+from src.classifier.SpacyScikitModel.SpacyScikitModel import SpacyScikitModel
+from src.persistence.Persistence import Persistence
+from src.service.Recommender import Recommender
+from src.service.Trainer import Trainer
+from src.service.fetcher.Fetcher import Fetcher
+
+
+# TODO: select the Machine Learning model
+#tender_model = FullTextTransformerModel()
+
+tender_model = FullTextSvmModel()
+# tender_model = TransformerModel()
+
+tender_recommender = Recommender(tender_model)
+tender_trainer = Trainer(tender_model)
+tender_persistence = Persistence()
+tender_fetcher: Fetcher = Fetcher()
diff --git a/src/dev/dev_data_inspector.py b/src/dev/dev_data_inspector.py
new file mode 100644
index 0000000..46207f0
--- /dev/null
+++ b/src/dev/dev_data_inspector.py
@@ -0,0 +1,66 @@
+import json
+from transformers import AutoTokenizer
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from entity.Tender import Tender
+
+
+class DevDATAInspector:
+ def main(self):
+ tenders = self.load("dev_neg_tenders.json") + self.load("dev_pos_tenders.json")
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
+
+ words = []
+ tokens_per_word = []
+
+ for tender in tenders:
+ langEnt = tender.get_original_language_entity()
+
+ for word in langEnt.title.split():
+ words.append(word)
+
+ for word in langEnt.description.split():
+ words.append(word)
+
+ for word in words:
+ tokens_per_word.append(len(self.tokens_of_sequence(tokenizer, word)))
+
+ print(sum(tokens_per_word) / len(tokens_per_word))
+
+ def tokens_of_sequence(self, tokenizer, data):
+ encoded_sent = tokenizer.encode_plus(
+ text=data,
+ add_special_tokens=False
+ )
+
+ input_ids = encoded_sent.get('input_ids')
+ return input_ids
+
+
+ def load(self, path):
+ with open(path, 'r', encoding='utf8') as json_file:
+ tender_dicts = json.load(json_file)
+ tenders = list(map(lambda x: Tender.from_json_dict(x), tender_dicts))
+ return tenders
+
+ def sequence_length(self):
+ tenders = self.load("dev_neg_tenders.json") + self.load("dev_pos_tenders.json")
+ df = pd.DataFrame()
+ df["title_len"] = [len((x.get_title("DE")).split()) for x in tenders]
+ df["description_len"] = [len((x.get_description("DE")).split()) for x in tenders]
+
+ title_len_fig = plt.figure()
+ title_len_fig.suptitle('')
+ df.boxplot(column=['title_len'], showfliers=False)
+
+ desc_len_fig = plt.figure()
+ desc_len_fig.suptitle('')
+ df.boxplot(column=['description_len'], showfliers=False)
+
+ title_len_fig.savefig("title_len.png", format="png")
+ desc_len_fig.savefig("desc_len.png", format="png")
+
+if __name__ == '__main__':
+ DevDATAInspector().main()
\ No newline at end of file
diff --git a/src/dev/labelled_tenders.pkl b/src/dev/labelled_tenders.pkl
new file mode 100644
index 0000000..85622fd
Binary files /dev/null and b/src/dev/labelled_tenders.pkl differ
diff --git a/src/dev/train_test_splitter.py b/src/dev/train_test_splitter.py
new file mode 100644
index 0000000..71298a8
--- /dev/null
+++ b/src/dev/train_test_splitter.py
@@ -0,0 +1,31 @@
+import json
+from sklearn.model_selection import train_test_split
+
+from src.entity.Tender import Tender
+
+
+class TrainTestSplitter:
+ def main(self):
+ self.split("dev_pos_tenders")
+
+ def split(self, path):
+
+ tenders = self.load(path + ".json")
+ train, test = train_test_split(tenders, test_size=0.1, random_state=42)
+ self.save(train, path + "_train.json")
+ self.save(test, path + "_test.json")
+
+ def load(self, path):
+ with open("../data/" + path, 'r', encoding='utf8') as json_file:
+ tender_dicts = json.load(json_file)
+ tenders = list(map(lambda x: Tender.from_json_dict(x), tender_dicts))
+ return tenders
+
+ def save(self, tenders, path):
+ with open("../data/" + path, 'w', encoding='utf8') as json_file:
+ json.dump(list(map(lambda x: x.get_dict(), tenders)), json_file, ensure_ascii=False)
+ json_file.flush()
+ json_file.close()
+
+if __name__ == '__main__':
+ TrainTestSplitter().main()
\ No newline at end of file
diff --git a/src/dev/validator.py b/src/dev/validator.py
new file mode 100644
index 0000000..3c7921d
--- /dev/null
+++ b/src/dev/validator.py
@@ -0,0 +1,94 @@
+import json
+import os
+import random
+import csv
+import sys
+import time
+import gc
+
+# Use src path so that the python interpreter can access all modules
+import torch
+from memory_profiler import profile
+
+sys.path.append(os.getcwd()[:os.getcwd().index('src')])
+
+from sklearn.model_selection import KFold
+
+from src.classifier.FullTextFastTextModel.FullTextFastTextModel import FullTextFastTextModel
+from src.classifier.FullTextFastTextModel.validation.FullTextFastTextModelDescOnly import FullTextFastTextModelDescOnly
+from src.classifier.FullTextFastTextModel.validation.FullTextFastTextModelTitleOnlyl import FullTextFastTextModelTitleOnly
+from src.classifier.FullTextSvmModel.FullTextSvmModel import FullTextSvmModel
+from src.classifier.FullTextSvmModel.validation.FullTextSvmModelDescOnly import FullTextSvmModelDescOnly
+from src.classifier.FullTextSvmModel.validation.FullTextSvmModelTitleOnly import FullTextSvmModelTitleOnly
+from src.classifier.FullTextTransformerModel.FullTextTransformerModel import FullTextTransformerModel
+from src.classifier.FullTextTransformerModel.config.TransformerModelConfig import PytorchTransformerConfig
+from src.entity.Tender import Tender
+
+models = {
+ #"FullTextTransformerModel": FullTextTransformerModel(PytorchTransformerConfig.bert_german_full),
+ #"FullTextTransformerModelTitleOnly": FullTextTransformerModel(PytorchTransformerConfig.bert_german_title_only),
+ #"FullTextTransformerModelDescOnly": FullTextTransformerModel(PytorchTransformerConfig.bert_german_description_only),
+ #"FullTextFastTextModel": FullTextFastTextModel(),
+ #"FullTextFastTextModelTitleOnly": FullTextFastTextModelTitleOnly(),
+ #"FullTextFastTextModelDescOnly": FullTextFastTextModelDescOnly(),
+ #"FullTextSvmModel": FullTextSvmModel(),
+ #"FullTextSvmModelTitleOnly": FullTextSvmModelTitleOnly(),
+ #"FullTextSvmModelDescOnly": FullTextSvmModelDescOnly(),
+}
+
+
+def load(path):
+ with open(path, 'r', encoding='utf8') as json_file:
+ tender_dicts = json.load(json_file)
+ tenders = list(map(lambda x: Tender.from_json_dict(x), tender_dicts))
+ return tenders
+
+
+class Validator:
+ @profile
+ def __init__(self):
+ self.start = time.time()
+ labelled_tenders = self.get_tenders()
+ #labelled_tenders = labelled_tenders[:1000]
+ kfold = KFold(5, True, 1)
+
+ # enumerate splits
+ iteration = 0
+ for train, test in kfold.split(labelled_tenders):
+ iteration = iteration + 1
+ print('K %s of 5' % iteration)
+ print('train: %s, test: %s' % (len(train), len(test)))
+ for name, impl in models.items():
+ impl.create_new_model()
+ begin = time.time()
+ impl.train([labelled_tenders[i] for i in train])
+ print(time.time() - begin)
+ self.write_result(name, iteration, impl.validate([labelled_tenders[i] for i in test]))
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ def write_result(self, name, iteration, val_result):
+ with open('test_result_' + str(self.start) + '.csv', mode='a') as file:
+ writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+
+ writer.writerow([name, iteration, val_result.tn, val_result.fp, val_result.fn, val_result.tp,
+ val_result.accuracy, val_result.precision, val_result.recall, val_result.f1])
+
+ def get_tenders(self):
+ pos_path = "dev_pos_tenders.json"
+ neg_path = "dev_neg_tenders.json"
+
+ pos_tenders = load(pos_path)
+ neg_tenders = load(neg_path)
+
+ pos_labels = [1] * len(pos_tenders)
+ neg_labels = [0] * len(neg_tenders)
+
+ labelled_tenders = list(zip(pos_tenders, pos_labels)) + list(zip(neg_tenders, neg_labels))
+
+ random.shuffle(labelled_tenders)
+ return labelled_tenders
+
+
+if __name__ == '__main__':
+ Validator()
diff --git a/src/entity/LabeledTenderCollection.py b/src/entity/LabeledTenderCollection.py
new file mode 100644
index 0000000..1a26409
--- /dev/null
+++ b/src/entity/LabeledTenderCollection.py
@@ -0,0 +1,19 @@
+class LabelledTenderCollection:
+
+ def __init__(self, labelled_tenders):
+ self.labelled_tenders = labelled_tenders
+
+ def get_original_language_entity_description(self):
+ return list(map(lambda x: x.get_original_language_entity().description, self.get_tenders()))
+
+ def get_titles(self, language="EN"):
+ return list(map(lambda x: x.get_title(language), self.get_tenders()))
+
+ def get_descriptions(self, language="EN"):
+ return list(map(lambda x: x.get_description(language), self.get_tenders()))
+
+ def get_tenders(self):
+ return [i for i, j in self.labelled_tenders]
+
+ def get_labels(self):
+ return [int(j) for i, j in self.labelled_tenders]
\ No newline at end of file
diff --git a/src/entity/Tender.py b/src/entity/Tender.py
index 1e76d90..0f463ba 100644
--- a/src/entity/Tender.py
+++ b/src/entity/Tender.py
@@ -12,33 +12,46 @@ class Tender:
@classmethod
def from_json_dict(cls, serialized_dict):
id = serialized_dict["id"]
+ original_lang = serialized_dict["original_lang"]
cpvs = serialized_dict["cpvs"]
lang_entities = {}
+
for e in serialized_dict["languageentities"]:
- lang_entry = TenderLanguageEntity(e["title"], e["description"])
+ lang_entry = TenderLanguageEntity(e["title"], e["description"], e["link"])
lang_entities[e["language"]] = lang_entry
- return cls(id, cpvs, lang_entities)
+ return cls(id, cpvs, lang_entities, original_lang)
- def __init__(self, id: str, cpvs: List[str], lang_entities=None):
+ def __init__(self, id: str, cpvs: List[str], lang_entities=None, original_lang=None):
+ self.original_lang = original_lang
self.id = id
self.cpvs = cpvs
+
if lang_entities is None:
lang_entities = {}
+
self.lang_entities = lang_entities
def add_language_entity(self, language_key, title, description="", link=""):
entity = TenderLanguageEntity(title, description, link)
self.lang_entities[language_key] = entity
+ def set_original_language_entity(self, language_key, title, description="", link=""):
+ self.add_language_entity(language_key, title, description, link)
+ self.original_lang = language_key
+
def get_title(self, language):
return self.lang_entities[language].title
+ def get_original_language_entity(self):
+ return self.lang_entities[self.original_lang]
+
def get_description(self, language):
return self.lang_entities[language].description
def get_dict(self):
- contract = {"id": self.id, "cpvs": list(self.cpvs)}
+ contract = {"id": self.id, "cpvs": list(self.cpvs), "original_lang": self.original_lang}
lang_list = []
+
for k, v in self.lang_entities.items():
lang_entry = {"language": k, "title": v.title, "description": v.description, "link": v.link}
lang_list.append(lang_entry)
diff --git a/src/entity/TenderLanguageEntity.py b/src/entity/TenderLanguageEntity.py
index a6590de..c02d3cd 100644
--- a/src/entity/TenderLanguageEntity.py
+++ b/src/entity/TenderLanguageEntity.py
@@ -3,7 +3,19 @@ class TenderLanguageEntity:
This class holds the title and description of one tender for a certain language.
"""
- def __init__(self, title, description, link):
- self.title = title
- self.description = description
- self.link = link
+ def __init__(self, title: str, description: str, link: str):
+ self.title: str = title
+ self.description: str = description
+ self.link: str = link
+
+ @classmethod
+ def from_json_dict(cls, serialized_dict):
+ title = serialized_dict["title"]
+ description = serialized_dict["description"]
+ link = serialized_dict["link"]
+
+ return cls(title, description, link)
+
+ def get_dict(self):
+ lang_entity = {"title": self.title, "description": self.description, "link": self.link}
+ return lang_entity
diff --git a/src/entity/ValidationResult.py b/src/entity/ValidationResult.py
new file mode 100644
index 0000000..9f143c8
--- /dev/null
+++ b/src/entity/ValidationResult.py
@@ -0,0 +1,26 @@
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
+
+
+class ValidationResult:
+ """
+ This class serves as a place to standardize the saved validation measurements
+ """
+
+ def __init__(self, labels, pred):
+ self.tn, self.fp, self.fn, self.tp = confusion_matrix(labels, pred).ravel()
+ self.accuracy = accuracy_score(labels, pred)
+ self.precision = precision_score(labels, pred)
+ self.recall = recall_score(labels, pred)
+ self.f1 = f1_score(labels, pred)
+
+ def toDict(self):
+ return {
+ "tn": self.tn.item(),
+ "fp": self.fp.item(),
+ "fn": self.fn.item(),
+ "tp": self.tp.item(),
+ "accuracy": self.accuracy,
+ "precision": self.precision,
+ "recall": self.recall,
+ "f1": self.f1
+ }
diff --git a/src/entity/__init__.py b/src/entity/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/fetcher/Fetcher.py b/src/fetcher/Fetcher.py
deleted file mode 100644
index 233f689..0000000
--- a/src/fetcher/Fetcher.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from typing import List
-from src.fetcher.ted.TedFetcher import TedFetcher
-
-
-class Fetcher:
- """
- This class fetches tenders from provides databases.
- Currently, only TED serves as database.
- """
-
- def __init__(self):
- self.ted_fetcher = TedFetcher()
-
- def get(self, count: int, load_documents: bool = False, search_criteria: str = "", languages: List[str] = ["DE"], page_offset: int = 0):
- return self.ted_fetcher.get(count, load_documents, search_criteria, languages, page_offset)
-
diff --git a/src/main.py b/src/main.py
index dede7c1..23ef70a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,24 +1,21 @@
-# Use src path so that the python interpreter can access all modules
-import datetime
+# import dependencies
import os
import sys
-sys.path.append(os.getcwd()[:os.getcwd().index('src')])
-
-# import all own modules
-from src.classifier.SpacyScikitModel import SpacyScikitModel
-from src.classifier.TransformerModel import TransformerModel
-from src.persistence.Persistence import Persistence
-from src.service.Recommender import Recommender
-from src.service.Trainer import Trainer
-# import dependencies
-from flask import Flask, request, jsonify
+from flask import Flask
from flask_swagger_ui import get_swaggerui_blueprint
from flask_cors import CORS
-from datetime import date
-from datetime import datetime
import logging
+# Use src path so that the python interpreter can access all modules
+sys.path.append(os.getcwd()[:os.getcwd().index('src')])
+
+# import routes
+from src.routes.v1.fetch import fetch_blueprint
+from src.routes.v1.model import model_blueprint
+from src.routes.v1.persistence import persistence_blueprint
+from src.routes.v1.web import web_blueprint
+
# set up logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -27,12 +24,17 @@
datefmt='%Y-%m-%d %H:%M:%S')
logger.info("start tenderclass-backend")
+# set up routes
app = Flask(__name__)
+app.register_blueprint(model_blueprint, url_prefix="/api/v1/model")
+app.register_blueprint(web_blueprint, url_prefix="/api/v1/web")
+app.register_blueprint(persistence_blueprint, url_prefix="/api/v1/persistence")
+app.register_blueprint(fetch_blueprint, url_prefix="/api/v1/fetch")
CORS(app)
# set up Swagger documentation
SWAGGER_URL = '/swagger'
-API_URL = '/static/swagger.json'
+API_URL = '/static/api.yaml'
SWAGGERUI_BLUEPRINT = get_swaggerui_blueprint(
SWAGGER_URL,
API_URL,
@@ -42,89 +44,5 @@
)
app.register_blueprint(SWAGGERUI_BLUEPRINT, url_prefix=SWAGGER_URL)
-# TODO: select the Machine Learning model
-tender_model = SpacyScikitModel()
-#tender_model = TransformerModel()
-
-tender_recommender = Recommender(tender_model)
-tender_trainer = Trainer(tender_model)
-tender_persistence = Persistence()
-
-
-@app.route("/api/v1/web/recommendations", methods=['GET'])
-def get_recommendations():
- # use query parameters to overwrite default count and date
- count = int(request.args.get('count'))
- if count is None:
- # download all tenders (indicated by count=0)
- count = 0
- today = request.args.get('date')
- if today is None:
- # DEFAULT: get tenders of today
- today = datetime.strftime(date.today(), "%Y%m%d")
- search_criteria = " AND PD=[" + today + "]"
- tenders = tender_recommender.get_recommendations(count, search_criteria)
- return jsonify(list(map(lambda x: x.get_dict(), tenders)))
-
-
-@app.route("/api/v1/web/train", methods=['POST'])
-def post_train_from_web():
- body = request.json
- train_tender_ids = body["ids"]
- train_tender_labels = body["labels"]
- tender_trainer.train(train_tender_ids, train_tender_labels)
-
- return "ok"
-
-
-@app.route("/api/v1/model/new", methods=['POST'])
-def post_create_new():
- body = request.json
- pos_number = body["pos_number"]
- neg_number = body["neg_number"]
- pos_search_criteria = body["pos_search_criteria"]
- neg_search_criteria = body["neg_search_criteria"]
-
- tender_trainer.create_and_init(pos_number, pos_search_criteria, neg_number, neg_search_criteria)
-
- return "ok"
-
-
-
-### Additional endpoints for saving tenders and training tenders from the file system.
-### NOT documented yet because it is not scope of this bachelor thesis
-@app.route("/api/v1/persistence/save", methods=['POST'])
-def post_save():
- path = request.json["path"]
- search_criteria = request.json["search_criteria"]
- count = int(request.args.get('count'))
- tenders = tender_recommender.get_all(count, search_criteria=search_criteria)
- tender_persistence.save(tenders, path)
-
- return "ok"
-
-
-@app.route("/api/v1/persistence/train", methods=['POST'])
-def post_train_from_persistence():
- neg_path = request.json["neg_path"]
- pos_path = request.json["pos_path"]
- neg_tenders = tender_persistence.load(neg_path)
- pos_tenders = tender_persistence.load(pos_path)
- tender_trainer.train_from_entities(neg_tenders, pos_tenders)
-
- return "ok"
-
-
-@app.route("/api/v1/web", methods=['GET'])
-def get_all():
- count = int(request.args.get('count'))
- date_filter = request.args.get('date')
- search_criteria = ""
- if date_filter:
- search_criteria = " AND PD=[" + date_filter + "]"
- tenders = tender_recommender.get_all(count, search_criteria=search_criteria)
- return jsonify(list(map(lambda x: x.get_dict(), tenders)))
-
-
if __name__ == "__main__":
app.run(host='0.0.0.0')
diff --git a/src/persistence/Persistence.py b/src/persistence/Persistence.py
index c714111..0f8f57b 100644
--- a/src/persistence/Persistence.py
+++ b/src/persistence/Persistence.py
@@ -6,11 +6,13 @@
class Persistence:
def save(self, tenders, path):
- with open("data/" + path, 'w', encoding='utf8') as json_file:
+ with open("./data/" + path, 'w', encoding='utf8') as json_file:
json.dump(list(map(lambda x: x.get_dict(), tenders)), json_file, ensure_ascii=False)
+ json_file.flush()
+ json_file.close()
def load(self, path):
- with open("data/" + path, 'r', encoding='utf8') as json_file:
+ with open("./data/" + path, 'r', encoding='utf8') as json_file:
tender_dicts = json.load(json_file)
tenders = list(map(lambda x: Tender.from_json_dict(x), tender_dicts))
return tenders
diff --git a/src/routes/__init__.py b/src/routes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/routes/v1/__init__.py b/src/routes/v1/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/routes/v1/fetch.py b/src/routes/v1/fetch.py
new file mode 100644
index 0000000..93e780f
--- /dev/null
+++ b/src/routes/v1/fetch.py
@@ -0,0 +1,21 @@
+from http.client import BAD_REQUEST
+
+from flask import Blueprint, request, abort
+from marshmallow import ValidationError
+
+from src.config import tender_fetcher
+from src.validation.ted_save_validation import TedSaveValidation
+
+fetch_blueprint = Blueprint('download_blueprint', __name__)
+ted_save_validation = TedSaveValidation()
+
+
+@fetch_blueprint.route("/ted", methods=['POST'])
+def fetch():
+ try:
+ model = ted_save_validation.load(request.json)
+ tender_fetcher.fetch_and_save(model)
+
+ return "ok"
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
diff --git a/src/routes/v1/model.py b/src/routes/v1/model.py
new file mode 100644
index 0000000..29aa94f
--- /dev/null
+++ b/src/routes/v1/model.py
@@ -0,0 +1,71 @@
+from http.client import BAD_REQUEST
+
+from flask import Blueprint, request, abort, jsonify
+from src.config import tender_trainer
+from src.entity.ValidationResult import ValidationResult
+from src.validation.create_from_datasets_validation import CreateFromDatasetsValidation
+from src.validation.create_new_validation import CreateNewValidation
+from marshmallow import ValidationError
+
+from src.validation.model_name_validation import ModelNameValidation
+
+model_blueprint = Blueprint('model_blueprint', __name__)
+create_new_validation = CreateNewValidation()
+create_from_datasets_validation = CreateFromDatasetsValidation()
+model_name_validation = ModelNameValidation()
+
+
+@model_blueprint.route("/new", methods=['POST'])
+def post_create_new():
+ try:
+ tender_trainer.create_new()
+
+ return "ok"
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
+
+
+@model_blueprint.route("/train-from-datasets", methods=['POST'])
+def post_train_from_datasets():
+ try:
+ model = create_from_datasets_validation.load(request.json)
+
+ tender_trainer.load_and_train(model)
+
+ return "ok"
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
+
+@model_blueprint.route("/validate-on-dataset", methods=['POST'])
+def validate_on_datasets():
+ try:
+ model = create_from_datasets_validation.load(request.json)
+
+        result: ValidationResult = tender_trainer.validate(model)
+
+        return jsonify(result.toDict())
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
+
+@model_blueprint.route("/save", methods=['POST'])
+def save_model():
+ try:
+ model = model_name_validation.load(request.json)
+
+ tender_trainer.save(model)
+
+ return "ok"
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
+
+
+@model_blueprint.route("/load", methods=['POST'])
+def load_model():
+ try:
+ model = model_name_validation.load(request.json)
+
+ tender_trainer.load(model)
+
+ return "ok"
+ except ValidationError as err:
+ abort(BAD_REQUEST, str(err.messages))
diff --git a/src/routes/v1/persistence.py b/src/routes/v1/persistence.py
new file mode 100644
index 0000000..390518c
--- /dev/null
+++ b/src/routes/v1/persistence.py
@@ -0,0 +1,30 @@
+from flask import Blueprint, request
+
+from src.config import tender_recommender, tender_persistence, tender_trainer
+
+persistence_blueprint = Blueprint('persistence_blueprint', __name__)
+
+
+### Additional endpoints for saving tenders and training tenders from the file system.
+### NOT documented yet because it is not scope of this bachelor thesis
+
+@persistence_blueprint.route("/save", methods=['POST'])
+def post_save():
+ path = request.json["path"]
+ search_criteria = request.json["search_criteria"]
+ count = int(request.args.get('count'))
+ tenders = tender_recommender.get_all(count, search_criteria=search_criteria)
+ tender_persistence.save(tenders, path)
+
+ return "ok"
+
+
+@persistence_blueprint.route("/train", methods=['POST'])
+def post_train_from_persistence():
+ neg_path = request.json["neg_path"]
+ pos_path = request.json["pos_path"]
+ neg_tenders = tender_persistence.load(neg_path)
+ pos_tenders = tender_persistence.load(pos_path)
+ tender_trainer.train_from_entities(neg_tenders, pos_tenders)
+
+ return "ok"
diff --git a/src/routes/v1/web.py b/src/routes/v1/web.py
new file mode 100644
index 0000000..ce3802c
--- /dev/null
+++ b/src/routes/v1/web.py
@@ -0,0 +1,42 @@
+from flask import Blueprint, request, jsonify
+from src.config import tender_recommender, tender_trainer
+
+web_blueprint = Blueprint('web_blueprint', __name__)
+
+
+@web_blueprint.route("/recommendations", methods=['GET'])
+def get_recommendations():
+ # use query parameters to overwrite default count, language and date
+ count = int(request.args.get('count'))
+ if count is None:
+ # download all tenders (indicated by count=0)
+ count = 0
+ date_filter = request.args.get('date')
+ search_criteria = ""
+    if date_filter and "undefined" not in date_filter:
+        # DEFAULT: get all tenders without a specific date
+ search_criteria = " AND PD=[" + date_filter + "]"
+
+ tenders = tender_recommender.get_recommendations(count, search_criteria)
+ return jsonify(list(map(lambda x: x.get_dict(), tenders)))
+
+
+@web_blueprint.route("/train", methods=['POST'])
+def post_train_from_web():
+ body = request.json
+ train_tender_ids = body["ids"]
+ train_tender_labels = body["labels"]
+ tender_trainer.train(train_tender_ids, train_tender_labels)
+
+ return "ok"
+
+
+@web_blueprint.route("/", methods=['GET'])
+def get_all():
+ count = int(request.args.get('count'))
+ date_filter = request.args.get('date')
+ search_criteria = ""
+ if date_filter and "undefined" not in date_filter:
+ search_criteria = " AND PD=[" + date_filter + "]"
+ tenders = tender_recommender.get_all(count, search_criteria=search_criteria)
+ return jsonify(list(map(lambda x: x.get_dict(), tenders)))
diff --git a/src/service/Recommender.py b/src/service/Recommender.py
index 0e42cb6..b63ffe4 100644
--- a/src/service/Recommender.py
+++ b/src/service/Recommender.py
@@ -1,6 +1,4 @@
-from src.classifier.TransformerModel import TransformerModel
-from src.fetcher.Fetcher import Fetcher
-from datetime import datetime
+from src.service.fetcher.Fetcher import Fetcher
class Recommender:
@@ -11,14 +9,11 @@ class Recommender:
def __init__(self, tender_model):
self.tender_fetcher = Fetcher()
self.tender_model = tender_model
- self.cached_selected_tenders = []
self.cached_search_criteria = ""
def get_recommendations(self, count, search_criteria = ""):
- if not self.cached_selected_tenders or self.cached_selected_tenders != search_criteria:
- tenders = self.tender_fetcher.get(count, search_criteria=search_criteria)
- self.cached_selected_tenders = self.tender_model.classify(tenders)
- return self.cached_selected_tenders
+ tenders = self.tender_fetcher.get(count, search_criteria=search_criteria)
+ return self.tender_model.classify(tenders)
def get_all(self, count, search_criteria=""):
tenders = self.tender_fetcher.get(count, search_criteria=search_criteria)
diff --git a/src/service/Trainer.py b/src/service/Trainer.py
index 2c6fdf7..689090f 100644
--- a/src/service/Trainer.py
+++ b/src/service/Trainer.py
@@ -1,8 +1,12 @@
-from src.classifier.TransformerModel import TransformerModel
-from src.fetcher.Fetcher import Fetcher
+from src.Models.FromDatasetsModelModel import FromDatasetsModel
import random
import logging
+from src.Models.ModelNameModel import ModelNameModel
+from src.entity.ValidationResult import ValidationResult
+from src.persistence.Persistence import Persistence
+from src.service.fetcher.Fetcher import Fetcher
+
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -15,6 +19,16 @@ class Trainer:
def __init__(self, tender_model):
self.tender_fetcher = Fetcher()
self.tender_model = tender_model
+ self.persistence = Persistence()
+
+ def create_new(self):
+ self.tender_model.create_new_model()
+
+ def save(self, model: ModelNameModel):
+ self.tender_model.save(model.name)
+
+ def load(self, model: ModelNameModel):
+ self.tender_model.load(model.name)
def train(self, tender_ids, labels):
search_arg = " OR ".join(tender_ids)
@@ -25,23 +39,24 @@ def train(self, tender_ids, labels):
self.tender_model.train(labelled_tenders)
- def create_and_init(self, pos_number, pos_search_criteria, neg_number, neg_search_criteria):
- self.tender_model.create_new_model()
- if (pos_number + neg_number) == 0:
- return
-
- pos_tenders = self.tender_fetcher.get(pos_number, search_criteria=pos_search_criteria)
- neg_tenders = self.tender_fetcher.get(neg_number, search_criteria=neg_search_criteria)
+ def validate(self, model: FromDatasetsModel) -> ValidationResult:
+ pos_tenders = self.persistence.load(model.pos_filename)
+ neg_tenders = self.persistence.load(model.neg_filename)
- pos_labels = [1]*len(pos_tenders)
- neg_labels = [0]*len(neg_tenders)
+ pos_labels = [1] * len(pos_tenders)
+ neg_labels = [0] * len(neg_tenders)
labelled_tenders = list(zip(pos_tenders, pos_labels)) + list(zip(neg_tenders, neg_labels))
random.shuffle(labelled_tenders)
- self.tender_model.train(labelled_tenders)
- logger.info("tenders successfully downloaded and labelled")
+ return self.tender_model.validate(labelled_tenders)
+
+ def load_and_train(self, model: FromDatasetsModel):
+ pos_tenders = self.persistence.load(model.pos_filename)
+ neg_tenders = self.persistence.load(model.neg_filename)
+
+ return self.train_from_entities(neg_tenders, pos_tenders)
def train_from_entities(self, neg_tenders, pos_tenders):
pos_labels = [1] * len(pos_tenders)
@@ -52,5 +67,4 @@ def train_from_entities(self, neg_tenders, pos_tenders):
random.shuffle(labelled_tenders)
self.tender_model.train(labelled_tenders)
-
-
+        logger.info(f"tenders successfully loaded and labelled")
diff --git a/src/service/fetcher/Fetcher.py b/src/service/fetcher/Fetcher.py
new file mode 100644
index 0000000..d446860
--- /dev/null
+++ b/src/service/fetcher/Fetcher.py
@@ -0,0 +1,26 @@
+from typing import List
+
+from src.Models.TedSaveModel import TedSaveModel
+from src.entity.Tender import Tender
+from src.persistence.Persistence import Persistence
+from src.service.fetcher.ted.TedFetcher import TedFetcher
+
+
+class Fetcher:
+ """
+    This class fetches tenders from the provided databases.
+ Currently, only TED serves as database.
+ """
+
+ def __init__(self):
+ self.ted_fetcher = TedFetcher()
+ self.persistence = Persistence()
+
+ def get(self, count: int, load_documents: bool = False, search_criteria: str = "", languages: List[str] = ["DE", "EN"], original_languages = ["DE"], page_offset: int = 0) -> List[Tender]:
+ return self.ted_fetcher.get(count, load_documents, search_criteria, original_languages, languages, page_offset)
+
+ def fetch_and_save(self, model: TedSaveModel):
+ tenders = self.ted_fetcher.from_ted_save_model(model)
+ self.persistence.save(tenders, model.dataset_name + ".json")
+ return
+
diff --git a/src/service/fetcher/__init__.py b/src/service/fetcher/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/fetcher/ted/TedDownloader.py b/src/service/fetcher/ted/TedDownloader.py
similarity index 96%
rename from src/fetcher/ted/TedDownloader.py
rename to src/service/fetcher/ted/TedDownloader.py
index 3282b6c..7f19eea 100644
--- a/src/fetcher/ted/TedDownloader.py
+++ b/src/service/fetcher/ted/TedDownloader.py
@@ -20,7 +20,7 @@ def get_xml_contracts(self, page: int, count: int, search_criteria: str = "", pa
# CY=[UK or DE or AT] ... country filter
querystring = {"fields": "CONTENT", "pageNum": str(page + page_offset), "pageSize": str(count),
- "q": "TD=[\"Contract notice\"]" + search_criteria, "reverseOrder": "false", "scope": "3",
+ "q": "TD=[\"Contract notice\"]" + search_criteria, "reverseOrder": "true", "scope": 3,
"sortField": "ND"}
response = requests.request("GET", self.TED_URL_SEARCH, params=querystring)
diff --git a/src/fetcher/ted/TedExtractor.py b/src/service/fetcher/ted/TedExtractor.py
similarity index 60%
rename from src/fetcher/ted/TedExtractor.py
rename to src/service/fetcher/ted/TedExtractor.py
index 270830e..e5936bc 100644
--- a/src/fetcher/ted/TedExtractor.py
+++ b/src/service/fetcher/ted/TedExtractor.py
@@ -20,10 +20,18 @@ class TedExtractor:
This class converts the xml version of one tender to the entity.
"""
+ def __init__(self):
+ self.tender = None
+
def extract(self, xml_doc: Soup, languages: List[str]):
# parse document id
- ted_export = xml_doc.findAll(re.compile("TED_EXPORT"))[0]
+ try:
+ ted_export = xml_doc.findAll(re.compile("TED_EXPORT"))[0]
+ except:
+            # can't retrieve notices issued before 2009
+ return None
+
tender_id = "EU" + ted_export['DOC_ID']
# parse cpv code
@@ -40,13 +48,49 @@ def extract(self, xml_doc: Soup, languages: List[str]):
logger.error(xml_doc.prettify())
raise Exception("could not retrieve CPV for contract")
- tender = Tender(tender_id, tender_cpvs)
+ self.tender = Tender(tender_id, tender_cpvs)
+
+ # extract original
+ try:
+ # extract original
+ original_section = xml_doc.findAll(attrs={"CATEGORY": "ORIGINAL"})
+ original_language = original_section[0]['LG']
+
+ # extract original title
+ original_title_ti_doc = original_section[0].findAll(re.compile("TITLE"))
+ if len(original_title_ti_doc) == 0:
+ original_title_ti_doc = original_section[0].findAll(re.compile("TI_DOC"))
+
+ # extract original short description
+ original_short_descr_ti_doc = original_section[0].findAll(re.compile("SHORT_DESCR"))
+ if len(original_short_descr_ti_doc) == 0:
+ original_short_descr_ti_doc = original_section[0].findAll(
+ re.compile("SHORT_CONTRACT_DESCRIPTION"))
+
+ if len(original_short_descr_ti_doc) == 0:
+ original_short_descr_ti_doc = original_section[0].find_all(re.compile("TI_MARK"),
+ string="Contract description:")
+ if len(original_short_descr_ti_doc) != 0:
+ original_short_descr_ti_doc = [original_short_descr_ti_doc[0].findNext(re.compile("TXT_MARK"))]
+
+ original_short_description = extract_text(original_short_descr_ti_doc[0]) if len(
+ original_short_descr_ti_doc) != 0 else ""
+ original_title = extract_text(original_title_ti_doc[0]) if len(original_title_ti_doc) != 0 else ""
+
+ original_link = extract_text(xml_doc.findAll(re.compile("URI_DOC"))[0])
+
+ self.tender.set_original_language_entity(original_language, original_title, original_short_description, original_link)
+ except:
+ logger.error("Could not retrieve original language data for contract")
+ logger.error(xml_doc.prettify())
+ raise Exception("Could not retrieve original language data for contract")
# extract title and description for each language
for lg in languages:
title = None
short_desc = None
+ link = None
# first format of contract
try:
@@ -56,6 +100,7 @@ def extract(self, xml_doc: Soup, languages: List[str]):
if ml_ti_doc:
ti_text = ml_ti_doc[0].findAll(re.compile("TI_TEXT"))[0]
title = extract_text(ti_text)
+
except:
logger.debug(f"could not parse first format of contract {tender_id}")
@@ -96,11 +141,10 @@ def extract(self, xml_doc: Soup, languages: List[str]):
# extract link
try:
- link = extract_text(xml_doc.findAll(re.compile("URI_DOC"), {"LG": lg})[0])
+ link = extract_text(xml_doc.findAll(re.compile("URI_DOC"))[0])
except:
logger.debug(f"could not parse link for language {lg}")
+ self.tender.add_language_entity(lg, title, short_desc, link)
- tender.add_language_entity(lg, title, short_desc, link)
-
- return tender
+ return self.tender
diff --git a/src/fetcher/ted/TedFetcher.py b/src/service/fetcher/ted/TedFetcher.py
similarity index 50%
rename from src/fetcher/ted/TedFetcher.py
rename to src/service/fetcher/ted/TedFetcher.py
index c294ace..4e6f77f 100644
--- a/src/fetcher/ted/TedFetcher.py
+++ b/src/service/fetcher/ted/TedFetcher.py
@@ -1,6 +1,9 @@
from typing import List
-from src.fetcher.ted.TedDownloader import TedDownloader
-from src.fetcher.ted.TedExtractor import TedExtractor
+
+from src.Models.TedSaveModel import TedSaveModel
+from src.entity.Tender import Tender
+from src.service.fetcher.ted.TedDownloader import TedDownloader
+from src.service.fetcher.ted.TedExtractor import TedExtractor
import sys
@@ -16,8 +19,23 @@ def __init__(self):
self.ted_downloader = TedDownloader()
self.ted_extractor = TedExtractor()
- def get(self, count: int, load_documents: bool = False, search_criteria: str = "", languages: List[str] = ["DE"],
- page_offset: int = 0):
+ def from_ted_save_model(self, model: TedSaveModel):
+ return self.get(count=model.amount, search_criteria=model.search_criteria,
+ original_languages=model.original_languages, languages=model.languages)
+
+ def get(self, count: int, load_documents: bool = False, search_criteria: str = "", original_languages=[],
+ languages: List[str] = ["DE", "EN"], page_offset: int = 0):
+
+ if original_languages is None:
+ original_languages = []
+
+ if languages is None:
+ languages = []
+
+        # each original language must occur in the final tender entity
+ for language in original_languages:
+ if language not in languages:
+ languages.append(language)
if count <= 0:
count = sys.maxsize
@@ -26,7 +44,7 @@ def get(self, count: int, load_documents: bool = False, search_criteria: str = "
page = 1
last_docs_count = -1
- while last_docs_count != len(ted_docs):
+ while last_docs_count < len(ted_docs):
last_docs_count = len(ted_docs)
xml_docs = self.ted_downloader.get_xml_contracts(page, self.MAX_PAGE_COUNT, search_criteria, page_offset)
@@ -34,23 +52,26 @@ def get(self, count: int, load_documents: bool = False, search_criteria: str = "
for xml_doc in xml_docs:
if xml_doc is not None:
- doc = self.ted_extractor.extract(xml_doc, languages)
+ doc: Tender = self.ted_extractor.extract(xml_doc, languages)
if doc is not None:
-
documents = []
if load_documents:
pass
-
# doc_links = self.ted_extractor.extract_doc_links(xml_doc)
# logger.info("found doc links: " + str(doc_links))
# for doc_link in doc_links:
# documents.append(doc_parse.get_doc_content(doc_link))
# TODO add document links to tender
- ted_docs.append(doc)
+                        # if filtering by original language, only keep tenders in those languages
+ if len(original_languages) > 0:
+ if doc.original_lang in original_languages:
+ ted_docs.append(doc)
+ else:
+ ted_docs.append(doc)
- if len(ted_docs) == count:
+ if len(ted_docs) >= count:
return ted_docs
page += 1
diff --git a/src/static/api.yaml b/src/static/api.yaml
new file mode 100644
index 0000000..c73e676
--- /dev/null
+++ b/src/static/api.yaml
@@ -0,0 +1,287 @@
+openapi: 3.0.0
+info:
+ description: Backend for Machine Learning solution to screen public tenders automatically.
+ version: 1.0.0
+ title: tenderclass-backend
+ contact:
+ email: e11709460@student.tuwien.ac.at
+servers:
+ - url: 'http://localhost:5000/api/v1'
+tags:
+ - name: ML Model
+ description: API for the Machine Learning Model
+paths:
+ /web/recommendations:
+ get:
+ tags:
+ - ML Model
+ parameters:
+ - name: count
+ in: query
+ schema:
+ type: integer
+ summary: Returns today's tender recommendations
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/tender'
+ /web/train:
+ post:
+ tags:
+ - ML Model
+ summary: Trains the existing model with new tenders
+ requestBody:
+ description: JSON of list of tenders and list of labels
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/trainingtuples'
+ responses:
+ '200':
+ description: OK
+ /model/new:
+ post:
+ tags:
+ - ML Model
+ summary: Create a new model
+ responses:
+ '200':
+ description: OK
+ /model/save:
+ post:
+ tags:
+ - ML Model
+ summary: Save a trained model
+ requestBody:
+ description: Model name
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/model_name'
+ responses:
+ '200':
+ description: OK
+ /model/load:
+ post:
+ tags:
+ - ML Model
+ summary: Load a trained model
+ requestBody:
+ description: Model name
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/model_name'
+ responses:
+ '200':
+ description: OK
+ /model/train-from-datasets:
+ post:
+ tags:
+ - ML Model
+ summary: Train a model from already fetched dataset
+ requestBody:
+ description: Location of the positive and negative training instances
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/dataset-location-model'
+ responses:
+ '200':
+ description: OK
+ /model/validate-on-dataset:
+ post:
+ tags:
+ - ML Model
+ summary: Validate a trained model on a given dataset
+ requestBody:
+ description: Filenames of a positive and negative dataset
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/dataset-location-model'
+ responses:
+ '200':
+ description: Validation result
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/validation_result'
+ /fetch/ted:
+ post:
+ tags:
+ - ML Model
+ summary: Fetch and save Tender from TED
+ requestBody:
+        description: Search criteria and storage location
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/fetch_ted_model'
+ responses:
+ '200':
+ description: OK
+components:
+ schemas:
+ tender:
+ type: object
+ properties:
+ id:
+ type: string
+ format: string
+ cpvs:
+ type: array
+ items:
+ type: string
+ original_lang:
+ type: string
+ format: string
+ languageEntities:
+ type: array
+ items:
+ type: object
+ properties:
+ schema:
+ $ref: '#/components/schemas/languageentity'
+ trainingtuples:
+ type: object
+ properties:
+ ids:
+ type: array
+ items:
+ type: string
+ labels:
+ type: array
+ items:
+ type: integer
+ example:
+ ids:
+ - EU00001
+ - EU00002
+ - EU00003
+ labels:
+ - 1
+ - 0
+ - 1
+ trainingconfiguration:
+ type: object
+ properties:
+ pos_number:
+ type: integer
+ format: integer
+ neg_number:
+ type: integer
+ format: integer
+ pos_search_criteria:
+ type: integer
+ format: integer
+ neg_search_criteria:
+ type: integer
+ format: integer
+ example:
+ pos_number: 1000
+ neg_number: 1000
+ pos_search_criteria: " AND PC=[72*]"
+ neg_search_criteria: " AND NOT PC=[72*]"
+ dataset-location-model:
+ type: object
+ properties:
+ pos_filename:
+ type: string
+ format: string
+ neg_filename:
+ type: string
+ format: string
+ example:
+ pos_filename: "example-pos.json"
+ neg_filename: "example-neg.json"
+ languageentity:
+ type: object
+ properties:
+ language:
+ type: string
+ format: string
+ title:
+ type: string
+ format: string
+ description:
+ type: string
+ format: string
+ example:
+ language: DE
+ title: Software- und Systemprogrammierung für hauseigenes ERP-System
+ description: 'Für unser hauseigenes ERP-System soll ein Erweiterungsmodul entwickelt werden, das erlaubt, Python-Skripte einzubetten.'
+ model_name:
+ type: object
+ properties:
+ name:
+ type: string
+ format: string
+ example:
+ name: "example-model"
+ fetch_ted_model:
+ type: object
+ required: ["amount", "search_criteria", "dataset_name"]
+ properties:
+ amount:
+ type: integer
+ format: integer
+ search_criteria:
+ type: integer
+ format: integer
+ original_languages:
+ type: array
+ items:
+ type: string
+ languages:
+ type: array
+ items:
+ type: string
+ dataset_name:
+ type: string
+ format: string
+ example:
+ amount: 100
+ search_criteria: " AND PC=[72*] AND CY=[DE OR FR] "
+ original_languages:
+ - "DE"
+ - "FR"
+ languages:
+ - "EN"
+ dataset_name: "example-dataset"
+ validation_result:
+ type: object
+ properties:
+ tn:
+ type: integer
+ format: integer
+ fp:
+ type: integer
+ format: integer
+ fn:
+ type: integer
+ format: integer
+ tp:
+ type: integer
+ format: integer
+ accuracy:
+ type: number
+ format: double
+ precision:
+ type: number
+ format: double
+ recall:
+ type: number
+ format: double
+ f1:
+ type: number
+ format: double
\ No newline at end of file
diff --git a/src/static/swagger.json b/src/static/swagger.json
deleted file mode 100644
index 657e8fc..0000000
--- a/src/static/swagger.json
+++ /dev/null
@@ -1,196 +0,0 @@
-
-{
- "openapi": "3.0.0",
- "info": {
- "description": "Backend for Machine Learning solution to screen public tenders automatically.",
- "version": "1.0.0",
- "title": "tenderclass-backend",
- "contact": {
- "email": "e1617265@student.tuwien.ac.at"
- }
- },
- "servers": [
- {
- "url": "http://localhost:5000/api/v1"
- }
- ],
- "tags": [
- {
- "name": "ML Model",
- "description": "API for the Machine Learning Model"
- }
- ],
- "paths": {
- "/web/recommendations": {
- "get": {
- "tags": [
- "ML Model"
- ],
- "parameters": [
- {
- "name": "count",
- "in": "query",
- "schema": {
- "type": "integer"
- }
- }
- ],
- "summary": "Returns today's tender recommendations",
- "responses": {
- "200": {
- "description": "OK",
- "schema": {
- "$ref": "#/components/schemas/tender"
- }
- }
- }
- }
- },
- "/web/train": {
- "post": {
- "tags": [
- "ML Model"
- ],
- "summary": "Trains the existing model with new tenders",
- "requestBody": {
- "description": "JSON of list of tenders and list of labels",
- "required": true,
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/trainingtuples"
- }
- }
- }
- },
- "produces": [
- "application/json"
- ],
- "responses": {
- "200": {
- "description": "OK"
- }
- }
- }
- },
- "/model/new": {
- "post": {
- "tags": [
- "ML Model"
- ],
- "summary": "Create a new model",
- "requestBody": {
- "description": "Model training configuration",
- "required": true,
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/trainingconfiguration"
- }
- }
- }
- },
- "produces": [
- "application/json"
- ],
- "responses": {
- "200": {
- "description": "OK"
- }
- }
- }
- }
- },
- "components": {
- "schemas": {
- "tender": {
- "type": "object",
- "properties": {
- "id": {
- "type": "string",
- "format": "string"
- },
- "cpvs": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "languageEntities": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalproperties": {
- "$ref": "#/components/schemas/languageentity"
- }
- }
- }
- }
- },
- "trainingtuples": {
- "type": "object",
- "properties": {
- "ids": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "labels": {
- "type": "array",
- "items": {
- "type": "integer"
- }
- }
- },
- "example": {
- "ids": ["EU00001", "EU00002", "EU00003"],
- "labels": [1, 0, 1]
- }
- },
- "trainingconfiguration": {
- "type": "object",
- "properties": {
- "pos_number": {
- "type": "integer",
- "format": "integer"
- },
- "neg_number": {
- "type": "integer",
- "format": "integer"
- },
- "pos_search_criteria": {
- "type": "integer",
- "format": "integer"
- },
- "neg_search_criteria": {
- "type": "integer",
- "format": "integer"
- }
- }
- },
- "languageentity": {
- "type": "object",
- "properties": {
- "language": {
- "type": "string",
- "format": "string"
- },
- "title": {
- "type": "string",
- "format": "string"
- },
- "description": {
- "type": "string",
- "format": "string"
- }
- },
- "example": {
- "language": "DE",
- "title": "Software- und Systemprogrammierung für hauseigenes ERP-System",
- "description": "Für unser hauseigenes ERP-System soll ein Erweiterungsmodul entwickelt werden, das erlaubt, Python-Skripte einzubetten."
- }
- }
- }
- }
-}
\ No newline at end of file
diff --git a/src/validation/__init__.py b/src/validation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/validation/create_from_datasets_validation.py b/src/validation/create_from_datasets_validation.py
new file mode 100644
index 0000000..dcf48ee
--- /dev/null
+++ b/src/validation/create_from_datasets_validation.py
@@ -0,0 +1,12 @@
+from marshmallow import Schema, fields, post_load
+
+from src.Models.FromDatasetsModelModel import FromDatasetsModel
+
+
+class CreateFromDatasetsValidation(Schema):
+ pos_filename = fields.String(required=True)
+ neg_filename = fields.String(required=True)
+
+ @post_load
+ def make_model(self, data, **kwargs):
+ return FromDatasetsModel(**data)
\ No newline at end of file
diff --git a/src/validation/create_new_validation.py b/src/validation/create_new_validation.py
new file mode 100644
index 0000000..237ef48
--- /dev/null
+++ b/src/validation/create_new_validation.py
@@ -0,0 +1,14 @@
+from marshmallow import Schema, fields, post_load
+
+from src.Models.NewModelModel import NewModelModel
+
+
+class CreateNewValidation(Schema):
+ pos_number = fields.Int(required=True)
+ neg_number = fields.Int(required=True)
+ pos_search_criteria = fields.String(required=True)
+ neg_search_criteria = fields.String(required=True)
+
+ @post_load
+ def make_model(self, data, **kwargs):
+ return NewModelModel(**data)
\ No newline at end of file
diff --git a/src/validation/model_name_validation.py b/src/validation/model_name_validation.py
new file mode 100644
index 0000000..46e0c2b
--- /dev/null
+++ b/src/validation/model_name_validation.py
@@ -0,0 +1,11 @@
+from marshmallow import Schema, fields, post_load
+
+from src.Models.ModelNameModel import ModelNameModel
+
+
+class ModelNameValidation(Schema):
+ name = fields.String(required=True)
+
+ @post_load
+ def make_model(self, data, **kwargs):
+ return ModelNameModel(**data)
\ No newline at end of file
diff --git a/src/validation/ted_save_validation.py b/src/validation/ted_save_validation.py
new file mode 100644
index 0000000..3b98231
--- /dev/null
+++ b/src/validation/ted_save_validation.py
@@ -0,0 +1,15 @@
+from marshmallow import Schema, fields, post_load
+
+from src.Models.TedSaveModel import TedSaveModel
+
+
+class TedSaveValidation(Schema):
+ amount = fields.Int(required=True)
+ search_criteria = fields.String(required=True)
+ original_languages = fields.List(fields.String(), required=False)
+ languages = fields.List(fields.String(), required=False)
+ dataset_name = fields.String(required=True)
+
+ @post_load
+ def make_model(self, data, **kwargs):
+ return TedSaveModel(**data)