78 changes: 78 additions & 0 deletions Lemma.py
@@ -0,0 +1,78 @@
import re

# Honorific signs (U+0610–U+0614), Koranic annotation signs (U+0615–U+061A and
# U+06D6–U+06ED), tatweel (U+0640), and tashkeel (U+064B–U+065F, plus U+0670
# ARABIC LETTER SUPERSCRIPT ALEF).
ARABIC_MARKS = re.compile(r'[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06ED]')
PUNCTUATION = '"()؟!.'

def lemma(word):
    # Strip punctuation.
    for ch in PUNCTUATION:
        word = word.replace(ch, '')
    # Remove honorific signs, Koranic annotation, tatweel, and tashkeel. The
    # original used one anchored pattern per codepoint, e.g. r"^[\u0610]+$",
    # which only matches when the whole word consists of that mark; an
    # unanchored character class removes the marks wherever they occur.
    word = ARABIC_MARKS.sub('', word)
    # Normalize letter variants.
    word = word.replace("ى", "ي")  # alef maksura -> yeh
    word = word.replace("ؤ", "ء")  # waw with hamza -> hamza
    word = word.replace("ئ", "ء")  # yeh with hamza -> hamza
    word = word.replace("ة", "ه")  # teh marbuta -> heh
    word = word.replace("گ", "ك")  # gaf -> kaf
    # Strip the definite article. The original replace('ال', '') removed the
    # sequence anywhere in the word, which also mangles stems that happen to
    # contain it; restricting it to a prefix is the safer reading of the intent.
    word = re.sub(r'^ال', '', word)
    return word
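A minimal usage sketch of lemma(); the example words are illustrative, and the expected outputs follow from the substitutions above rather than from a test suite:

from Lemma import lemma

print(lemma('الكتابُ'))  # -> 'كتاب'  (damma stripped, definite article removed)
print(lemma('مدرسة'))    # -> 'مدرسه' (teh marbuta normalized to heh)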
9 changes: 9 additions & 0 deletions NER.py
@@ -0,0 +1,9 @@
import spacy

class NER_Tagger:
    def __init__(self):
        # Load the locally saved spaCy pipeline from NER/SPACY/.
        self.model = spacy.load('NER/SPACY/')

    def classify(self, text):
        # Run the pipeline and return (entity text, label) pairs.
        doc = self.model(text)
        return [(str(ent), ent.label_) for ent in doc.ents]
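A usage sketch, assuming a trained spaCy pipeline is saved under NER/SPACY/; the entity labels in the comment are hypothetical and depend on how that model was trained:

from NER import NER_Tagger

tagger = NER_Tagger()
print(tagger.classify('يعيش محمد في القاهرة'))
# e.g. [('محمد', 'PERS'), ('القاهرة', 'LOC')]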
15 changes: 15 additions & 0 deletions POS.py
@@ -0,0 +1,15 @@
from POS_SVM import POS_SVM
from POS_BILSTM import POS_BILSTM

class POS:
    def __init__(self, model='SVM'):
        if model == 'SVM':
            self.model = POS_SVM()
        elif model == 'BILSTM':
            self.model = POS_BILSTM()
        else:
            # __init__ cannot return a value (the original returned an error
            # string here, which raises TypeError at instantiation), so raise.
            raise ValueError("Incorrect model name: should be either 'SVM' or 'BILSTM'")
        self.predict_func = self.model.classify

    def predict(self, sent):
        return self.predict_func(sent)
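A usage sketch for the wrapper; the tags in the comment are placeholders, since the real tag set comes from the trained models:

from POS import POS

tagger = POS(model='SVM')  # or POS(model='BILSTM')
print(tagger.predict('ذهب الولد الي المدرسة'))
# e.g. [('ذهب', 'V'), ('الولد', 'N'), ('الي', 'P'), ('المدرسة', 'N')]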
87 changes: 87 additions & 0 deletions POS_BILSTM.py
@@ -0,0 +1,87 @@
import numpy as np
import pickle
import re
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import InputLayer, Embedding, Bidirectional, LSTM, TimeDistributed, Dense

class POS_BILSTM:
    def __init__(self):
        self.word_tokenizer, self.tag_tokenizer = self.__get_tokenizers()
        self.VOCABULARY_SIZE = len(self.word_tokenizer.word_index) + 1
        self.NUM_CLASSES = 35
        self.__embedding_dim = 300
        self.__MAX_SEQUENCE_LENGTH = 398
        self.__trunc_type = 'post'
        self.__padding_type = 'post'
        self.__oov_tok = "<OOV>"  # the saved word tokenizer was built with this OOV token
        self.__model = self.__define_model()

    def __get_tokenizers(self):
        # Load the Keras tokenizers fitted during training.
        with open('POS/BILSTM/word_tokenizer.pickle', 'rb') as handle:
            word_tokenizer = pickle.load(handle)
        with open('POS/BILSTM/tag_tokenizer.pickle', 'rb') as handle:
            tag_tokenizer = pickle.load(handle)
        return word_tokenizer, tag_tokenizer

    def __define_model(self):
        # Rebuild the training architecture, then restore the saved weights.
        # Note the input shape must be a tuple; the original passed a bare int.
        model = Sequential()
        model.add(InputLayer(input_shape=(self.__MAX_SEQUENCE_LENGTH,)))
        model.add(Embedding(input_dim=self.VOCABULARY_SIZE,
                            output_dim=self.__embedding_dim,
                            input_length=self.__MAX_SEQUENCE_LENGTH,
                            trainable=True))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(TimeDistributed(Dense(self.NUM_CLASSES, activation='softmax')))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.load_weights('POS/BILSTM/Bi_LSTM_checkpoint.h5')
        return model

    def clean_str(self, text):
        # Remove tashkeel (diacritics).
        p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
        text = re.sub(p_tashkeel, "", text)

        # Collapse character elongation: any run of a repeated character is
        # reduced to two, then specific doubled letters are reduced to one.
        p_longation = re.compile(r'(.)\1+')
        text = re.sub(p_longation, r"\1\1", text)

        text = text.replace('وو', 'و')
        text = text.replace('يي', 'ي')
        text = text.replace('اا', 'ا')
        # Normalize alef and alef-maksura variants.
        text = text.replace('أ', 'ا')
        text = text.replace('إ', 'ا')
        text = text.replace('آ', 'ا')
        text = text.replace('ى', 'ي')
        return text.split()

    def classify(self, sentence):
        sent = sentence
        tokens = self.clean_str(sentence)
        # texts_to_sequences maps each token to a one-element id list, so the
        # padded batch carries a trailing axis of size 1 that is squeezed away.
        seq = [self.word_tokenizer.texts_to_sequences(tokens)]
        pad_seq = pad_sequences(seq, maxlen=self.__MAX_SEQUENCE_LENGTH,
                                padding=self.__padding_type, truncating=self.__trunc_type)
        pad_seq = np.squeeze(pad_seq, axis=-1)
        pred = np.squeeze(self.__model.predict(pad_seq).argmax(-1))
        # Index 0 is padding; keep only the predictions for real positions.
        output = [self.tag_tokenizer.index_word[tag] for tag in pred if tag != 0]
        # zip stops at the shorter sequence, avoiding the IndexError the
        # original risked if the model emitted a padding tag for a real token.
        return list(zip(sent.split(), output))
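For reference, a sketch of the tensor shapes through classify() for a hypothetical 3-token sentence (the ids are made up):

# tokens                                     -> ['w1', 'w2', 'w3']
# word_tokenizer.texts_to_sequences(tokens)  -> [[12], [7], [3]]  (one id list per token)
# [ ... ] wraps this into a batch of one     -> [[[12], [7], [3]]]
# pad_sequences(..., maxlen=398)             -> shape (1, 398, 1)
# np.squeeze(..., axis=-1)                   -> shape (1, 398)
# model.predict(...).argmax(-1), squeezed    -> shape (398,), one tag id per position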
81 changes: 81 additions & 0 deletions POS_SVM.py
@@ -0,0 +1,81 @@
import pickle
import gensim
import numpy as np

class POS_SVM:
    def __init__(self):
        self.load_files()

    def load_files(self):
        # config.pkl stores the context window size, the tag set, the
        # ambiguity dictionary, and the paths of the embedding model and
        # label encoder.
        with open("POS/SVM/config.pkl", "rb") as tf:
            self.config = pickle.load(tf)
        self.WINDOW = int(self.config['Window'])
        self.embedding_model = gensim.models.Word2Vec.load('POS/SVM/' + self.config['embedding_model_path'])
        self.tags_list = self.config['tags_list']
        self.ambiguities = self.config['ambiguities']
        with open('POS/SVM/' + self.config['encoder'], 'rb') as f:
            self.encoder = pickle.load(f)
        with open('POS/SVM/svm_pos_tag.pickle', 'rb') as f:
            self.model = pickle.load(f)

    def OneHotEncoder(self, indices, length):
        # indices may be a single index or a list of indices (numpy fancy
        # indexing), which allows multi-hot encoding of ambiguous tags.
        vec = np.zeros(length)
        vec[indices] = 1
        return vec

    def getFeatures(self, wordIdx, sentence, pos, w2v, tags, ambiguities, train=True):
        features = []
        half = self.WINDOW // 2

        # Word embeddings for the left context, the word itself, and the
        # right context; out-of-vocabulary words get a zero vector.
        for i in range(-half, half + 1):
            word = sentence[wordIdx + i]
            if word in w2v.wv.key_to_index:
                features.append(w2v.wv.get_vector(word, norm=True))
            else:
                features.append(np.zeros(w2v.vector_size))

        # One-hot tags already assigned to the left context. The original's
        # train and inference branches built identical features here, so the
        # if/else is collapsed.
        for i in reversed(range(1, half + 1)):
            tag = pos[wordIdx - i]
            features.append(self.OneHotEncoder(tags.index(tag), len(tags)))

        # Multi-hot encoding of the word's possible tags, if it is ambiguous.
        if sentence[wordIdx] in ambiguities:
            features.append(self.OneHotEncoder([tags.index(t) for t in ambiguities[sentence[wordIdx]]], len(tags)))
        else:
            features.append(self.OneHotEncoder([], len(tags)))

        # Word length as an extra scalar feature.
        features.append([len(sentence[wordIdx])])

        flat_list = []
        for f in features:
            flat_list.extend(f)
        return flat_list

    def classify(self, sent):
        half = self.WINDOW // 2
        # Pad the sentence with empty words and seed the left context with
        # PAD tags so the first real word has a full window.
        words = [""] * half + sent.split() + [""] * half
        pos = ["PAD"] * half
        for i in range(half, len(words) - half):
            feature = np.array(self.getFeatures(i, words, pos, self.embedding_model,
                                                self.tags_list, self.ambiguities,
                                                train=False)).reshape(1, -1)
            tag = self.model.predict(feature)
            pos.append(self.encoder.inverse_transform(tag)[0])
        # Drop the seed PAD tags; the original hardcoded pos[2:], which is
        # only correct when WINDOW == 5.
        output = pos[half:]
        return list(zip(sent.split(), output))
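For orientation, the per-word feature vector laid out by getFeatures(), assuming WINDOW == 5 and an embedding size of 300 (both actually come from config.pkl):

# 2 left-context + 1 current + 2 right-context embeddings: 5 * 300 = 1500
# one-hot tags of the 2 preceding words:                    2 * len(tags)
# multi-hot vector of the current word's possible tags:     1 * len(tags)
# word length (scalar):                                     1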
49 changes: 49 additions & 0 deletions app.py
@@ -0,0 +1,49 @@
from flask import Flask, render_template, request, jsonify
from POS import POS
from NER import NER_Tagger
from Lemma import lemma

pos_tagger = POS('SVM')
ner_tagger = NER_Tagger()

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('home.html')


@app.route('/join1', methods=['GET', 'POST'])
def lemmatization():
    wordlist = []
    text = request.form['text1']
    for word in text.split():
        temp = lemma(word.strip())
        # Fall back to the original word when the lemma collapses to fewer
        # than three characters (usually an over-stripped token).
        if len(temp) < 3:
            wordlist.append(word)
        else:
            wordlist.append(temp)
    # The original re-mapped keys through str(), a no-op for string keys.
    return jsonify(result={'output': ' '.join(wordlist)})


@app.route('/join2', methods=['GET', 'POST'])
def pos():
    text = request.form['text1']
    output = pos_tagger.predict(text)
    return jsonify(result={'output': output})


@app.route('/join', methods=['GET', 'POST'])
def ner():
    text = request.form['text1']
    output = ner_tagger.classify(text)
    return jsonify(result={'output': output})


if __name__ == '__main__':
    app.run(debug=True)
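A smoke test against a locally running instance, using the requests library; the host and port assume Flask's defaults under app.run(debug=True):

import requests

r = requests.post('http://127.0.0.1:5000/join2', data={'text1': 'ذهب الولد الي المدرسة'})
print(r.json())  # {'result': {'output': [[word, tag], ...]}}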
42 changes: 42 additions & 0 deletions demo_lemmatization.py
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 19:03:10 2021

@author: ehab
"""
from nltk.metrics import accuracy

import qalsadi.lemmatizer


# Lemmatize the contents of a file with the qalsadi lemmatizer.
def input_data(filename):
    with open(filename) as file:
        text = file.read()

    lemmer = qalsadi.lemmatizer.Lemmatizer()
    lemmas = lemmer.lemmatize_text(text)
    return lemmas


# Compare the produced lemmas against a gold file (one lemma per line)
# and return the accuracy as a percentage.
def test_acc(test_file, lemmas):
    with open(test_file, 'r') as file:
        test = [current_place.rstrip() for current_place in file.readlines()]

    reference = lemmas
    a = accuracy(reference, test)
    return a * 100


########------test--------------##
#filename = "/home/ehab/Desktop/data.txt"
#test_file = "/home/ehab/Desktop/test_data.txt"
#lemmas = input_data(filename)
#print(lemmas)
#print(test_acc(test_file, lemmas))
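One caveat worth noting: nltk.metrics.accuracy compares the two sequences element-wise and raises ValueError if their lengths differ, so the lemmatizer output and the gold file must be aligned one-to-one. A tiny illustration with made-up lemmas:

from nltk.metrics import accuracy

print(accuracy(['كتاب', 'ذهب'], ['كتاب', 'هب']))  # 0.5 — one of two positions matches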