diff --git a/Lemma.py b/Lemma.py
new file mode 100644
index 00000000..cb837c56
--- /dev/null
+++ b/Lemma.py
@@ -0,0 +1,78 @@
+import re
+
+def lemma(word):
+    # Remove punctuation marks
+    word = word.replace('"', '')
+    word = word.replace("(", "")
+    word = word.replace(")", "")
+    word = word.replace('؟', '')
+    word = word.replace('!', '')
+    word = word.replace('.', '')
+    word = re.sub("[\u0610]", "", word)  # ARABIC SIGN SALLALLAHOU ALAYHE WA SALLAM
+    word = re.sub("[\u0611]", "", word)  # ARABIC SIGN ALAYHE ASSALLAM
+    word = re.sub("[\u0612]", "", word)  # ARABIC SIGN RAHMATULLAH ALAYHE
+    word = re.sub("[\u0613]", "", word)  # ARABIC SIGN RADI ALLAHOU ANHU
+    word = re.sub("[\u0614]", "", word)  # ARABIC SIGN TAKHALLUS
+    # Remove koranic annotation
+    word = re.sub("[\u0615]", "", word)  # ARABIC SMALL HIGH TAH
+    word = re.sub("[\u0616]", "", word)  # ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
+    word = re.sub("[\u0617]", "", word)  # ARABIC SMALL HIGH ZAIN
+    word = re.sub("[\u0618]", "", word)  # ARABIC SMALL FATHA
+    word = re.sub("[\u0619]", "", word)  # ARABIC SMALL DAMMA
+    word = re.sub("[\u061A]", "", word)  # ARABIC SMALL KASRA
+    word = re.sub("[\u06D6]", "", word)  # ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
+    word = re.sub("[\u06D7]", "", word)  # ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
+    word = re.sub("[\u06D8]", "", word)  # ARABIC SMALL HIGH MEEM INITIAL FORM
+    word = re.sub("[\u06D9]", "", word)  # ARABIC SMALL HIGH LAM ALEF
+    word = re.sub("[\u06DA]", "", word)  # ARABIC SMALL HIGH JEEM
+    word = re.sub("[\u06DB]", "", word)  # ARABIC SMALL HIGH THREE DOTS
+    word = re.sub("[\u06DC]", "", word)  # ARABIC SMALL HIGH SEEN
+    word = re.sub("[\u06DD]", "", word)  # ARABIC END OF AYAH
+    word = re.sub("[\u06DE]", "", word)  # ARABIC START OF RUB EL HIZB
+    word = re.sub("[\u06DF]", "", word)  # ARABIC SMALL HIGH ROUNDED ZERO
+    word = re.sub("[\u06E0]", "", word)  # ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
+    word = re.sub("[\u06E1]", "", word)  # ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
+    word = re.sub("[\u06E2]", "", word)  # ARABIC SMALL HIGH MEEM ISOLATED FORM
+    word = re.sub("[\u06E3]", "", word)  # ARABIC SMALL LOW SEEN
+    word = re.sub("[\u06E4]", "", word)  # ARABIC SMALL HIGH MADDA
+    word = re.sub("[\u06E5]", "", word)  # ARABIC SMALL WAW
+    word = re.sub("[\u06E6]", "", word)  # ARABIC SMALL YEH
+    word = re.sub("[\u06E7]", "", word)  # ARABIC SMALL HIGH YEH
+    word = re.sub("[\u06E8]", "", word)  # ARABIC SMALL HIGH NOON
+    word = re.sub("[\u06E9]", "", word)  # ARABIC PLACE OF SAJDAH
+    word = re.sub("[\u06EA]", "", word)  # ARABIC EMPTY CENTRE LOW STOP
+    word = re.sub("[\u06EB]", "", word)  # ARABIC EMPTY CENTRE HIGH STOP
+    word = re.sub("[\u06EC]", "", word)  # ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
+    word = re.sub("[\u06ED]", "", word)  # ARABIC SMALL LOW MEEM
+    # Remove tatweel
+    word = re.sub("[\u0640]", "", word)
+    # Remove tashkeel
+    word = re.sub("[\u064B]", "", word)  # ARABIC FATHATAN
+    word = re.sub("[\u064C]", "", word)  # ARABIC DAMMATAN
+    word = re.sub("[\u064D]", "", word)  # ARABIC KASRATAN
+    word = re.sub("[\u064E]", "", word)  # ARABIC FATHA
+    word = re.sub("[\u064F]", "", word)  # ARABIC DAMMA
+    word = re.sub("[\u0650]", "", word)  # ARABIC KASRA
+    word = re.sub("[\u0651]", "", word)  # ARABIC SHADDA
+    word = re.sub("[\u0652]", "", word)  # ARABIC SUKUN
+    word = re.sub("[\u0653]", "", word)  # ARABIC MADDAH ABOVE
+    word = re.sub("[\u0654]", "", word)  # ARABIC HAMZA ABOVE
+    word = re.sub("[\u0655]", "", word)  # ARABIC HAMZA BELOW
+    word = re.sub("[\u0656]", "", word)  # ARABIC SUBSCRIPT ALEF
+    word = re.sub("[\u0657]", "", word)  # ARABIC INVERTED DAMMA
+    word = re.sub("[\u0658]", "", word)  # ARABIC MARK NOON GHUNNA
+    word = re.sub("[\u0659]", "", word)  # ARABIC ZWARAKAY
+    word = re.sub("[\u065A]", "", word)  # ARABIC VOWEL SIGN SMALL V ABOVE
+    word = re.sub("[\u065B]", "", word)  # ARABIC VOWEL SIGN INVERTED SMALL V ABOVE
+    word = re.sub("[\u065C]", "", word)  # ARABIC VOWEL SIGN DOT BELOW
+    word = re.sub("[\u065D]", "", word)  # ARABIC REVERSED DAMMA
+    word = re.sub("[\u065E]", "", word)  # ARABIC FATHA WITH TWO DOTS
+    word = re.sub("[\u065F]", "", word)  # ARABIC WAVY HAMZA BELOW
+    word = re.sub("[\u0670]", "", word)  # ARABIC LETTER SUPERSCRIPT ALEF
+    word = word.replace("ى", "ي")  # normalize alef maksura to yeh
+    word = word.replace("ؤ", "ء")  # normalize waw-hamza to hamza
+    word = word.replace("ئ", "ء")  # normalize yeh-hamza to hamza
+    word = word.replace("ة", "ه")  # normalize teh marbuta to heh
+    word = word.replace("گ", "ك")  # normalize gaf to kaf
+    word = re.sub("^ال", "", word)  # strip the definite-article prefix only
+    return word
\ No newline at end of file
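For illustration, a minimal usage sketch of the lemma() helper above; the sample word is mine, not part of the patch:

from Lemma import lemma

# diacritics are stripped and the leading definite article removed
print(lemma("الْكِتَابُ"))  # -> "كتاب"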
"",word)#ARABIC SUBSC^[RIPT A]+$LEF + word = re.sub("^[\u0657]+$", "",word)#ARABIC INVERTED DAMMA + word = re.sub("^[\u0658]+$", "",word)#ARABIC MARK NOON GHUNNA + word = re.sub("^[\u0659]+$", "",word)#ARABIC ZWARAKAY + word = re.sub("^[\u065A]+$", "",word)#ARABIC VOWEL SIGN SMALL V ABOVE + word = re.sub("^[\u065B]+$", "",word)#ARABIC VOWEL SIGN INVERTED SMALL V ABOVE + word = re.sub("^[\u065C]+$", "",word)#ARABIC VOWEL SIGN DOT BELOW + word = re.sub("^[\u065D]+$", "",word)#ARABIC REVERSED DAMMA + word = re.sub("^[\u065E]+$", "",word)#ARABIC FATHA WITH TWO DOTS + word = re.sub("^[\u065F]+$", "",word)#ARABIC WAVY HAMZA BELOW + word = re.sub("^[\u0670]+$", "",word)#ARABIC LETTER SUPERSCRIPT ALEF + word = word.replace("ى", "ي") + word = word.replace("ؤ", "ء") + word = word.replace("ئ", "ء") + word = word.replace("ة", "ه") + word = word.replace("گ", "ك") + word = word.replace('ال','') + return word \ No newline at end of file diff --git a/NER.py b/NER.py new file mode 100644 index 00000000..8f70f645 --- /dev/null +++ b/NER.py @@ -0,0 +1,9 @@ +import spacy + +class NER_Tagger: + def __init__(self): + self.model = spacy.load('NER/SPACY/') + + def classify(self, text): + doc = self.model(text) + return [(str(ent), ent.label_) for ent in doc.ents] \ No newline at end of file diff --git a/POS.py b/POS.py new file mode 100644 index 00000000..ff30642a --- /dev/null +++ b/POS.py @@ -0,0 +1,15 @@ +from POS_SVM import POS_SVM +from POS_BILSTM import POS_BILSTM + +class POS: + def __init__(self,model = 'SVM'): + if model == 'SVM': + self.model = POS_SVM() + self.predict_func = self.model.classify + elif model == 'BILSTM': + self.model = POS_BILSTM() + self.predict_func = self.model.classify + else: + return 'Incorrect Model Name(Model name should be either SVM or BILSTM)' + def predict(self,sent): + return self.predict_func(sent) \ No newline at end of file diff --git a/POS_BILSTM.py b/POS_BILSTM.py new file mode 100644 index 00000000..9f13c8da --- /dev/null +++ b/POS_BILSTM.py @@ -0,0 +1,87 @@ +import gensim +import json +import os +import numpy as np +import pickle +from keras.preprocessing.sequence import pad_sequences +from keras.models import Sequential +from keras.layers import * +from keras.optimizers import Adam,SGD +from keras.models import Model +import re + +class POS_BILSTM: + def __init__(self): + self.word_tokenizer ,self.tag_tokenizer = self.__get_tokenizers() + self.VOCABULARY_SIZE = len(self.word_tokenizer.word_index) + 1 + self.NUM_CLASSES = 35 + self.__embedding_dim = 300 + self.__MAX_SEQUENCE_LENGTH = 398 + self.__trunc_type='post' + self.__padding_type='post' + self.__oov_tok = "" + self.__model = self.__define_model() + + def __get_tokenizers(self): + + with open('POS/BILSTM/word_tokenizer.pickle', 'rb') as handle: + word_tokenizer = pickle.load(handle) + with open('POS/BILSTM/tag_tokenizer.pickle', 'rb') as handle: + tag_tokenizer = pickle.load(handle) + + return word_tokenizer , tag_tokenizer + + + def __define_model(self): + + model = Sequential() + model.add(InputLayer((self.__MAX_SEQUENCE_LENGTH))) + model.add(Embedding(input_dim = self.VOCABULARY_SIZE, + output_dim = self.__embedding_dim, + input_length = self.__MAX_SEQUENCE_LENGTH, + trainable = True + )) + model.add(Bidirectional(LSTM(256, return_sequences=True))) + model.add(Bidirectional(LSTM(256, return_sequences=True))) + model.add(Bidirectional(LSTM(256, return_sequences=True))) + model.add(Bidirectional(LSTM(256, return_sequences=True))) + model.add(TimeDistributed(Dense(self.NUM_CLASSES, activation='softmax'))) + + 
diff --git a/POS_BILSTM.py b/POS_BILSTM.py
new file mode 100644
index 00000000..9f13c8da
--- /dev/null
+++ b/POS_BILSTM.py
@@ -0,0 +1,87 @@
+import gensim
+import json
+import os
+import numpy as np
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import InputLayer, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
+from keras.optimizers import Adam, SGD
+from keras.models import Model
+import re
+
+class POS_BILSTM:
+    def __init__(self):
+        self.word_tokenizer, self.tag_tokenizer = self.__get_tokenizers()
+        self.VOCABULARY_SIZE = len(self.word_tokenizer.word_index) + 1
+        self.NUM_CLASSES = 35
+        self.__embedding_dim = 300
+        self.__MAX_SEQUENCE_LENGTH = 398
+        self.__trunc_type = 'post'
+        self.__padding_type = 'post'
+        self.__oov_tok = "<OOV>"  # out-of-vocabulary token
+        self.__model = self.__define_model()
+
+    def __get_tokenizers(self):
+        # tokenizers are pickled at training time and shipped with the model
+        with open('POS/BILSTM/word_tokenizer.pickle', 'rb') as handle:
+            word_tokenizer = pickle.load(handle)
+        with open('POS/BILSTM/tag_tokenizer.pickle', 'rb') as handle:
+            tag_tokenizer = pickle.load(handle)
+
+        return word_tokenizer, tag_tokenizer
+
+
+    def __define_model(self):
+        # rebuild the training-time architecture, then load the trained weights
+        model = Sequential()
+        model.add(InputLayer(input_shape=(self.__MAX_SEQUENCE_LENGTH,)))
+        model.add(Embedding(input_dim=self.VOCABULARY_SIZE,
+                            output_dim=self.__embedding_dim,
+                            input_length=self.__MAX_SEQUENCE_LENGTH,
+                            trainable=True
+                            ))
+        model.add(Bidirectional(LSTM(256, return_sequences=True)))
+        model.add(Bidirectional(LSTM(256, return_sequences=True)))
+        model.add(Bidirectional(LSTM(256, return_sequences=True)))
+        model.add(Bidirectional(LSTM(256, return_sequences=True)))
+        model.add(TimeDistributed(Dense(self.NUM_CLASSES, activation='softmax')))
+
+        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+        model.load_weights('POS/BILSTM/Bi_LSTM_checkpoint.h5')
+
+        return model
+
+
+    def clean_str(self, text):
+
+        # remove tashkeel (diacritics)
+        p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
+        text = re.sub(p_tashkeel, "", text)
+
+        # collapse character elongation down to at most two repeats
+        p_longation = re.compile(r'(.)\1+')
+        subst = r"\1\1"
+        text = re.sub(p_longation, subst, text)
+
+        text = text.replace('وو', 'و')
+        text = text.replace('يي', 'ي')
+        text = text.replace('اا', 'ا')
+        text = text.replace('أ', 'ا')
+        text = text.replace('إ', 'ا')
+        text = text.replace('آ', 'ا')
+        text = text.replace('ى', 'ي')
+
+        return text.split()
+
+
+    def classify(self, sentence):
+        sent = sentence
+        sentence = self.clean_str(sentence)
+        seq = [self.word_tokenizer.texts_to_sequences(sentence)]  # one id sub-list per word, wrapped as a batch of one
+        pad_seq = pad_sequences(seq, maxlen=self.__MAX_SEQUENCE_LENGTH, padding=self.__padding_type, truncating=self.__trunc_type)
+        pad_seq = np.squeeze(pad_seq, axis=-1)
+        pred = np.squeeze(self.__model.predict(pad_seq).argmax(-1))  # class index per position
+        output = [self.tag_tokenizer.index_word[tag] for tag in pred if tag != 0]  # 0 is the padding index
+        pred_tags = [(sent.split()[i], output[i]) for i in range(len(sent.split()))]
+        return pred_tags
diff --git a/POS_SVM.py b/POS_SVM.py
new file mode 100644
index 00000000..5bcf5457
--- /dev/null
+++ b/POS_SVM.py
@@ -0,0 +1,81 @@
+import pickle
+import gensim
+import numpy as np
+
+class POS_SVM:
+    def __init__(self):
+        self.load_files()
+
+    def load_files(self):
+        with open("POS/SVM/config.pkl", "rb") as tf:
+            self.config = pickle.load(tf)
+        self.WINDOW = self.config['Window']
+        self.embedding_model = gensim.models.Word2Vec.load('POS/SVM/' + self.config['embedding_model_path'])
+        self.tags_list = self.config['tags_list']
+        self.ambiguities = self.config['ambiguities']
+        self.encoder = pickle.load(open('POS/SVM/' + self.config['encoder'], 'rb'))
+        self.model = pickle.load(open('POS/SVM/svm_pos_tag.pickle', 'rb'))
+
+    def OneHotEncoder(self, indices, length):
+        zero = np.zeros(length)
+        zero[indices] = 1  # accepts a single index or a list of indices (multi-hot)
+        return zero
+
+    def getFeatures(self, wordIdx, sentence, pos, w2v, tags, ambiguities, train=True):
+        features = []
+
+        keys = w2v.wv.key_to_index.keys()  # word2vec vocabulary
+        for i in reversed(range(1, int(self.WINDOW)//2 + 1)):
+            if sentence[wordIdx-i] not in keys:
+                features.append(np.zeros(w2v.vector_size))
+            else:
+                features.append(w2v.wv.get_vector(sentence[wordIdx-i], norm=True))
+
+        if sentence[wordIdx] not in keys:
+            features.append(np.zeros(w2v.vector_size))
+        else:
+            features.append(w2v.wv.get_vector(sentence[wordIdx], norm=True))
+
+        for i in range(1, int(self.WINDOW)//2 + 1):
+            if sentence[wordIdx+i] not in keys:
+                features.append(np.zeros(w2v.vector_size))
+            else:
+                features.append(w2v.wv.get_vector(sentence[wordIdx+i], norm=True))
+        if train:
+            for i in reversed(range(1, int(self.WINDOW)//2 + 1)):  # one-hot tags of the preceding words
+                tag = pos[wordIdx-i]
+                features.append(self.OneHotEncoder(tags.index(tag), len(tags)))
+
+            if sentence[wordIdx] in ambiguities:
+                features.append(self.OneHotEncoder([tags.index(i) for i in ambiguities[sentence[wordIdx]]], len(tags)))
+            else:
+                features.append(self.OneHotEncoder([], len(tags)))
+
+        else:
+            for i in reversed(range(1, int(self.WINDOW)//2 + 1)):  # pos now holds previously predicted tags
+                tag = pos[wordIdx-i]
+                features.append(self.OneHotEncoder(tags.index(tag), len(tags)))
+
+            if sentence[wordIdx] in ambiguities:
+                features.append(self.OneHotEncoder([tags.index(i) for i in ambiguities[sentence[wordIdx]]], len(tags)))
+            else:
+                features.append(self.OneHotEncoder([], len(tags)))
+
+            features.append([len(sentence[wordIdx])])  # raw word length
+
+        flat_list = []
+        for i in features:
+            flat_list.extend(i)
+        return flat_list
+
+    def classify(self, sent):
+        words = sent.split()
+        words = ["<PAD>" for i in range(int(self.WINDOW)//2)] + words + ["<PAD>" for i in range(int(self.WINDOW)//2)]
+        pos = ["PAD" for i in range(int(self.WINDOW)//2)]  # seed tags for the left padding
+        for i in range(int(self.WINDOW)//2, len(words) - int(self.WINDOW)//2):
+            feature = np.array(self.getFeatures(i, words, pos, self.embedding_model, self.tags_list, self.ambiguities, train=False)).reshape(1, -1)
+            tag = self.model.predict(feature)
+            pos.append(self.encoder.inverse_transform(tag)[0])
+        output = pos[int(self.WINDOW)//2:]  # drop the leading PAD tags
+        pred_tags = [(sent.split()[i], output[i]) for i in range(len(sent.split()))]
+        return pred_tags
\ No newline at end of file
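As a back-of-the-envelope check of the SVM feature layout built by getFeatures() at inference time: each token is encoded as a window of word2vec vectors, one-hot tags of the preceding words, a multi-hot of the word's candidate tags, and its character length. The numbers below (Window = 5, 300-dimensional vectors, a 35-tag tagset) are illustrative assumptions; the real values live in config.pkl:

# illustrative assumptions; the real values are read from POS/SVM/config.pkl
WINDOW, DIM, NUM_TAGS = 5, 300, 35
embeddings = (2 * (WINDOW // 2) + 1) * DIM  # left context + word + right context
prev_tags = (WINDOW // 2) * NUM_TAGS        # one-hot tags of preceding words
ambiguity = NUM_TAGS                        # multi-hot of candidate tags
word_len = 1                                # raw character length
print(embeddings + prev_tags + ambiguity + word_len)  # -> 1606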
diff --git a/app.py b/app.py
new file mode 100644
index 00000000..c979e95e
--- /dev/null
+++ b/app.py
@@ -0,0 +1,47 @@
+from flask import Flask, render_template, request, jsonify
+from POS import POS
+from NER import NER_Tagger
+from Lemma import lemma
+
+pos_tagger = POS('SVM')
+ner_tagger = NER_Tagger()
+
+app = Flask(__name__)
+
+@app.route('/')
+def home():
+    return render_template('home.html')
+
+
+@app.route('/join1', methods=['GET', 'POST'])
+def lemmatization():
+    wordlist = []
+    text = request.form['text1']
+    for word in text.split():
+        temp = lemma(word.strip())
+        if len(temp) < 3:  # over-stripped; fall back to the original word
+            wordlist.append(word)
+        else:
+            wordlist.append(temp)
+    result = {'output': ' '.join(wordlist)}
+    return jsonify(result=result)
+
+
+@app.route('/join2', methods=['GET', 'POST'])
+def pos():
+    text = request.form['text1']
+    output = pos_tagger.predict(text)
+    result = {'output': output}
+    return jsonify(result=result)
+
+
+@app.route('/join', methods=['GET', 'POST'])
+def ner():
+    text = request.form['text1']
+    output = ner_tagger.classify(text)
+    result = {'output': output}
+    return jsonify(result=result)
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/demo_lemmatization.py b/demo_lemmatization.py
new file mode 100755
index 00000000..0684ca01
--- /dev/null
+++ b/demo_lemmatization.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun May 23 19:03:10 2021
+
+@author: ehab
+"""
+from nltk.metrics import accuracy
+
+import qalsadi.lemmatizer
+
+
+# run the qalsadi lemmatizer over the contents of a file
+def input_data(filename):
+    # open the file and read its contents
+    with open(filename) as file:
+        text = file.read()
+
+    lemmer = qalsadi.lemmatizer.Lemmatizer()
+    lemmas = lemmer.lemmatize_text(text)
+    return lemmas
+
+
+# calculate the accuracy against a gold file (one lemma per line)
+def test_acc(test_file, lemmas):
+    test = []
+    with open(test_file, 'r') as file:
+        test = [line.rstrip() for line in file.readlines()]
+
+    reference = lemmas
+
+    # nltk's accuracy() expects reference and test to be of equal length
+    a = accuracy(reference, test)
+    return a * 100
+
+
+######## ------ test ------ ########
+#filename = "/home/ehab/Desktop/data.txt"
+#test_file = "/home/ehab/Desktop/test_data.txt"
+#lemmas = input_data(filename)
+#print(lemmas)
+#print(test_acc(test_file, lemmas))
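Finally, a hedged client-side sketch for exercising the three endpoints, assuming app.py is serving on Flask's default development port:

import requests

BASE = "http://127.0.0.1:5000"  # Flask dev-server default
for endpoint in ("/join1", "/join2", "/join"):  # lemmatizer, POS, NER
    r = requests.post(BASE + endpoint, data={"text1": "ذهب محمد الى القاهرة"})
    print(endpoint, r.json()["result"]["output"])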