78 changes: 78 additions & 0 deletions Lemma.py
@@ -0,0 +1,78 @@
import re

# Honorific signs (U+0610–U+0614), Koranic annotation signs (U+0615–U+061A and
# U+06D6–U+06ED), tatweel (U+0640), and tashkeel (U+064B–U+065F, plus U+0670
# ARABIC LETTER SUPERSCRIPT ALEF).
ARABIC_MARKS = re.compile(r'[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06ED]')
PUNCTUATION = '"()؟!.'

def lemma(word):
    # Strip punctuation.
    for ch in PUNCTUATION:
        word = word.replace(ch, '')
    # Remove honorific signs, Koranic annotation, tatweel, and tashkeel. The
    # original used one anchored pattern per codepoint, e.g. r"^[\u0610]+$",
    # which only matches when the whole word consists of that mark; an
    # unanchored character class removes the marks wherever they occur.
    word = ARABIC_MARKS.sub('', word)
    # Normalize letter variants.
    word = word.replace("ى", "ي")  # alef maksura -> yeh
    word = word.replace("ؤ", "ء")  # waw with hamza -> hamza
    word = word.replace("ئ", "ء")  # yeh with hamza -> hamza
    word = word.replace("ة", "ه")  # teh marbuta -> heh
    word = word.replace("گ", "ك")  # gaf -> kaf
    # Strip the definite article. The original replace('ال', '') removed the
    # sequence anywhere in the word, which also mangles stems that happen to
    # contain it; restricting it to a prefix is the safer reading of the intent.
    word = re.sub(r'^ال', '', word)
    return word
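A minimal usage sketch of lemma(); the example words are illustrative, and the expected outputs follow from the substitutions above rather than from a test suite:

from Lemma import lemma

print(lemma('الكتابُ'))  # -> 'كتاب'  (damma stripped, definite article removed)
print(lemma('مدرسة'))    # -> 'مدرسه' (teh marbuta normalized to heh)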
9 changes: 9 additions & 0 deletions NER.py
@@ -0,0 +1,9 @@
import spacy

class NER_Tagger:
    def __init__(self):
        # Load the locally saved spaCy pipeline from NER/SPACY/.
        self.model = spacy.load('NER/SPACY/')

    def classify(self, text):
        # Run the pipeline and return (entity text, label) pairs.
        doc = self.model(text)
        return [(str(ent), ent.label_) for ent in doc.ents]
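A usage sketch, assuming a trained spaCy pipeline is saved under NER/SPACY/; the entity labels in the comment are hypothetical and depend on how that model was trained:

from NER import NER_Tagger

tagger = NER_Tagger()
print(tagger.classify('يعيش محمد في القاهرة'))
# e.g. [('محمد', 'PERS'), ('القاهرة', 'LOC')]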
15 changes: 15 additions & 0 deletions POS.py
@@ -0,0 +1,15 @@
from POS_SVM import POS_SVM
from POS_BILSTM import POS_BILSTM

class POS:
    def __init__(self, model='SVM'):
        if model == 'SVM':
            self.model = POS_SVM()
        elif model == 'BILSTM':
            self.model = POS_BILSTM()
        else:
            # __init__ cannot return a value (the original returned an error
            # string here, which raises TypeError at instantiation), so raise.
            raise ValueError("Incorrect model name: should be either 'SVM' or 'BILSTM'")
        self.predict_func = self.model.classify

    def predict(self, sent):
        return self.predict_func(sent)
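A usage sketch for the wrapper; the tags in the comment are placeholders, since the real tag set comes from the trained models:

from POS import POS

tagger = POS(model='SVM')  # or POS(model='BILSTM')
print(tagger.predict('ذهب الولد الي المدرسة'))
# e.g. [('ذهب', 'V'), ('الولد', 'N'), ('الي', 'P'), ('المدرسة', 'N')]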
87 changes: 87 additions & 0 deletions POS_BILSTM.py
@@ -0,0 +1,87 @@
import numpy as np
import pickle
import re
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import InputLayer, Embedding, Bidirectional, LSTM, TimeDistributed, Dense

class POS_BILSTM:
    def __init__(self):
        self.word_tokenizer, self.tag_tokenizer = self.__get_tokenizers()
        self.VOCABULARY_SIZE = len(self.word_tokenizer.word_index) + 1
        self.NUM_CLASSES = 35
        self.__embedding_dim = 300
        self.__MAX_SEQUENCE_LENGTH = 398
        self.__trunc_type = 'post'
        self.__padding_type = 'post'
        self.__oov_tok = "<OOV>"  # the saved word tokenizer was built with this OOV token
        self.__model = self.__define_model()

    def __get_tokenizers(self):
        # Load the Keras tokenizers fitted during training.
        with open('POS/BILSTM/word_tokenizer.pickle', 'rb') as handle:
            word_tokenizer = pickle.load(handle)
        with open('POS/BILSTM/tag_tokenizer.pickle', 'rb') as handle:
            tag_tokenizer = pickle.load(handle)
        return word_tokenizer, tag_tokenizer

    def __define_model(self):
        # Rebuild the training architecture, then restore the saved weights.
        # Note the input shape must be a tuple; the original passed a bare int.
        model = Sequential()
        model.add(InputLayer(input_shape=(self.__MAX_SEQUENCE_LENGTH,)))
        model.add(Embedding(input_dim=self.VOCABULARY_SIZE,
                            output_dim=self.__embedding_dim,
                            input_length=self.__MAX_SEQUENCE_LENGTH,
                            trainable=True))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(TimeDistributed(Dense(self.NUM_CLASSES, activation='softmax')))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.load_weights('POS/BILSTM/Bi_LSTM_checkpoint.h5')
        return model

    def clean_str(self, text):
        # Remove tashkeel (diacritics).
        p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
        text = re.sub(p_tashkeel, "", text)

        # Collapse character elongation: any run of a repeated character is
        # reduced to two, then specific doubled letters are reduced to one.
        p_longation = re.compile(r'(.)\1+')
        text = re.sub(p_longation, r"\1\1", text)

        text = text.replace('وو', 'و')
        text = text.replace('يي', 'ي')
        text = text.replace('اا', 'ا')
        # Normalize alef and alef-maksura variants.
        text = text.replace('أ', 'ا')
        text = text.replace('إ', 'ا')
        text = text.replace('آ', 'ا')
        text = text.replace('ى', 'ي')
        return text.split()

    def classify(self, sentence):
        sent = sentence
        tokens = self.clean_str(sentence)
        # texts_to_sequences maps each token to a one-element id list, so the
        # padded batch carries a trailing axis of size 1 that is squeezed away.
        seq = [self.word_tokenizer.texts_to_sequences(tokens)]
        pad_seq = pad_sequences(seq, maxlen=self.__MAX_SEQUENCE_LENGTH,
                                padding=self.__padding_type, truncating=self.__trunc_type)
        pad_seq = np.squeeze(pad_seq, axis=-1)
        pred = np.squeeze(self.__model.predict(pad_seq).argmax(-1))
        # Index 0 is padding; keep only the predictions for real positions.
        output = [self.tag_tokenizer.index_word[tag] for tag in pred if tag != 0]
        # zip stops at the shorter sequence, avoiding the IndexError the
        # original risked if the model emitted a padding tag for a real token.
        return list(zip(sent.split(), output))
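For reference, a sketch of the tensor shapes through classify() for a hypothetical 3-token sentence (the ids are made up):

# tokens                                     -> ['w1', 'w2', 'w3']
# word_tokenizer.texts_to_sequences(tokens)  -> [[12], [7], [3]]  (one id list per token)
# [ ... ] wraps this into a batch of one     -> [[[12], [7], [3]]]
# pad_sequences(..., maxlen=398)             -> shape (1, 398, 1)
# np.squeeze(..., axis=-1)                   -> shape (1, 398)
# model.predict(...).argmax(-1), squeezed    -> shape (398,), one tag id per position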
81 changes: 81 additions & 0 deletions POS_SVM.py
@@ -0,0 +1,81 @@
import pickle
import gensim
import numpy as np

class POS_SVM:
    def __init__(self):
        self.load_files()

    def load_files(self):
        # config.pkl stores the context window size, the tag set, the
        # ambiguity dictionary, and the paths of the embedding model and
        # label encoder.
        with open("POS/SVM/config.pkl", "rb") as tf:
            self.config = pickle.load(tf)
        self.WINDOW = int(self.config['Window'])
        self.embedding_model = gensim.models.Word2Vec.load('POS/SVM/' + self.config['embedding_model_path'])
        self.tags_list = self.config['tags_list']
        self.ambiguities = self.config['ambiguities']
        with open('POS/SVM/' + self.config['encoder'], 'rb') as f:
            self.encoder = pickle.load(f)
        with open('POS/SVM/svm_pos_tag.pickle', 'rb') as f:
            self.model = pickle.load(f)

    def OneHotEncoder(self, indices, length):
        # indices may be a single index or a list of indices (numpy fancy
        # indexing), which allows multi-hot encoding of ambiguous tags.
        vec = np.zeros(length)
        vec[indices] = 1
        return vec

    def getFeatures(self, wordIdx, sentence, pos, w2v, tags, ambiguities, train=True):
        features = []
        half = self.WINDOW // 2

        # Word embeddings for the left context, the word itself, and the
        # right context; out-of-vocabulary words get a zero vector.
        for i in range(-half, half + 1):
            word = sentence[wordIdx + i]
            if word in w2v.wv.key_to_index:
                features.append(w2v.wv.get_vector(word, norm=True))
            else:
                features.append(np.zeros(w2v.vector_size))

        # One-hot tags already assigned to the left context. The original's
        # train and inference branches built identical features here, so the
        # if/else is collapsed.
        for i in reversed(range(1, half + 1)):
            tag = pos[wordIdx - i]
            features.append(self.OneHotEncoder(tags.index(tag), len(tags)))

        # Multi-hot encoding of the word's possible tags, if it is ambiguous.
        if sentence[wordIdx] in ambiguities:
            features.append(self.OneHotEncoder([tags.index(t) for t in ambiguities[sentence[wordIdx]]], len(tags)))
        else:
            features.append(self.OneHotEncoder([], len(tags)))

        # Word length as an extra scalar feature.
        features.append([len(sentence[wordIdx])])

        flat_list = []
        for f in features:
            flat_list.extend(f)
        return flat_list

    def classify(self, sent):
        half = self.WINDOW // 2
        # Pad the sentence with empty words and seed the left context with
        # PAD tags so the first real word has a full window.
        words = [""] * half + sent.split() + [""] * half
        pos = ["PAD"] * half
        for i in range(half, len(words) - half):
            feature = np.array(self.getFeatures(i, words, pos, self.embedding_model,
                                                self.tags_list, self.ambiguities,
                                                train=False)).reshape(1, -1)
            tag = self.model.predict(feature)
            pos.append(self.encoder.inverse_transform(tag)[0])
        # Drop the seed PAD tags; the original hardcoded pos[2:], which is
        # only correct when WINDOW == 5.
        output = pos[half:]
        return list(zip(sent.split(), output))
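For orientation, the per-word feature vector laid out by getFeatures(), assuming WINDOW == 5 and an embedding size of 300 (both actually come from config.pkl):

# 2 left-context + 1 current + 2 right-context embeddings: 5 * 300 = 1500
# one-hot tags of the 2 preceding words:                    2 * len(tags)
# multi-hot vector of the current word's possible tags:     1 * len(tags)
# word length (scalar):                                     1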
49 changes: 49 additions & 0 deletions app.py
@@ -0,0 +1,49 @@
from flask import Flask, render_template, request, jsonify
from POS import POS
from NER import NER_Tagger
from Lemma import lemma

pos_tagger = POS('SVM')
ner_tagger = NER_Tagger()

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('home.html')


@app.route('/join1', methods=['GET', 'POST'])
def lemmatization():
    wordlist = []
    text = request.form['text1']
    for word in text.split():
        temp = lemma(word.strip())
        # Fall back to the original word when the lemma collapses to fewer
        # than three characters (usually an over-stripped token).
        if len(temp) < 3:
            wordlist.append(word)
        else:
            wordlist.append(temp)
    # The original re-mapped keys through str(), a no-op for string keys.
    return jsonify(result={'output': ' '.join(wordlist)})


@app.route('/join2', methods=['GET', 'POST'])
def pos():
    text = request.form['text1']
    output = pos_tagger.predict(text)
    return jsonify(result={'output': output})


@app.route('/join', methods=['GET', 'POST'])
def ner():
    text = request.form['text1']
    output = ner_tagger.classify(text)
    return jsonify(result={'output': output})


if __name__ == '__main__':
    app.run(debug=True)
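A smoke test against a locally running instance, using the requests library; the host and port assume Flask's defaults under app.run(debug=True):

import requests

r = requests.post('http://127.0.0.1:5000/join2', data={'text1': 'ذهب الولد الي المدرسة'})
print(r.json())  # {'result': {'output': [[word, tag], ...]}}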
42 changes: 42 additions & 0 deletions demo_lemmatization.py
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 19:03:10 2021

@author: ehab
"""
from nltk.metrics import accuracy

import qalsadi.lemmatizer


# Lemmatize the contents of a file with the qalsadi lemmatizer.
def input_data(filename):
    with open(filename) as file:
        text = file.read()

    lemmer = qalsadi.lemmatizer.Lemmatizer()
    lemmas = lemmer.lemmatize_text(text)
    return lemmas


# Compare the produced lemmas against a gold file (one lemma per line)
# and return the accuracy as a percentage.
def test_acc(test_file, lemmas):
    with open(test_file, 'r') as file:
        test = [current_place.rstrip() for current_place in file.readlines()]

    reference = lemmas
    a = accuracy(reference, test)
    return a * 100


########------test--------------##
#filename = "/home/ehab/Desktop/data.txt"
#test_file = "/home/ehab/Desktop/test_data.txt"
#lemmas = input_data(filename)
#print(lemmas)
#print(test_acc(test_file, lemmas))
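One caveat worth noting: nltk.metrics.accuracy compares the two sequences element-wise and raises ValueError if their lengths differ, so the lemmatizer output and the gold file must be aligned one-to-one. A tiny illustration with made-up lemmas:

from nltk.metrics import accuracy

print(accuracy(['كتاب', 'ذهب'], ['كتاب', 'هب']))  # 0.5 — one of two positions matches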