-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdeploy.py
More file actions
141 lines (110 loc) · 4.1 KB
/
deploy.py
File metadata and controls
141 lines (110 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
import pandas as pd
import pickle
import numpy as np
import os, urllib
from PIL import Image
from keras.applications.resnet50 import ResNet50
from keras.optimizers import Adam
from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate
from keras.models import Sequential, Model
from keras.models import load_model
import random
from keras.preprocessing import image, sequence
import matplotlib.pyplot as plt
# import keras.backend.tensorflow_backend as tb
# tb._SYMBOLIC_SCOPE.value = True
# Model hyperparameters — these must match the checkpoint 'final-imcap.h5'
# and the pickled vocabulary tables loaded below exactly.
embedding_size=128  # dimensionality of the word / image embeddings
max_len=40  # maximum caption length in tokens; also the padding length
vocab_size=8254  # number of distinct tokens in the training vocabulary
@st.cache_resource
def resnet():
    """Load and cache the ImageNet-pretrained ResNet50 feature extractor.

    Returns a headless ResNet50 (``include_top=False``) with global average
    pooling, so ``predict`` on a ``(1, 224, 224, 3)`` batch yields a single
    2048-dimensional feature vector per image.

    Cached with ``st.cache_resource``: a Keras model is an unserializable
    global resource. The original used ``st.cache_data``, which is meant for
    serializable data and attempts to pickle/copy the return value.
    """
    # pooling='avg' collapses the spatial feature map to the 2048-d vector
    # that im_model() was trained on.
    res = ResNet50(include_top=False, weights='imagenet',
                   input_shape=(224, 224, 3), pooling='avg')
    return res
@st.cache_resource
def im_model():
    """Build the image-captioning network and load its trained weights.

    Architecture: a dense image branch (2048-d ResNet feature -> embedding,
    repeated across ``max_len`` time steps) concatenated with an LSTM
    language branch, followed by two stacked LSTMs and a softmax over the
    vocabulary. Weights are restored from 'final-imcap.h5'.
    """
    # Image branch: project the ResNet feature vector, then tile it so it
    # can be concatenated with the per-timestep language features.
    img_branch = Sequential()
    img_branch.add(Dense(embedding_size, input_shape=(2048,), activation='relu'))
    img_branch.add(RepeatVector(max_len))

    # Language branch: embed token ids and contextualize them with an LSTM.
    txt_branch = Sequential()
    txt_branch.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len))
    txt_branch.add(LSTM(256, return_sequences=True))
    txt_branch.add(TimeDistributed(Dense(embedding_size)))

    # Fuse both branches and decode to a distribution over the vocabulary.
    merged = Concatenate()([img_branch.output, txt_branch.output])
    h = LSTM(128, return_sequences=True)(merged)
    h = LSTM(512, return_sequences=False)(h)
    logits = Dense(vocab_size)(h)
    probs = Activation('softmax')(logits)

    captioner = Model([img_branch.input, txt_branch.input], probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    captioner.load_weights('final-imcap.h5')
    return captioner
# Full-page background image, injected as raw CSS via st.markdown.
# NOTE(review): styling the bare <body> tag stopped taking effect on newer
# Streamlit releases (the app root became `.stApp`) — confirm against the
# Streamlit version actually deployed.
page_bg_img = '''
<style>
body {
background-image: url("https://ak.picdn.net/shutterstock/videos/1016880070/thumb/1.jpg");
background-size: cover;
}
</style>
'''
st.markdown(page_bg_img, unsafe_allow_html=True)
@st.cache_data
def preprocessing(img_path):
    """Load an image and turn it into a ResNet50-ready batch.

    Parameters
    ----------
    img_path : str or file-like
        Anything accepted by ``keras.preprocessing.image.load_img``
        (a filesystem path or an uploaded file object).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, 224, 224, 3)`` — one image with a leading
        batch dimension, ready for ``ResNet50.predict``.
    """
    # target_size is (height, width). The original passed (224, 224, 3);
    # Keras silently ignores the stray third element, so this fix is
    # behavior-identical but no longer misleading.
    im = image.load_img(img_path, target_size=(224, 224))
    im = image.img_to_array(im)
    im = np.expand_dims(im, axis=0)  # add the batch dimension
    return im
def get_encoding(model, img):
    """Return the 2048-dim ResNet50 feature vector for one image.

    Parameters
    ----------
    model : keras Model
        The cached feature extractor from ``resnet()``.
    img : str or file-like
        Forwarded unchanged to ``preprocessing``.

    Returns
    -------
    numpy.ndarray of shape ``(2048,)``.
    """
    # Renamed from `image`: the original local shadowed the imported
    # `keras.preprocessing.image` module inside this function.
    batch = preprocessing(img)
    pred = model.predict(batch).reshape(2048)  # (1, 2048) -> (2048,)
    return pred
# Styled page title, rendered as raw HTML.
tit='''
<div style="color:black;
background-color:white;
font-size:200%;
font-weight: bold;
font-style: italic;
display:inline-block;
padding:5px;
border-radius: 15px"
>Image Captioning</div>
'''
t=st.markdown(tit, unsafe_allow_html=True)
# Vocabulary lookup tables produced at training time:
#   word_2_indices: token string -> integer id
#   indices_2_word: integer id  -> token string
# NOTE(review): pickle.load on these bundled files is fine, but pickle must
# never be used on untrusted/uploaded data.
with open('w2i (2).p', 'rb') as f:
    word_2_indices= pickle.load(f, encoding="bytes")
with open('i2w (2).p', 'rb') as f:
    indices_2_word= pickle.load(f, encoding="bytes")
@st.cache_data
def predict_captions(image):
    """Greedily decode a caption for one encoded image.

    Parameters
    ----------
    image : numpy.ndarray of shape ``(2048,)``
        ResNet50 feature vector from ``get_encoding``.

    Returns
    -------
    str
        The predicted caption with the "<start>" marker and the final token
        (either "<end>" or the last word on length cutoff) stripped.
    """
    # Fetch the cached captioning model once; the original called im_model()
    # inside the decoding loop on every step.
    model = im_model()
    start_word = ["<start>"]
    while True:
        # Convert the partial caption to padded token ids of length max_len.
        par_caps = [word_2_indices[i] for i in start_word]
        par_caps = sequence.pad_sequences([par_caps], maxlen=max_len, padding='post')
        preds = model.predict([np.array([image]), np.array(par_caps)])
        # Greedy decoding: always take the single most probable next token.
        word_pred = indices_2_word[np.argmax(preds[0])]
        start_word.append(word_pred)
        if word_pred == "<end>" or len(start_word) > max_len:
            break
    return ' '.join(start_word[1:-1])
# --- Main app flow: upload -> encode -> caption -> display ---
uploaded_file=st.file_uploader('Upload the image',type=['jpg','png'])
# Suppress the old file-uploader encoding deprecation warning.
# NOTE(review): this option was removed in later Streamlit releases and
# raises there — confirm the pinned Streamlit version before upgrading.
st.set_option('deprecation.showfileUploaderEncoding', False)
if uploaded_file is not None:
    img = Image.open(uploaded_file)
    st.image(img)  # echo the uploaded image back to the user
    s=st.success('Generating Caption')  # temporary status banner
    # uploaded_file is a file-like object; presumably load_img accepts it as
    # well as a path — confirm with the keras version in use.
    test_img = get_encoding(resnet(), uploaded_file)
    Argmax_Search = predict_captions(test_img)
    s.empty()  # clear the status banner once the caption is ready
    # Render the caption in a styled HTML pill.
    st.markdown(
        f'''<html>
        <p style="color:white;
        background-color:black;
        font-size:140%;
        display:inline-block;
        padding:10px;
        border-radius: 15px;"
        >{Argmax_Search}</p>
        </html> ''',
        unsafe_allow_html=True)