# clean_captions.py -- from the udaram/Image-Caption-Generator repository (master branch).
import pandas as pd
import numpy as np
import string
import pickle
from pickle import load, dump
import os

# Load the pipe-delimited caption table from disk.
file = pd.read_csv('./Dataset/results.csv', sep='|')

# Replace the header read from the file with fixed column names, then
# convert the table to an array so it can be walked row by row.
file.columns = ['image_name', 'comment_number', 'comment']
file = np.array(file)

#caption <--> ID mapping
def caption_IDMapping(file):
  """Group captions by the identifier stored in each row.

  Args:
    file: iterable of rows that support positional indexing. Element 0 of
      a row is used as the dictionary key and element 2 as the caption.

  Returns:
    A dict mapping each distinct key to the list of its captions, in the
    order the rows were encountered.
  """
  grouped = {}
  for record in file:
    image_id, caption = record[0], record[2]
    # setdefault creates the list the first time a key is seen.
    grouped.setdefault(image_id, []).append(caption)
  return grouped

# Group the captions by image name.
captions = caption_IDMapping(file)

# Keep only the captions whose image is actually present in the dataset folder.
images = os.listdir("Dataset/flickr30k_images")
# A set gives constant-time membership tests; checking against the raw
# directory listing would rescan the whole list for every caption key.
available_images = set(images)
# Snapshot the keys first: entries are deleted from `captions` inside the loop.
keys = list(captions)
for key in keys:
  if key not in available_images:
    del captions[key]


def clean_captions(captions):
  """Normalise every caption in place and wrap it in sequence markers.

  Each caption is split on whitespace, lower-cased and stripped of ASCII
  punctuation. Tokens that are then at most one character long, or that
  contain anything other than letters, are discarded. The surviving words
  are joined with single spaces between 'startseq ' and ' endseq'.

  Entries that are not strings (for example the NaN that pandas produces
  for a row with a missing caption field) carry no text to clean and are
  removed from the list instead of raising AttributeError.

  Args:
    captions: dict mapping a key to a list of caption strings. The lists
      are modified in place.

  Returns:
    None.
  """
  # Translation table that deletes every ASCII punctuation character.
  table = str.maketrans('', '', string.punctuation)
  for caption_list in captions.values():
    cleaned = []
    for caption in caption_list:
      if not isinstance(caption, str):
        # Missing value: nothing to clean, so drop it.
        continue
      # Lower-case each token, then remove its punctuation.
      tokens = (word.lower().translate(table) for word in caption.split())
      # Drop empty/one-character tokens and tokens containing non-letters.
      words = [word for word in tokens if len(word) > 1 and word.isalpha()]
      cleaned.append('startseq ' + ' '.join(words) + ' endseq')
    # Slice assignment keeps the same list object, so the update is in place.
    caption_list[:] = cleaned

# Normalise all captions in place before saving them.
clean_captions(captions)

# Make sure the output folder exists; otherwise open() raises
# FileNotFoundError when the script runs in a fresh checkout.
os.makedirs("Preprocessed Data", exist_ok=True)
with open("Preprocessed Data/clean_captions.pkl", "wb") as encoded_pickle:
    pickle.dump(captions, encoded_pickle)

print("Preprocessed Captions are succesfully stored as **clean_captions.pkl** in folder Preprocessed Data")