# wise/metadata.py at wise2 · ox-vgg/wise · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
"""

Table of Contents
  A. Command line interface (CLI) parser and handler
  B. Import metadata
  C. Helper functions

"""

import argparse
import sys
from pathlib import Path
import csv
import json
import sqlite3

from src.wise_project import WiseProject
from src import db
from src.metadata_type import MetadataType

from src.data_models import (
    MediaMetadata,
    SourceCollection,
    ExtraMediaMetadata,
    VectorMetadata,
    MediaType,
    SourceCollectionType,
)
from src.repository import (
    SourceCollectionRepo,
    MediaRepo,
    VectorRepo,
    MediaMetadataRepo,
)

# Prefix prepended to the names of the columns that WISE itself manages.
WISE_COLNAME_PREFIX = '__'

# Prefixed WISE column names, keyed by metadata type.
wise_colnames = {MetadataType.SEGMENT: []}
for colname in ('filename', 'metadata_id', 'starttime', 'stoptime'):
    wise_colnames[MetadataType.SEGMENT].append(f'{WISE_COLNAME_PREFIX}{colname}')

##
## A. Command line interface (CLI) parser and handler
##

def main():
    """Parse the command line and dispatch to the requested command.

    The only supported command is ``import``, which is handed to
    ``import_metadata``. The process exits with a non-zero status (through
    ``parser.error``) when the command is missing or when ``import`` is
    requested without the options that the import cannot run without.
    """
    parser = argparse.ArgumentParser(prog='metadata',
                                     description='Manage metadata associated with media contained in a WISE project',
                                     epilog='''
                                     Notes: Each column in the input CSV file can be referenced using column name (e.g. "filename").
                                     A column can be composed by combining two or more columns. For example,
                                     --col-filename "{participant_id}/videos/{video_id}.MP4" will construct filename using values
                                     taken from "participant_id" and "video_id" before matching it to one of the existing media
                                     files in the WISE project.''')

    parser.add_argument('command',
                        choices=['import'],
                        nargs='?',
                        help='various modes of operation supported by the metadata script')

    parser.add_argument('--from-csv',
                        required=False,
                        type=str,
                        help='a CSV filename, must have a following column header as the first line')

    parser.add_argument('--metadata-id',
                        required=False,
                        type=str,
                        help='a unique id of the form FOLDER_NAME/DB_NAME/TABLE_NAME (e.g. "EpicKitchens-100/retrieval_annotations/train"')

    parser.add_argument('--col-metadata-id',
                        required=False,
                        type=str,
                        help='column containing the unique id of each row of metadata (see notes)')

    parser.add_argument('--col-filename',
                        required=False,
                        type=str,
                        help='filename column that maps to existing media files in the WISE project (see notes)')

    parser.add_argument('--col-starttime',
                        required=False,
                        type=str,
                        help='column containing start time of a temporal segment (see notes)')

    parser.add_argument('--col-stoptime',
                        required=False,
                        type=str,
                        help='column containing stop time of the temporal segment (see notes)')

    parser.add_argument('--col-metadata',
                        required=False,
                        action='append',
                        type=str,
                        help='column(s) containing metadata of the temporal segment (see notes)')

    parser.add_argument('--project-dir',
                        required=True,
                        type=str,
                        help='folder where all project assets are stored')

    args = parser.parse_args()

    # ``command`` is optional (nargs='?'), so it is None when omitted; report
    # that as a usage error instead of printing a message and exiting with 0.
    if args.command != 'import':
        parser.error(f'unknown command {args.command}')

    # These options are declared optional above, but the import crashes
    # part-way through without them, so reject the invocation up front.
    needed_for_import = {
        '--from-csv': args.from_csv,
        '--metadata-id': args.metadata_id,
        '--col-metadata-id': args.col_metadata_id,
        '--col-filename': args.col_filename,
        '--col-starttime': args.col_starttime,
        '--col-stoptime': args.col_stoptime,
    }
    missing = [option for option, value in needed_for_import.items() if not value]
    if missing:
        parser.error('the import command requires: ' + ', '.join(missing))

    # action='append' leaves the attribute as None when the flag never
    # appears; downstream code iterates over it.
    if args.col_metadata is None:
        args.col_metadata = []

    import_metadata(args)

##
## B. Import metadata
##

def import_metadata(args):
    """Import segment metadata from a CSV file into a WISE project.

    Opens the existing project at ``args.project_dir``, resolves
    ``args.metadata_id`` to a (sqlite file, table name) pair, loads the rows
    from ``args.from_csv``, keeps the rows accepted by ``get_valid_metadata``
    and writes them with ``add_metadata``.

    Returns without writing anything when the metadata table already exists
    or when the CSV yields no rows. Exits with status 1 when the project
    assets cannot be loaded, when no CSV was given, or when the CSV file does
    not exist.
    """
    project = WiseProject(args.project_dir, create_project=False, db_kwargs={'echo': False})
    project_assets = project.discover_assets()
    if len(project_assets) == 0:
        print(f'failed to load assets from {args.project_dir}')
        sys.exit(1)
    db_engine = project.db_engine

    metadata_db, metadata_table = project.metadata_db_table(args.metadata_id)
    if metadata_exist(metadata_db, metadata_table):
        print(f'metadata "{args.metadata_id}" already exists in file {metadata_db}')
        return

    # CSV is the only supported source; without it there is nothing to import
    # (previously this fell through to a NameError on ``metadata``).
    if not args.from_csv:
        print('no metadata source given, use --from-csv')
        sys.exit(1)

    csv_filename = Path(args.from_csv)
    if not csv_filename.exists():
        # Stop here instead of continuing into open(), which would raise.
        print(f'csv does not exist: {csv_filename}')
        sys.exit(1)

    # Named ``required_colnames`` so the module-level ``wise_colnames`` table
    # is not shadowed by a local of the same name.
    metadata, required_colnames, metadata_colnames = load_metadata_from_csv(args.from_csv, args)

    metadata_count = len(metadata)
    if metadata_count == 0:
        print(f'metadata not found')
        return

    valid_metadata = get_valid_metadata(metadata, db_engine)

    metadata_type = MetadataType.SEGMENT
    add_metadata(metadata_db,
                 metadata_table,
                 valid_metadata,
                 metadata_type,
                 required_colnames,
                 metadata_colnames)

def load_metadata_from_csv(csv_filename, args):
    """Read segment metadata rows from a CSV file that has a header row.

    For every CSV row the filename, metadata id, start time and stop time are
    resolved through ``get_csv_row_col_value`` using the ``args.col_*``
    settings, the times are converted with ``time2sec``, and the columns
    named in ``args.col_metadata`` are copied over verbatim. Rows that cannot
    be parsed are reported and skipped.

    Args:
        csv_filename: path of the CSV file to read.
        args: parsed command line arguments providing ``col_metadata_id``,
            ``col_filename``, ``col_starttime``, ``col_stoptime`` and
            ``col_metadata``.

    Returns:
        A tuple ``(metadata_store, wise_colnames, metadata_colnames)``: the
        list of row dicts, the module-level table of WISE column names, and
        the list of user metadata column names.

    Exits with status 1 if the CSV sniffer finds no header row.
    """
    print(f'Loading metadata from CSV file ...')
    metadata_store = []
    # ``col_metadata`` is None when --col-metadata was never given.
    metadata_colnames = list(args.col_metadata or [])
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to a reader.
    with open(csv_filename, 'r', newline='') as csv_file:
        sample = csv_file.read(2048)
        sniffer = csv.Sniffer()
        if not sniffer.has_header(sample):
            print(f'csv file must have a header row')
            sys.exit(1)
        dialect = sniffer.sniff(sample)
        csv_file.seek(0)
        reader = csv.DictReader(csv_file, dialect=dialect)

        for row in reader:
            try:
                metadata_id = get_csv_row_col_value(row, args.col_metadata_id)
                filename = get_csv_row_col_value(row, args.col_filename)
                starttime = time2sec(get_csv_row_col_value(row, args.col_starttime))
                stoptime = time2sec(get_csv_row_col_value(row, args.col_stoptime))
                # time2sec may hand back None for values it cannot convert;
                # such a row has no usable segment.
                if starttime is None or stoptime is None:
                    raise ValueError('start or stop time could not be parsed')
                metadata = {
                    WISE_COLNAME_PREFIX + 'filename': filename,
                    WISE_COLNAME_PREFIX + 'metadata_id': metadata_id,
                    WISE_COLNAME_PREFIX + 'starttime': starttime,
                    WISE_COLNAME_PREFIX + 'stoptime': stoptime,
                }
                for col_id in metadata_colnames:
                    metadata[col_id] = row[col_id]
            except (KeyError, IndexError, ValueError, TypeError, AssertionError) as ex:
                # Missing columns and malformed time values land here; skip
                # the row and keep going with the rest of the file.
                print(f'Error parsing row: {row} ({ex})')
                continue
            metadata_store.append(metadata)
    return metadata_store, wise_colnames, metadata_colnames

def get_valid_metadata(metadata, db_engine):
    """Return the metadata rows that refer to known media and a sane segment.

    Each row's ``__filename`` is matched against the ``path`` column through
    ``MediaRepo.get_row_by_column_match``. A row is kept when the media file
    is found and its ``__starttime``/``__stoptime`` describe a segment inside
    the media duration. A summary of the discarded rows is printed.

    Args:
        metadata: list of row dicts carrying the ``__filename``,
            ``__starttime`` and ``__stoptime`` keys.
        db_engine: engine whose ``connect()`` yields the connection used for
            the media lookups.

    Returns:
        The list of rows that passed both checks, in input order.
    """
    filename_key = WISE_COLNAME_PREFIX + 'filename'
    starttime_key = WISE_COLNAME_PREFIX + 'starttime'
    stoptime_key = WISE_COLNAME_PREFIX + 'stoptime'

    mismatched_timestamp_count = 0
    missing_filename_list = []
    mismatch_filename_count = 0
    valid_metadata = []
    # Many rows usually point at the same media file, so each filename is
    # looked up once and the result reused.
    media_by_filename = {}
    with db_engine.connect() as conn:
        for row in metadata:
            filename = row[filename_key]
            if filename not in media_by_filename:
                media_by_filename[filename] = MediaRepo.get_row_by_column_match(conn,
                                                                                column_name_to_match='path',
                                                                                column_value=filename)
                if not media_by_filename[filename]:
                    missing_filename_list.append(filename)
            media_metadata = media_by_filename[filename]
            if not media_metadata:
                mismatch_filename_count += 1
                continue

            # presumably the duration is in seconds, like the parsed times — verify
            duration = float(media_metadata.duration)
            starttime = row[starttime_key]
            stoptime = row[stoptime_key]
            # The original condition mixed ``or``/``and`` so that only a
            # negative start time was ever rejected. Every bound is now
            # checked on its own, and a stop time before the start time is
            # rejected as well.
            malformed = (
                starttime is None
                or stoptime is None
                or starttime < 0
                or stoptime < starttime
                or starttime >= duration
                or stoptime >= duration
            )
            if malformed:
                mismatched_timestamp_count += 1
                print(f'Discarding malformed media segment: {row}')
            else:
                valid_metadata.append(row)

    print(f'Adding {len(valid_metadata)} rows of metadata (discarded {len(metadata) - len(valid_metadata)} rows)')
    if mismatch_filename_count:
        print(f'  - there were {mismatch_filename_count} rows in the input metadata file whose filename were not found in WISE project')
        print(f'  - missing filenames: {missing_filename_list}')
    if mismatched_timestamp_count:
        print(f'  - discarding {mismatched_timestamp_count} row that contained malformed media segment')

    return valid_metadata

def _quote_sqlite_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'

def add_metadata(metadata_db, metadata_table, metadata, metadata_type, wise_colnames, metadata_colnames):
    """Create ``metadata_table`` in the sqlite file ``metadata_db`` and fill it.

    The table gets one column per entry of ``wise_colnames[metadata_type]``
    (``__starttime`` and ``__stoptime`` as NUMERIC, the rest as TEXT) followed
    by one TEXT column per entry of ``metadata_colnames``. All rows are
    inserted in a single transaction that is rolled back on failure.

    Args:
        metadata_db: path of the sqlite database file.
        metadata_table: name of the table to create.
        metadata: list of row dicts keyed by column name.
        metadata_type: key selecting the required columns in ``wise_colnames``.
        wise_colnames: mapping from metadata type to the required column names.
        metadata_colnames: names of the user supplied metadata columns.

    Returns:
        None. If any row lacks a required WISE column a message is printed
        and nothing is written.
    """
    print(f'Adding {len(metadata)} rows of metadata to following sqlite table: {metadata_table} ...')
    # check that all the required WISE columns are contained in the metadata
    required_colnames = wise_colnames[metadata_type]
    for row in metadata:
        for wise_colname in required_colnames:
            if wise_colname not in row:
                print(f'Invalid metadata, missing {wise_colname}. All entried must contain the following fields:')
                print(f'{wise_colnames[metadata_type]}')
                return

    sql_col_specs = []
    metadata_table_colname = []
    for colname in required_colnames:
        sql_type = 'NUMERIC' if colname in ('__starttime', '__stoptime') else 'TEXT'
        sql_col_specs.append(f'{_quote_sqlite_identifier(colname)} {sql_type}')
        metadata_table_colname.append(colname)
    for colname in metadata_colnames:
        # FIXME: the user supplied metadata can be any type (e.g. year stored as number)
        sql_col_specs.append(f'{_quote_sqlite_identifier(colname)} TEXT')
        metadata_table_colname.append(colname)

    # Identifiers come from the command line / CSV header and may contain
    # spaces or punctuation, so they are quoted rather than pasted in raw.
    table_sql = _quote_sqlite_identifier(metadata_table)
    sql_col_specs_str = ', '.join(sql_col_specs)
    metadata_table_colname_csv = ','.join(_quote_sqlite_identifier(c) for c in metadata_table_colname)
    value_placeholders = ','.join(['?'] * len(metadata_table_colname))
    insert_sql = f'INSERT INTO {table_sql}({metadata_table_colname_csv}) VALUES ({value_placeholders})'
    sql_data = [tuple(row[colname] for colname in metadata_table_colname) for row in metadata]

    sqlite_connection = sqlite3.connect(str(metadata_db))
    try:
        # The connection context manager commits on success and rolls back
        # on error; it does not close the connection, hence the finally.
        with sqlite_connection:
            cursor = sqlite_connection.cursor()
            ## 0. debug
            # NOTE(review): labelled "debug" in the original; this discards an
            # existing table of the same name — confirm it should stay.
            cursor.execute('BEGIN TRANSACTION')
            cursor.execute(f'DROP TABLE IF EXISTS {table_sql}')

            ## 1. create metadata table
            cursor.execute(f'CREATE TABLE {table_sql} ( {sql_col_specs_str} )')

            ## 2. Insert data in bulk
            cursor.executemany(insert_sql, sql_data)

            ## Note: Index for full text search is created using create-index.py script
    finally:
        sqlite_connection.close()

##
## Helper functions
##
def get_csv_row_col_value(row, col_id):
    """Resolve ``col_id`` against a CSV row.

    When ``col_id`` contains both ``{`` and ``}`` it is treated as a format
    template filled from the row's columns; otherwise it is a plain column
    name and that column's value is returned.
    """
    is_template = '{' in col_id and '}' in col_id
    if not is_template:
        return row[col_id]
    return col_id.format(**row)

def time2sec(time):
    """Convert a time value to seconds as a float.

    Args:
        time: an int or float (returned as float), or a string. A string
            containing ``:`` is handed to ``hhmmss_to_sec``; any other string
            is parsed with ``float``.

    Returns:
        The time in seconds, or None when the value is a string that is not a
        number (the error is printed) or is of an unsupported type.
    """
    if isinstance(time, (int, float)):
        return float(time)
    if isinstance(time, str):
        if ':' in time:
            return hhmmss_to_sec(time)
        try:
            return float(time)
        except ValueError as ex:
            # The original ``except ex:`` referenced an undefined name and
            # raised NameError instead of reporting the bad value.
            print(ex)
    return None

def hhmmss_to_sec(hhmmss):
    """Convert an ``HH:MM:SS`` or ``HH:MM:SS.fraction`` string to seconds.

    The seconds field is parsed as a decimal number, so the fraction may have
    any number of digits (``01.5`` is 1.5 s) and may be absent altogether.

    Args:
        hhmmss: time string with exactly three ``:``-separated fields.

    Returns:
        The time in seconds as a float.

    Raises:
        ValueError: if the string does not have three fields or a field is
            not numeric.
    """
    tok = hhmmss.split(':')
    if len(tok) != 3:
        raise ValueError(f'expected HH:MM:SS[.fraction], got {hhmmss!r}')
    hh = int(tok[0])
    mm = int(tok[1])
    # Parsing the whole field avoids scaling the fraction by a fixed 1/100,
    # which was only right for exactly two fractional digits.
    ss = float(tok[2])
    return float(hh*60*60 + mm*60 + ss)

def metadata_exist(metadata_db, metadata_table):
    """Tell whether ``metadata_table`` exists in the sqlite file ``metadata_db``.

    Args:
        metadata_db: path of the sqlite database file.
        metadata_table: table name to look for in ``sqlite_master``.

    Returns:
        True when the file exists and contains a table with that name,
        otherwise False. A missing file is not created.
    """
    if not Path(metadata_db).exists():
        return False
    sqlite_connection = sqlite3.connect(str(metadata_db))
    try:
        # execute() returns a cursor, so the count has to be fetched before
        # it can be compared; the table name is bound as a parameter.
        cursor = sqlite_connection.execute(
            "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = ?",
            (metadata_table,))
        (table_count,) = cursor.fetchone()
    finally:
        sqlite_connection.close()
    return table_count == 1

# Run the command line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()