Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select a commit. Hold Shift + click to select a range.
91392f3
Created using Colaboratory
sphynxlee Mar 27, 2024
f123d9b
Refactor modelutil.py: Import formatting and load_model function sign…
sphynxlee Apr 3, 2024
3a6f5e5
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Apr 10, 2024
7463738
Refactor modelutil.py: Import Keras layers and models, update load_mo…
sphynxlee Apr 10, 2024
dc64e4e
Refactor streamlitapp.py: Add new video file and comment out old vide…
sphynxlee Apr 10, 2024
bb19d97
Refactor modelutil.py: Import Keras layers and models, update load_mo…
sphynxlee Apr 12, 2024
b2b9df5
Add new video loading functionality and update alignment loading in u…
sphynxlee Apr 15, 2024
3b58183
Update kernelspec display name and language version in LipNet.ipynb
sphynxlee Apr 16, 2024
a1509ef
Refactor .gitignore: Remove unnecessary files and directories
sphynxlee Apr 16, 2024
ac4ede4
Add alignment generator script to extract audio from video and genera…
sphynxlee Apr 16, 2024
111f803
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
2cde8a3
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
e87ba91
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
34d5f24
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
2919510
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
0f5b5c8
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
7399d29
Update ipynb
sphynxlee Jun 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# LipNet
data/
models/
results/
__pycache__/
.vscode/
*.zip
2,389 changes: 2,063 additions & 326 deletions LipNet.ipynb

Large diffs are not rendered by default.

2,520 changes: 2,520 additions & 0 deletions LipNet_my.ipynb

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions aligment_generator/align_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import os
# Input/output locations for the alignment-generation pipeline, resolved
# relative to the current working directory. os.path.join is used instead of
# string concatenation with '/' so the paths are correct on Windows too.
video_path = os.path.join(os.getcwd(), "data_reprocess", "input.mp4")
print('video_path:', video_path)
audio_path = os.path.join(os.getcwd(), "data_reprocess", "input.wav")
print('audio_path:', audio_path)
align_file_path = os.path.join(os.getcwd(), "data_reprocess", "input.txt")
print('align_file_path:', align_file_path)

def extract_audio_from_video(video_path, audio_path):
    """Extract the audio track from ``video_path`` and write it to ``audio_path``.

    The output format is inferred by moviepy from the target extension
    (a .wav file in this pipeline). The clip is closed in a ``finally``
    block so the underlying ffmpeg reader/file handles are not leaked,
    even if writing the audio fails.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(audio_path)
    finally:
        video_clip.close()

def recognize_speech_from_file(file_path):
    """Transcribe an audio file via Google's free speech-recognition API.

    Records the entire file in one pass and returns the recognized text.
    Network access is required; ``speech_recognition`` raises on failure.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data)

def format_align_file(transcription, total_duration=74500):
    """Build GRID-style .align content from a flat transcription string.

    Words are split on whitespace and assumed to be evenly spaced across
    the clip, so each word gets an equal share of ``total_duration``.

    Args:
        transcription: Space-separated recognized words.
        total_duration: Total clip duration in milliseconds. Defaults to
            74500, the previously hard-coded value, so existing callers
            are unaffected.

    Returns:
        One "start end word" line per word, newline-terminated; an empty
        string when the transcription has no words (previously this
        raised ZeroDivisionError).
    """
    words = transcription.split()
    if not words:
        return ""

    # Approximate per-word duration; timing is uniform, not acoustic.
    word_duration = total_duration / len(words)

    lines = []
    start_time = 0
    for word in words:
        end_time = start_time + word_duration
        lines.append(f"{start_time} {end_time} {word}\n")
        start_time = end_time

    # join() instead of repeated string += (avoids quadratic behavior).
    return "".join(lines)

# Pipeline driver: extract the audio track, transcribe it, and write the
# evenly-spaced alignment file next to the other data_reprocess artifacts.
# NOTE(review): this runs at import time (network call + file write);
# presumably intended as a script — consider an `if __name__ == "__main__":`
# guard if this module is ever imported.
extract_audio_from_video(video_path, audio_path)
transcription = recognize_speech_from_file(audio_path)
align_file_content = format_align_file(transcription)

with open(align_file_path, "w") as align_file:
    align_file.write(align_file_content)
Binary file added animation.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified app/animation.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
36 changes: 28 additions & 8 deletions app/modelutil.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten

def load_model() -> Sequential:
import os
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional
from keras.initializers import Orthogonal
import tensorflow as tf

def load_model() -> Sequential:
model = Sequential()

model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
Expand All @@ -19,14 +24,29 @@ def load_model() -> Sequential:

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
# model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
# model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(41, kernel_initializer='he_normal', activation='softmax'))

model.load_weights(os.path.join('..','models','checkpoint'))
# model.load_weights(os.path.join('..','models','checkpoint'))

# Ensure the checkpoint directory is correctly set relative to the script's execution directory
checkpoint_dir = os.path.join('..', 'models')

# Create a checkpoint instance that points to the folder where the checkpoints are saved
checkpoint = tf.train.Checkpoint(model=model)

# Restore the latest checkpoint
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
checkpoint.restore(latest)
else:
raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}")

return model
40 changes: 21 additions & 19 deletions app/streamlitapp.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,50 @@
# Import all of the dependencies
import streamlit as st
import os
import imageio
import os
import imageio
import keras

import tensorflow as tf
import tensorflow as tf
from utils import load_data, num_to_char
from modelutil import load_model

# Set the layout to the streamlit app as wide
# Set the layout to the streamlit app as wide
st.set_page_config(layout='wide')

# Setup the sidebar
with st.sidebar:
st.image('https://www.onepointltd.com/wp-content/uploads/2020/03/inno2.png')
st.title('LipBuddy')
st.info('This application is originally developed from the LipNet deep learning model.')
# with st.sidebar:
# st.image('https://www.onepointltd.com/wp-content/uploads/2020/03/inno2.png')
# st.title('LipBuddy')
# st.info('This application is originally developed from the LipNet deep learning model.')

st.title('LipNet Full Stack App')
# Generating a list of options or videos
st.title('LipNet Full Stack App')
# Generating a list of options or videos
options = os.listdir(os.path.join('..', 'data', 's1'))
selected_video = st.selectbox('Choose video', options)

# Generate two columns
# Generate two columns
col1, col2 = st.columns(2)

if options:
if options:

# Rendering the video
with col1:
# Rendering the video
with col1:
st.info('The video below displays the converted video in mp4 format')
file_path = os.path.join('..','data','s1', selected_video)
os.system(f'ffmpeg -i {file_path} -vcodec libx264 test_video.mp4 -y')

# Rendering inside of the app
video = open('test_video.mp4', 'rb')
video_bytes = video.read()
video = open('test_video.mp4', 'rb')
# video = open('input.mp4', 'rb')
video_bytes = video.read()
st.video(video_bytes)


with col2:
with col2:
st.info('This is all the machine learning model sees when making a prediction')
video, annotations = load_data(tf.convert_to_tensor(file_path))
imageio.mimsave('animation.gif', video, fps=10)
st.image('animation.gif', width=400)
st.image('animation.gif', width=400)

st.info('This is the output of the machine learning model as tokens')
model = load_model()
Expand All @@ -54,4 +56,4 @@
st.info('Decode the raw tokens into words')
converted_prediction = tf.strings.reduce_join(num_to_char(decoder)).numpy().decode('utf-8')
st.text(converted_prediction)

26 changes: 14 additions & 12 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import tensorflow as tf
from typing import List
import numpy as np
import cv2
import os
import os

vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
Expand All @@ -10,39 +11,40 @@
vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

def load_video(path: str) -> List[float]:
    """Read a video, crop each frame to the mouth region, and normalize.

    Frames are converted to grayscale and cropped to rows 190:236 and
    columns 80:220 — presumably the lip region of the GRID corpus videos
    (TODO confirm against the dataset). The stacked frames are then
    standardized to zero mean / unit variance.

    NOTE(review): despite the annotation, this returns a float32 tensor
    of shape (num_frames, 46, 140, 1), not a List[float].
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # CAP_PROP_FRAME_COUNT can overshoot the decodable stream;
                # previously a failed read passed frame=None downstream.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises partway through.
        cap.release()

    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
def load_alignments(path: str) -> List[str]:
    """Parse a GRID .align file into encoded character ids.

    Each line has the form "<start> <end> <word>". Silence markers
    ('sil') are skipped; remaining words are joined by single spaces,
    split into unicode characters, and mapped through ``char_to_num``.
    The trailing ``[1:]`` drops the leading space token.

    NOTE(review): despite the annotation, this returns a tensor of
    integer ids, not a List[str].
    """
    tokens = []
    with open(path, 'r') as f:
        for raw_line in f:
            parts = raw_line.split()
            if parts[2] != 'sil':
                # extend() appends in place; the old `[*tokens, ...]`
                # rebuilt the whole list every iteration (O(n^2)).
                tokens.extend([' ', parts[2]])
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

def load_data(path: str):
    """Load the (frames, alignments) pair for one GRID sample.

    Args:
        path: A TensorFlow string tensor holding a video path; only the
            file stem is used to locate ../data/s1/<stem>.mpg and
            ../data/alignments/s1/<stem>.align.

    Returns:
        Tuple of (normalized video frames tensor, encoded alignment ids).
    """
    path = bytes.decode(path.numpy())
    # Take the basename regardless of separator style. The previous code
    # unconditionally re-split on '\\' after the '/' split, which on
    # Linux/macOS clobbered the correct stem with the full path.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('..', 'data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('..', 'data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    print(frames.shape)
    alignments = load_alignments(alignment_path)

    return frames, alignments