Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select a commit. Hold Shift + click to select a range.
91392f3
Created using Colaboratory
sphynxlee Mar 27, 2024
f123d9b
Refactor modelutil.py: Import formatting and load_model function sign…
sphynxlee Apr 3, 2024
3a6f5e5
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Apr 10, 2024
7463738
Refactor modelutil.py: Import Keras layers and models, update load_mo…
sphynxlee Apr 10, 2024
dc64e4e
Refactor streamlitapp.py: Add new video file and comment out old vide…
sphynxlee Apr 10, 2024
bb19d97
Refactor modelutil.py: Import Keras layers and models, update load_mo…
sphynxlee Apr 12, 2024
b2b9df5
Add new video loading functionality and update alignment loading in u…
sphynxlee Apr 15, 2024
3b58183
Update kernelspec display name and language version in LipNet.ipynb
sphynxlee Apr 16, 2024
a1509ef
Refactor .gitignore: Remove unnecessary files and directories
sphynxlee Apr 16, 2024
ac4ede4
Add alignment generator script to extract audio from video and genera…
sphynxlee Apr 16, 2024
111f803
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
2cde8a3
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
e87ba91
Refactor modelutil.py: Import Orthogonal initializer and update LSTM …
sphynxlee Jun 21, 2024
34d5f24
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
2919510
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
0f5b5c8
Refactor .gitignore: Update ignore rules for __pycache__ and .vscode/
sphynxlee Jun 21, 2024
7399d29
Update ipynb
sphynxlee Jun 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# LipNet
data/
models/
results/
__pycache__/
.vscode/
*.zip
2,389 changes: 2,063 additions & 326 deletions LipNet.ipynb

Large diffs are not rendered by default.

2,520 changes: 2,520 additions & 0 deletions LipNet_my.ipynb

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions aligment_generator/align_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import os
# Input/output locations for the alignment-generation pipeline, resolved
# relative to the current working directory. os.path.join is used instead of
# string concatenation with '/' so the paths are correct on Windows too.
video_path = os.path.join(os.getcwd(), "data_reprocess", "input.mp4")
print('video_path:', video_path)
audio_path = os.path.join(os.getcwd(), "data_reprocess", "input.wav")
print('audio_path:', audio_path)
align_file_path = os.path.join(os.getcwd(), "data_reprocess", "input.txt")
print('align_file_path:', align_file_path)

def extract_audio_from_video(video_path, audio_path):
    """Extract the audio track from ``video_path`` and write it to ``audio_path``.

    The output format is inferred by moviepy from the target extension
    (a .wav file in this pipeline). The clip is closed in a ``finally``
    block so the underlying ffmpeg reader/file handles are not leaked,
    even if writing the audio fails.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(audio_path)
    finally:
        video_clip.close()

def recognize_speech_from_file(file_path):
    """Transcribe an audio file via Google's free speech-recognition API.

    Records the entire file in one pass and returns the recognized text.
    Network access is required; ``speech_recognition`` raises on failure.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data)

def format_align_file(transcription, total_duration=74500):
    """Build GRID-style .align content from a flat transcription string.

    Words are split on whitespace and assumed to be evenly spaced across
    the clip, so each word gets an equal share of ``total_duration``.

    Args:
        transcription: Space-separated recognized words.
        total_duration: Total clip duration in milliseconds. Defaults to
            74500, the previously hard-coded value, so existing callers
            are unaffected.

    Returns:
        One "start end word" line per word, newline-terminated; an empty
        string when the transcription has no words (previously this
        raised ZeroDivisionError).
    """
    words = transcription.split()
    if not words:
        return ""

    # Approximate per-word duration; timing is uniform, not acoustic.
    word_duration = total_duration / len(words)

    lines = []
    start_time = 0
    for word in words:
        end_time = start_time + word_duration
        lines.append(f"{start_time} {end_time} {word}\n")
        start_time = end_time

    # join() instead of repeated string += (avoids quadratic behavior).
    return "".join(lines)

# Pipeline driver: extract the audio track, transcribe it, and write the
# evenly-spaced alignment file next to the other data_reprocess artifacts.
# NOTE(review): this runs at import time (network call + file write);
# presumably intended as a script — consider an `if __name__ == "__main__":`
# guard if this module is ever imported.
extract_audio_from_video(video_path, audio_path)
transcription = recognize_speech_from_file(audio_path)
align_file_content = format_align_file(transcription)

with open(align_file_path, "w") as align_file:
    align_file.write(align_file_content)
Binary file added animation.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified app/animation.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
36 changes: 28 additions & 8 deletions app/modelutil.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten

def load_model() -> Sequential:
import os
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional
from keras.initializers import Orthogonal
import tensorflow as tf

def load_model() -> Sequential:
model = Sequential()

model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
Expand All @@ -19,14 +24,29 @@ def load_model() -> Sequential:

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
# model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
# model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(41, kernel_initializer='he_normal', activation='softmax'))

model.load_weights(os.path.join('..','models','checkpoint'))
# model.load_weights(os.path.join('..','models','checkpoint'))

# Ensure the checkpoint directory is correctly set relative to the script's execution directory
checkpoint_dir = os.path.join('..', 'models')

# Create a checkpoint instance that points to the folder where the checkpoints are saved
checkpoint = tf.train.Checkpoint(model=model)

# Restore the latest checkpoint
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
checkpoint.restore(latest)
else:
raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}")

return model
40 changes: 21 additions & 19 deletions app/streamlitapp.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,50 @@
# Import all of the dependencies
import streamlit as st
import os
import imageio
import os
import imageio
import keras

import tensorflow as tf
import tensorflow as tf
from utils import load_data, num_to_char
from modelutil import load_model

# Set the layout to the streamlit app as wide
# Set the layout to the streamlit app as wide
st.set_page_config(layout='wide')

# Setup the sidebar
with st.sidebar:
st.image('https://www.onepointltd.com/wp-content/uploads/2020/03/inno2.png')
st.title('LipBuddy')
st.info('This application is originally developed from the LipNet deep learning model.')
# with st.sidebar:
# st.image('https://www.onepointltd.com/wp-content/uploads/2020/03/inno2.png')
# st.title('LipBuddy')
# st.info('This application is originally developed from the LipNet deep learning model.')

st.title('LipNet Full Stack App')
# Generating a list of options or videos
st.title('LipNet Full Stack App')
# Generating a list of options or videos
options = os.listdir(os.path.join('..', 'data', 's1'))
selected_video = st.selectbox('Choose video', options)

# Generate two columns
# Generate two columns
col1, col2 = st.columns(2)

if options:
if options:

# Rendering the video
with col1:
# Rendering the video
with col1:
st.info('The video below displays the converted video in mp4 format')
file_path = os.path.join('..','data','s1', selected_video)
os.system(f'ffmpeg -i {file_path} -vcodec libx264 test_video.mp4 -y')

# Rendering inside of the app
video = open('test_video.mp4', 'rb')
video_bytes = video.read()
video = open('test_video.mp4', 'rb')
# video = open('input.mp4', 'rb')
video_bytes = video.read()
st.video(video_bytes)


with col2:
with col2:
st.info('This is all the machine learning model sees when making a prediction')
video, annotations = load_data(tf.convert_to_tensor(file_path))
imageio.mimsave('animation.gif', video, fps=10)
st.image('animation.gif', width=400)
st.image('animation.gif', width=400)

st.info('This is the output of the machine learning model as tokens')
model = load_model()
Expand All @@ -54,4 +56,4 @@
st.info('Decode the raw tokens into words')
converted_prediction = tf.strings.reduce_join(num_to_char(decoder)).numpy().decode('utf-8')
st.text(converted_prediction)

26 changes: 14 additions & 12 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import tensorflow as tf
from typing import List
import numpy as np
import cv2
import os
import os

vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
Expand All @@ -10,39 +11,40 @@
vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

def load_video(path: str) -> List[float]:
    """Read a video, crop each frame to the mouth region, and normalize.

    Frames are converted to grayscale and cropped to rows 190:236 and
    columns 80:220 — presumably the lip region of the GRID corpus videos
    (TODO confirm against the dataset). The stacked frames are then
    standardized to zero mean / unit variance.

    NOTE(review): despite the annotation, this returns a float32 tensor
    of shape (num_frames, 46, 140, 1), not a List[float].
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # CAP_PROP_FRAME_COUNT can overshoot the decodable stream;
                # previously a failed read passed frame=None downstream.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Release the capture even if decoding raises partway through.
        cap.release()

    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
def load_alignments(path: str) -> List[str]:
    """Parse a GRID .align file into encoded character ids.

    Each line has the form "<start> <end> <word>". Silence markers
    ('sil') are skipped; remaining words are joined by single spaces,
    split into unicode characters, and mapped through ``char_to_num``.
    The trailing ``[1:]`` drops the leading space token.

    NOTE(review): despite the annotation, this returns a tensor of
    integer ids, not a List[str].
    """
    tokens = []
    with open(path, 'r') as f:
        for raw_line in f:
            parts = raw_line.split()
            if parts[2] != 'sil':
                # extend() appends in place; the old `[*tokens, ...]`
                # rebuilt the whole list every iteration (O(n^2)).
                tokens.extend([' ', parts[2]])
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

def load_data(path: str):
    """Load the (frames, alignments) pair for one GRID sample.

    Args:
        path: A TensorFlow string tensor holding a video path; only the
            file stem is used to locate ../data/s1/<stem>.mpg and
            ../data/alignments/s1/<stem>.align.

    Returns:
        Tuple of (normalized video frames tensor, encoded alignment ids).
    """
    path = bytes.decode(path.numpy())
    # Take the basename regardless of separator style. The previous code
    # unconditionally re-split on '\\' after the '/' split, which on
    # Linux/macOS clobbered the correct stem with the full path.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('..', 'data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('..', 'data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    print(frames.shape)
    alignments = load_alignments(alignment_path)

    return frames, alignments