SightGuardAI/app.py at master · smaranjitghose/SightGuardAI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import tempfile
import cv2
from io import BytesIO
from pathlib import Path

import streamlit as st
import moondream as md
from PIL import Image, ImageDraw
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load environment variables via load_dotenv() (imported from the `dotenv`
# package above; presumably it reads a local .env file — confirm).
# NOTE(review): nothing visible in this file reads os.environ afterwards;
# main() only writes MOONDREAM_API_KEY and passes the key along directly.
load_dotenv()

# Cache the Moondream API client
@st.cache_resource
def load_moondream_api(api_key):
    """Create the Moondream client for ``api_key``.

    The function is wrapped in ``st.cache_resource``. Any exception raised
    by ``md.vl`` is reported with ``st.error`` and turned into a ``None``
    return value, so callers must check for ``None``.
    """
    try:
        client = md.vl(api_key=api_key)
    except Exception as exc:
        reason = str(exc)
        st.error(f"Failed to initialize Moondream API: {reason}")
        return None
    return client

# Function to extract frames from video and their timestamps
def extract_frames_with_timestamps(video_path, interval=1):
    """Sample frames from a video roughly every ``interval`` seconds.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        interval: Desired spacing between sampled frames, in seconds.

    Returns:
        A ``(frames, timestamps)`` pair of equal-length lists: ``frames``
        holds RGB ``PIL.Image`` objects and ``timestamps`` the matching
        ``CAP_PROP_POS_MSEC`` readings converted to seconds. Both lists are
        empty when no frame can be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    timestamps = []

    try:
        frame_rate = cap.get(cv2.CAP_PROP_FPS)

        # CAP_PROP_FPS is a float (e.g. 29.97) and may be 0, NaN or inf when
        # the backend cannot determine it. Taking ``count % float`` would
        # then match almost no frame (or divide by zero), so convert the
        # sampling period into a whole number of frames, never below 1.
        # With an unknown frame rate this falls back to keeping every frame.
        try:
            step = max(1, round(interval * frame_rate))
        except (ValueError, OverflowError):
            step = 1

        count = 0
        success, image = cap.read()
        while success:
            if count % step == 0:
                timestamp_sec = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                # OpenCV delivers BGR; PIL expects RGB.
                img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                frames.append(img)
                timestamps.append(timestamp_sec)

            success, image = cap.read()
            count += 1
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()

    print(f"Total frames captured: {len(frames)}")
    return frames, timestamps

# Function to calculate cosine similarity between two descriptions
def calculate_similarity(prev_description, current_description):
    """Return the TF-IDF cosine similarity of two description strings.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts; the
    result is the single entry of the 1x1 ``cosine_similarity`` matrix
    computed between the first and the second row.
    """
    corpus = [prev_description, current_description]
    weights = TfidfVectorizer().fit_transform(corpus)
    first_row = weights[0:1]
    second_row = weights[1:2]
    pairwise = cosine_similarity(first_row, second_row)
    return pairwise[0][0]

def _describe_frames(model, frames, timestamps, similarity_threshold=0.8):
    """Caption each frame and flag the ones whose caption changed noticeably.

    A frame is a key frame when the similarity between its caption and the
    previous successfully captioned frame's caption is below
    ``similarity_threshold``. Frames that fail are reported with
    ``st.error`` and skipped, so every returned row is complete.

    Returns:
        ``(rows, key_frames)`` where ``rows`` is a list of dicts keyed by
        the table column names and ``key_frames`` is a list of
        ``(timestamp, frame)`` tuples.
    """
    rows = []
    key_frames = []
    prev_description = ""

    for i, (timestamp, frame) in enumerate(zip(timestamps, frames)):
        # Broad catch on purpose: one failing frame (API call or similarity
        # computation) must not abort the analysis of the remaining frames.
        try:
            encoded_image = model.encode_image(frame)
            description = model.caption(encoded_image)["caption"]
            is_key_frame = bool(prev_description) and (
                calculate_similarity(prev_description, description)
                < similarity_threshold
            )
        except Exception as e:
            st.error(f"Error processing frame {i + 1}: {str(e)}")
            continue

        if is_key_frame:
            key_frames.append((timestamp, frame))

        # Keep the original frame number so gaps left by failed frames stay
        # visible and match the numbers used in the error messages.
        rows.append({
            "Frame": f"Frame {i + 1}",
            "Timestamp (s)": f"{timestamp:.2f}",
            "Description": description,
            "Key Frame": "Yes" if is_key_frame else "No",
        })
        prev_description = description

    return rows, key_frames


def _show_key_frames(key_frames, num_columns=3):
    """Render the key frames as a grid with ``num_columns`` images per row."""
    st.header("Key Frames")
    for row_start in range(0, len(key_frames), num_columns):
        cols = st.columns(num_columns)
        row_items = key_frames[row_start:row_start + num_columns]
        for offset, (timestamp, frame) in enumerate(row_items):
            index = row_start + offset
            cols[offset].image(frame, caption=f"Key Frame {index + 1} at {timestamp:.2f}s")


def main():
    """Run the SightGuardAI page.

    Collects the API key and a video in the sidebar, and on "Analyze"
    samples frames, captions them with the Moondream client, and shows a
    table of descriptions plus a grid of key frames.
    """
    st.set_page_config(
        page_title="SightGuardAI: Automatic Surveillance Tagging",
        page_icon="👁️",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.title("👁️ SightGuardAI")
    st.write("Upload a surveillance video to extract frames, generate descriptions, and identify key frames.")

    # Add API key input in sidebar
    with st.sidebar:
        api_key = st.text_input("Enter your Moondream API Key", type="password")
        if api_key:
            os.environ["MOONDREAM_API_KEY"] = api_key

        uploaded_file = st.file_uploader(
            "Upload a surveillance video",
            type=["mp4", "avi", "mov"]
        )

    # Nothing can be analysed without a working client.
    if not api_key:
        st.warning("Please enter your Moondream API Key in the sidebar.")
        return

    model = load_moondream_api(api_key)
    if model is None:
        st.error("Failed to initialize Moondream API. Please check your API key.")
        return

    if not uploaded_file:
        return

    # Display the uploaded video
    st.video(uploaded_file)

    if not st.button("Analyze"):
        return

    with st.spinner("Analyzing video..."):
        # The temporary copy is only created when an analysis is actually
        # requested, not on every rerun. It keeps the upload's extension
        # rather than always pretending to be .mp4.
        suffix = Path(uploaded_file.name).suffix or ".mp4"
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        video_path = temp_file.name
        try:
            # Close the handle before the path is reopened for reading.
            with temp_file:
                # getvalue() does not depend on the current read position.
                temp_file.write(uploaded_file.getvalue())

            # Extract 1 frame per second
            frames, timestamps = extract_frames_with_timestamps(video_path, interval=1)
        finally:
            # Frames are held in memory, so the copy can go right away,
            # including when extraction failed.
            try:
                os.unlink(video_path)
            except OSError as e:
                st.error(f"Could not delete temporary file: {e}")

        if not frames:
            st.warning("No frames could be read from the uploaded video.")
            return

        rows, key_frames = _describe_frames(
            model, frames, timestamps, similarity_threshold=0.8
        )

        # One row per successfully described frame, so all columns always
        # have the same length even when some frames failed.
        df = pd.DataFrame(
            rows,
            columns=["Frame", "Timestamp (s)", "Description", "Key Frame"],
        )

        # Display the table
        st.header("Frame Descriptions and Key Frames")
        st.dataframe(df, use_container_width=True)

        # Display key frames in a grid layout
        if key_frames:
            _show_key_frames(key_frames)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()