Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions idtap/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,28 @@ def download_spectrogram_metadata(self, audio_id: str) -> Dict[str, Any]:
endpoint = f"spec_data/{audio_id}/spec_shape.json"
return self._get(endpoint)

def get_audio_recording(self, audio_id: str) -> Dict[str, Any]:
    """Fetch the metadata record for a single audio recording.

    Sends the recording ID as the ``_id`` query parameter to the
    ``getAudioRecording`` endpoint via ``self._get`` and returns that
    call's result unchanged.

    Args:
        audio_id: The audio recording ID

    Returns:
        Dictionary with recording metadata including:
        - duration: Audio duration in seconds (float)
        - musicians: Dictionary of performer information
        - raags: Dictionary of raga information
        - title: Recording title
        - etc.

    Raises:
        requests.HTTPError: If recording not found (404)
    """
    # The server looks the recording up by its "_id" field.
    query = {"_id": audio_id}
    return self._get("getAudioRecording", params=query)

def save_transcription(self, piece: Piece, fill_duration: bool = True) -> Any:
"""Save a transcription piece to the server.

Expand Down
35 changes: 27 additions & 8 deletions idtap/spectrogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,21 @@ class SpectrogramData:
# Constants matching web app implementation
DEFAULT_FREQ_RANGE = (75.0, 2400.0) # Hz
DEFAULT_BINS_PER_OCTAVE = 72
DEFAULT_TIME_RESOLUTION = 0.015080 # seconds per frame (fallback when DB unavailable)

def __init__(self, data: np.ndarray, audio_id: str,
freq_range: Tuple[float, float] = DEFAULT_FREQ_RANGE,
bins_per_octave: int = DEFAULT_BINS_PER_OCTAVE):
bins_per_octave: int = DEFAULT_BINS_PER_OCTAVE,
time_resolution: Optional[float] = None):
"""Initialize SpectrogramData with raw data.

Args:
data: Raw uint8 spectrogram array [freq_bins, time_frames]
audio_id: Audio recording ID
freq_range: Frequency range (min_hz, max_hz)
bins_per_octave: Number of frequency bins per octave
time_resolution: Time resolution in seconds per frame (optional)
If None, uses DEFAULT_TIME_RESOLUTION fallback
"""
if not isinstance(data, np.ndarray):
raise TypeError(f"data must be numpy array, got {type(data)}")
Expand All @@ -72,19 +76,21 @@ def __init__(self, data: np.ndarray, audio_id: str,
self.audio_id = audio_id
self.freq_range = freq_range
self.bins_per_octave = bins_per_octave
self._time_resolution = time_resolution if time_resolution is not None else self.DEFAULT_TIME_RESOLUTION

@classmethod
def from_audio_id(cls, audio_id: str, client: Optional['SwaraClient'] = None) -> 'SpectrogramData':
"""Download and load spectrogram data from audio ID.

Fetches compressed spectrogram data from https://swara.studio/spec_data/{audio_id}/
and calculates accurate time_resolution from the audio recording duration in the database.

Args:
audio_id: IDTAP audio recording ID
client: Optional SwaraClient instance (creates one if not provided)

Returns:
SpectrogramData instance
SpectrogramData instance with accurate time_resolution

Raises:
requests.HTTPError: If spectrogram data doesn't exist or download fails
Expand All @@ -105,7 +111,19 @@ def from_audio_id(cls, audio_id: str, client: Optional['SwaraClient'] = None) ->
shape = tuple(metadata['shape']) # [freq_bins, time_frames]
data = np.frombuffer(decompressed, dtype=np.uint8).reshape(shape)

return cls(data, audio_id)
# Get exact audio duration from recording database
time_resolution = None
try:
recording = client.get_audio_recording(audio_id)
audio_duration = recording['duration']
time_frames = shape[1]
time_resolution = audio_duration / time_frames
except Exception:
# Best-effort lookup: on any failure (recording not found, missing
# 'duration', zero time frames) leave time_resolution as None so that
# __init__ applies the DEFAULT_TIME_RESOLUTION fallback.
pass

return cls(data, audio_id, time_resolution=time_resolution)

@classmethod
def from_piece(cls, piece: 'Piece', client: Optional['SwaraClient'] = None) -> Optional['SpectrogramData']:
Expand Down Expand Up @@ -501,12 +519,13 @@ def duration(self) -> float:
def time_resolution(self) -> float:
    """Time resolution in seconds per frame.

    Returns the value stored on the instance as ``_time_resolution``.
    Calculated from audio recording duration in database (when available).
    Falls back to DEFAULT_TIME_RESOLUTION if recording metadata unavailable.

    Note: Spectrograms always cover the full audio recording, even when
    the associated Piece transcribes only an excerpt.
    """
    # The former hard-coded estimate (0.0116 s/frame) is intentionally gone:
    # it shadowed the per-recording value and made this return unreachable.
    return self._time_resolution

@property
def freq_bins(self) -> np.ndarray:
Expand Down
Loading