From 357af0005a33bb1f435f32837e477c79a1d96ad2 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Wed, 7 Jan 2026 17:07:19 +0100 Subject: [PATCH 1/7] add public api Dataset.build_from_files method --- src/osekit/public_api/dataset.py | 42 +++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/osekit/public_api/dataset.py b/src/osekit/public_api/dataset.py index e284350d..ec992650 100644 --- a/src/osekit/public_api/dataset.py +++ b/src/osekit/public_api/dataset.py @@ -10,11 +10,10 @@ import logging import shutil +from collections.abc import Iterable from pathlib import Path from typing import TYPE_CHECKING, TypeVar -from pandas import Timestamp - from osekit import config from osekit.config import DPDEFAULT, resample_quality_settings from osekit.core_api import audio_file_manager as afm @@ -32,6 +31,10 @@ from osekit.utils.path_utils import move_tree if TYPE_CHECKING: + from os import PathLike + + from pandas import Timestamp + from osekit.core_api.audio_file import AudioFile from osekit.utils.job import JobBuilder @@ -124,7 +127,9 @@ def analyses(self) -> list[str]: """Return the list of the names of the analyses ran with this Dataset.""" return list({dataset["analysis"] for dataset in self.datasets.values()}) - def build(self) -> None: + def build( + self, + ) -> None: """Build the Dataset. Building a dataset moves the original audio files to a specific folder @@ -170,6 +175,37 @@ def build(self) -> None: self.logger.info("Build done!") + def build_from_files( + self, + files: Iterable[PathLike | str], + *, + move_files: bool = False, + ) -> None: + """Build the dataset from the specified files. + + The files will be copied (or moved) to the dataset.folder folder. + + Parameters + ---------- + files: Iterable[PathLike|str] + Files that are included in the dataset. + move_files: bool + If set to True, the files will be moved (rather than copied) in the dataset + folder. 
+ + """ + if not self.folder.exists(): + self.folder.mkdir(mode=DPDEFAULT) + + for file in map(Path, files): + destination = self.folder / file.name + if move_files: + file.replace(destination) + else: + shutil.copyfile(file, destination) + + self.build() + def _create_logger(self) -> None: if not logging.getLogger("dataset").handlers: message = ( From 1e250cd1ffd103e4e17cdb37e02089c169c73994 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Wed, 7 Jan 2026 17:13:54 +0100 Subject: [PATCH 2/7] setup dataset logger before file manipulation --- src/osekit/public_api/dataset.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/osekit/public_api/dataset.py b/src/osekit/public_api/dataset.py index ec992650..f9ce0d95 100644 --- a/src/osekit/public_api/dataset.py +++ b/src/osekit/public_api/dataset.py @@ -10,7 +10,6 @@ import logging import shutil -from collections.abc import Iterable from pathlib import Path from typing import TYPE_CHECKING, TypeVar @@ -31,6 +30,7 @@ from osekit.utils.path_utils import move_tree if TYPE_CHECKING: + from collections.abc import Iterable from os import PathLike from pandas import Timestamp @@ -107,6 +107,7 @@ def __init__( # noqa: PLR0913 self.job_builder = job_builder self.instrument = instrument self.first_file_begin = first_file_begin + self.logger = None @property def origin_files(self) -> list[AudioFile] | None: @@ -194,6 +195,11 @@ def build_from_files( folder. """ + self._create_logger() + + msg = f"{'Moving' if move_files else 'Copying'} files to the dataset folder." + self.logger.info(msg) + if not self.folder.exists(): self.folder.mkdir(mode=DPDEFAULT) @@ -207,6 +213,8 @@ def build_from_files( self.build() def _create_logger(self) -> None: + if self.logger: + return if not logging.getLogger("dataset").handlers: message = ( "Logging has not been configured. 
" @@ -219,7 +227,7 @@ def _create_logger(self) -> None: logs_directory = self.folder / "log" if not logs_directory.exists(): - logs_directory.mkdir(mode=DPDEFAULT) + logs_directory.mkdir(mode=DPDEFAULT, parents=True) self.logger = logging.getLogger("dataset").getChild(self.folder.name) file_handler = logging.FileHandler(logs_directory / "logs.log", mode="w") file_handler.setFormatter( From 30e5839076fd1ee2a0c901c724b0902d5c2d983f Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Wed, 7 Jan 2026 17:39:37 +0100 Subject: [PATCH 3/7] remove dataset logger on Dataset.reset() call --- src/osekit/public_api/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osekit/public_api/dataset.py b/src/osekit/public_api/dataset.py index f9ce0d95..790ac2aa 100644 --- a/src/osekit/public_api/dataset.py +++ b/src/osekit/public_api/dataset.py @@ -261,6 +261,7 @@ def reset(self) -> None: file.unlink() self.datasets = {} + self.logger = None def get_analysis_audiodataset(self, analysis: Analysis) -> AudioDataset: """Return an AudioDataset created from the analysis parameters. 
From 4794de089b120a9281d2f3bfd60582ff622d598c Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 12 Jan 2026 14:40:56 +0100 Subject: [PATCH 4/7] add test_build_specific_files --- tests/test_public_api.py | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 1add226e..5a8bffa9 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -1,6 +1,7 @@ from __future__ import annotations from copy import deepcopy +from pathlib import Path import numpy as np import pytest @@ -1411,3 +1412,59 @@ def test_spectro_analysis_with_existing_ads( assert ad.begin == sd.begin assert ad.end == sd.end assert sd.audio_data == ad + + +def test_build_specific_files(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + p1, p2, p3, p4 = ( + Path(r"end.mp3"), + Path(r"glory.mp3"), + Path(r"slow.mp3"), + Path(r"story.mp3"), + ) + + base_folder = [p1, p2, p3, p4] + dest_folder = [] + + dataset = Dataset(folder=tmp_path, strptime_format="%y%m%d%H%M%S") + + def mock_copyfile(file: Path, destination: Path) -> None: + assert destination.parent == dataset.folder + dest_folder.append(file) + + def mock_replace(self: Path, destination: Path) -> None: + assert destination.parent == dataset.folder + base_folder.remove(self) + dest_folder.append(self) + + built_files = [] + + def build_mock(*args: list, **kwargs: dict) -> None: + for file in dest_folder: + built_files.append(file) + + monkeypatch.setattr("shutil.copyfile", mock_copyfile) + monkeypatch.setattr(Path, "replace", mock_replace) + monkeypatch.setattr(Dataset, "build", build_mock) + + # Build from files COPY MODE + dataset.build_from_files( + (p1, p2), + ) + + assert np.array_equal(base_folder, [p1, p2, p3, p4]) + assert np.array_equal(dest_folder, [p1, p2]) + assert np.array_equal(built_files, [p1, p2]) + + # Build from files MOVE MODE + + dest_folder = [] + built_files = [] + + dataset.build_from_files( + (p1, p2), + 
move_files=True, + ) + + assert np.array_equal(base_folder, [p3, p4]) + assert np.array_equal(dest_folder, [p1, p2]) + assert np.array_equal(built_files, [p1, p2]) From 63547b632e3385d18adb19a47e20a2bbcfb525d7 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 12 Jan 2026 18:10:38 +0100 Subject: [PATCH 5/7] check dataset.folder.mkdir() call on build_from_files() --- tests/test_public_api.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 5a8bffa9..b795614a 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -1425,7 +1425,10 @@ def test_build_specific_files(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - base_folder = [p1, p2, p3, p4] dest_folder = [] - dataset = Dataset(folder=tmp_path, strptime_format="%y%m%d%H%M%S") + dataset = Dataset( + folder=tmp_path / "non_existing_folder", + strptime_format="%y%m%d%H%M%S", + ) def mock_copyfile(file: Path, destination: Path) -> None: assert destination.parent == dataset.folder @@ -1446,11 +1449,22 @@ def build_mock(*args: list, **kwargs: dict) -> None: monkeypatch.setattr(Path, "replace", mock_replace) monkeypatch.setattr(Dataset, "build", build_mock) + mkdir_calls = [] + + def mkdir_mock(self: Path, *args: list, **kwargs: dict) -> None: + mkdir_calls.append(self) + + monkeypatch.setattr(Path, "mkdir", mkdir_mock) + + assert dataset.folder not in mkdir_calls + # Build from files COPY MODE dataset.build_from_files( (p1, p2), ) + assert dataset.folder in mkdir_calls + assert np.array_equal(base_folder, [p1, p2, p3, p4]) assert np.array_equal(dest_folder, [p1, p2]) assert np.array_equal(built_files, [p1, p2]) From 1a1ed12694109cfdda41e6f1919629401b415a66 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Tue, 13 Jan 2026 14:41:57 +0100 Subject: [PATCH 6/7] add Dataset.build_from_files in the docs --- docs/source/publicapi_usage.rst | 47 +++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 
deletions(-) diff --git a/docs/source/publicapi_usage.rst b/docs/source/publicapi_usage.rst index 39b96efc..fe5b2dc3 100644 --- a/docs/source/publicapi_usage.rst +++ b/docs/source/publicapi_usage.rst @@ -1,19 +1,19 @@ +.. _publicapi_usage: + Public API ---------- -.. _publicapi_usage: - This API provides tools for working on large sets of audio data. Basically, the whole point of **OSEkit**'s Public API is to export large amounts of spectrograms and/or reshaped audio files with no consideration of the original format of the audio files. The :class:`osekit.public_api.dataset.Dataset` class is the cornerstone of **OSEkit**'s Public API. +.. _build: + Building a ``Dataset`` ^^^^^^^^^^^^^^^^^^^^^^ -.. _build: - At first, A ``Dataset`` is built from a raw folder containing the audio files to be processed. For example, this folder containing 4 audio files plus some extra files: @@ -88,6 +88,38 @@ In this ``AudioDataset``, one :class:`osekit.core_api.audio_data.AudioData` has Additionally, both this Core API ``Audiodataset`` and the Public API ``Dataset`` have been serialized into the ``original.json`` and ``dataset.json`` files, respectively. +Building a dataset from specific files +"""""""""""""""""""""""""""""""""""""" + +It is possible to build the dataset from a specific collection of files. + +This is done thanks to the :meth:`osekit.public_api.dataset.Dataset.build_from_files` method. +The collection of files passed as the ``files`` parameter will be either moved or copied (depending on +the ``move_files`` parameter) into ``Dataset.folder`` before the build is done. If ``Dataset.folder`` does +not exist, it will be created before the files are moved or copied: + +.. 
code-block:: python + + from pathlib import Path + from osekit.public_api.dataset import Dataset + + # Pick the files you want to include in the dataset + files = ( + r"cool\stuff\2007-11-05_00-01-00.wav", + r"cool\things\2007-11-05_00-03-00.wav", + r"cool\things\2007-11-05_00-05-00.wav" + ) + + # Set the DESTINATION folder of the dataset in the folder parameter + dataset = Dataset( + folder = Path(r"cool\odd_hours"), + strptime_format="%Y-%m-%d_%H-%M-%S", + ) + + dataset.build_from_files( + files=files, + move_files=False, # Copy files rather than moving them + ) Running an ``Analysis`` ^^^^^^^^^^^^^^^^^^^^^^^ @@ -139,12 +171,11 @@ The remaining parameters of the analysis (begin and end **Timestamps**, duration If the ``Analysis`` contains spectral computations (either ``AnalysisType.MATRIX``, ``AnalysisType.SPECTROGRAM`` or ``AnalysisType.WELCH`` is in ``analysis_type``), a `scipy ShortTimeFFT instance `_ should be passed to the ``Analysis`` initializer. +.. _editing_analysis: Checking/Editing the analysis """"""""""""""""""""""""""""" -.. _editing_analysis: - If you want to take a peek at what the analysis output will be before actually running it, the :meth:`osekit.public_api.dataset.Dataset.get_analysis_audiodataset` and :meth:`osekit.public_api.dataset.Dataset.get_analysis_spectrodataset` methods return a :class:`osekit.core_api.audio_dataset.AudioDataset` and a :class:`osekit.core_api.spectro_dataset.SpectroDataset` instance, respectively. @@ -218,11 +249,11 @@ The corresponding ``Analysis`` is the following: dataset.run_analysis(analysis=analysis) # And that's it! +.. _output_1: + Output 1 """""""" -.. _output_1: - Once the analysis is run, a :class:`osekit.core_api.audio_dataset.AudioDataset` instance named ``cool_reshape`` has been created and added to the dataset's :attr:`osekit.public_api.dataset.Dataset.datasets` field. 
The dataset folder now looks like this: From 359810a4a6fe504d095842b40deea9decd41d78b Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Wed, 14 Jan 2026 12:23:34 +0100 Subject: [PATCH 7/7] ensure afm file closing before moving file in build --- src/osekit/public_api/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osekit/public_api/dataset.py b/src/osekit/public_api/dataset.py index 790ac2aa..19b5629a 100644 --- a/src/osekit/public_api/dataset.py +++ b/src/osekit/public_api/dataset.py @@ -159,6 +159,7 @@ def build( } self.logger.info("Organizing dataset folder...") + afm.close() move_tree( source=self.folder, destination=self.folder / "other",