diff --git a/docs/source/publicapi_usage.rst b/docs/source/publicapi_usage.rst
index 39b96efc..fe5b2dc3 100644
--- a/docs/source/publicapi_usage.rst
+++ b/docs/source/publicapi_usage.rst
@@ -1,19 +1,19 @@
+.. _publicapi_usage:
+
 Public API
 ----------
 
-.. _publicapi_usage:
-
 This API provides tools for working on large sets of audio data.
 Basically, the whole point of **OSEkit**'s Public API is to export large amounts of spectrograms
 and/or reshaped audio files with no consideration of the original format of the audio files.
 
 The :class:`osekit.public_api.dataset.Dataset` class is the cornerstone of **OSEkit**'s Public API.
 
+.. _build:
+
 Building a ``Dataset``
 ^^^^^^^^^^^^^^^^^^^^^^
 
-.. _build:
-
 At first, A ``Dataset`` is built from a raw folder containing the audio files to be processed.
 
 For example, this folder containing 4 audio files plus some extra files:
@@ -88,6 +88,38 @@ In this ``AudioDataset``, one :class:`osekit.core_api.audio_data.AudioData` has
 Additionally, both this Core API ``Audiodataset`` and the Public API ``Dataset`` have been
 serialized into the ``original.json`` and ``dataset.json`` files, respectively.
 
+Building a dataset from specific files
+""""""""""""""""""""""""""""""""""""""
+
+It is possible to pass a specific collection of files to build within the dataset.
+
+This is done thanks to the :meth:`osekit.public_api.dataset.Dataset.build_from_files` method.
+The collection of files passed as the ``files`` parameter will be either moved or copied (depending on
+the ``move_files`` parameter) within ``Dataset.folder`` before the build is done. If ``Dataset.folder`` does
+not exist, it will be created before the files are moved:
+
+.. code-block:: python
+
+    from pathlib import Path
+    from osekit.public_api.dataset import Dataset
+
+    # Pick the files you want to include in the dataset
+    files = (
+        r"cool\stuff\2007-11-05_00-01-00.wav",
+        r"cool\things\2007-11-05_00-03-00.wav",
+        r"cool\things\2007-11-05_00-05-00.wav"
+    )
+
+    # Set the DESTINATION folder of the dataset in the folder parameter
+    dataset = Dataset(
+        folder = Path(r"cool\odd_hours"),
+        strptime_format="%Y-%m-%d_%H-%M-%S",
+    )
+
+    dataset.build_from_files(
+        files=files,
+        move_files=False,  # Copy files rather than moving them
+    )
 
 Running an ``Analysis``
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -139,12 +171,12 @@ The remaining parameters of the analysis (begin and end **Timestamps**, duration
 
 If the ``Analysis`` contains spectral computations (either ``AnalysisType.MATRIX``,
 ``AnalysisType.SPECTROGRAM`` or ``AnalysisType.WELCH`` is in ``analysis_type``),
 a `scipy ShortTimeFFT instance `_
 should be passed to the ``Analysis`` initializer.
 
+.. _editing_analysis:
+
 Checking/Editing the analysis
 """""""""""""""""""""""""""""
 
-.. _editing_analysis:
-
 If you want to take a peek at what the analysis output will be before actually running it,
@@ -218,11 +250,11 @@ The corresponding ``Analysis`` is the following:
     dataset.run_analysis(analysis=analysis)  # And that's it!
 
+.. _output_1:
+
 Output 1
 """"""""
 
-.. _output_1:
-
 Once the analysis is run, a :class:`osekit.core_api.audio_dataset.AudioDataset` instance named ``cool_reshape``
 has been created and added to the dataset's :attr:`osekit.public_api.dataset.Dataset.datasets` field.
 
 The dataset folder now looks like this:
diff --git a/src/osekit/public_api/dataset.py b/src/osekit/public_api/dataset.py
index e284350d..19b5629a 100644
--- a/src/osekit/public_api/dataset.py
+++ b/src/osekit/public_api/dataset.py
@@ -13,8 +13,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, TypeVar
 
-from pandas import Timestamp
-
 from osekit import config
 from osekit.config import DPDEFAULT, resample_quality_settings
 from osekit.core_api import audio_file_manager as afm
@@ -32,6 +30,11 @@
 from osekit.utils.path_utils import move_tree
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from os import PathLike
+
+    from pandas import Timestamp
+
     from osekit.core_api.audio_file import AudioFile
     from osekit.utils.job import JobBuilder
 
@@ -104,6 +107,7 @@ def __init__(  # noqa: PLR0913
         self.job_builder = job_builder
         self.instrument = instrument
         self.first_file_begin = first_file_begin
+        self.logger = None
 
     @property
     def origin_files(self) -> list[AudioFile] | None:
@@ -124,7 +128,9 @@ def analyses(self) -> list[str]:
         """Return the list of the names of the analyses ran with this Dataset."""
         return list({dataset["analysis"] for dataset in self.datasets.values()})
 
-    def build(self) -> None:
+    def build(
+        self,
+    ) -> None:
         """Build the Dataset.
 
         Building a dataset moves the original audio files to a specific folder
@@ -153,6 +159,7 @@ def build(self) -> None:
         }
 
         self.logger.info("Organizing dataset folder...")
+        afm.close()
         move_tree(
             source=self.folder,
             destination=self.folder / "other",
@@ -170,7 +177,45 @@ def build(self) -> None:
 
         self.logger.info("Build done!")
 
+    def build_from_files(
+        self,
+        files: Iterable[PathLike | str],
+        *,
+        move_files: bool = False,
+    ) -> None:
+        """Build the dataset from the specified files.
+
+        The files will be copied (or moved) to the dataset.folder folder.
+
+        Parameters
+        ----------
+        files: Iterable[PathLike|str]
+            Files that are included in the dataset.
+        move_files: bool
+            If set to True, the files will be moved (rather than copied) to the dataset
+            folder.
+
+        """
+        self._create_logger()
+
+        msg = f"{'Moving' if move_files else 'Copying'} files to the dataset folder."
+        self.logger.info(msg)
+
+        if not self.folder.exists():
+            self.folder.mkdir(mode=DPDEFAULT)
+
+        for file in map(Path, files):
+            destination = self.folder / file.name
+            if move_files:
+                file.replace(destination)
+            else:
+                shutil.copyfile(file, destination)
+
+        self.build()
+
     def _create_logger(self) -> None:
+        if self.logger:
+            return
         if not logging.getLogger("dataset").handlers:
             message = (
                 "Logging has not been configured. "
@@ -183,7 +228,7 @@ def _create_logger(self) -> None:
         logs_directory = self.folder / "log"
 
         if not logs_directory.exists():
-            logs_directory.mkdir(mode=DPDEFAULT)
+            logs_directory.mkdir(mode=DPDEFAULT, parents=True)
         self.logger = logging.getLogger("dataset").getChild(self.folder.name)
         file_handler = logging.FileHandler(logs_directory / "logs.log", mode="w")
         file_handler.setFormatter(
@@ -217,6 +262,7 @@ def reset(self) -> None:
             file.unlink()
 
         self.datasets = {}
+        self.logger = None
 
     def get_analysis_audiodataset(self, analysis: Analysis) -> AudioDataset:
         """Return an AudioDataset created from the analysis parameters.
diff --git a/tests/test_public_api.py b/tests/test_public_api.py
index 1add226e..b795614a 100644
--- a/tests/test_public_api.py
+++ b/tests/test_public_api.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from copy import deepcopy
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -1411,3 +1412,73 @@ def test_spectro_analysis_with_existing_ads(
         assert ad.begin == sd.begin
         assert ad.end == sd.end
         assert sd.audio_data == ad
+
+
+def test_build_specific_files(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    p1, p2, p3, p4 = (
+        Path(r"end.mp3"),
+        Path(r"glory.mp3"),
+        Path(r"slow.mp3"),
+        Path(r"story.mp3"),
+    )
+
+    base_folder = [p1, p2, p3, p4]
+    dest_folder = []
+
+    dataset = Dataset(
+        folder=tmp_path / "non_existing_folder",
+        strptime_format="%y%m%d%H%M%S",
+    )
+
+    def mock_copyfile(file: Path, destination: Path) -> None:
+        assert destination.parent == dataset.folder
+        dest_folder.append(file)
+
+    def mock_replace(self: Path, destination: Path) -> None:
+        assert destination.parent == dataset.folder
+        base_folder.remove(self)
+        dest_folder.append(self)
+
+    built_files = []
+
+    def build_mock(*args: list, **kwargs: dict) -> None:
+        for file in dest_folder:
+            built_files.append(file)
+
+    monkeypatch.setattr("shutil.copyfile", mock_copyfile)
+    monkeypatch.setattr(Path, "replace", mock_replace)
+    monkeypatch.setattr(Dataset, "build", build_mock)
+
+    mkdir_calls = []
+
+    def mkdir_mock(self: Path, *args: list, **kwargs: dict) -> None:
+        mkdir_calls.append(self)
+
+    monkeypatch.setattr(Path, "mkdir", mkdir_mock)
+
+    assert dataset.folder not in mkdir_calls
+
+    # Build from files COPY MODE
+    dataset.build_from_files(
+        (p1, p2),
+    )
+
+    assert dataset.folder in mkdir_calls
+
+    assert np.array_equal(base_folder, [p1, p2, p3, p4])
+    assert np.array_equal(dest_folder, [p1, p2])
+    assert np.array_equal(built_files, [p1, p2])
+
+    # Build from files MOVE MODE
+
+    dest_folder = []
+    built_files = []
+
+    dataset.build_from_files(
+        (p1, p2),
+        move_files=True,
+    )
+
+    assert np.array_equal(base_folder, [p3, p4])
+    assert np.array_equal(dest_folder, [p1, p2])
+    assert np.array_equal(built_files, [p1, p2])