diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml new file mode 100644 index 000000000..74dd68dd9 --- /dev/null +++ b/.github/workflows/publish-docs.yml @@ -0,0 +1,47 @@ +# This workflow builds and publishes the latest docs to +# the `gh-pages` branch. +# For more details: https://github.com/marketplace/actions/deploy-to-github-pages +name: Publish docs + +on: + release: + types: [created] + workflow_dispatch: + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + permissions: + contents: write + pages: write + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v2 + with: + # fetch all tags so `versioneer` can properly determine current version + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install pandoc + uses: pandoc/actions/setup@v1 + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install -r requirements-ml.txt + pip install -r requirements-reports.txt + pip install -r requirements-docs.txt + pip install -e . + + - name: Build + run: | + cd _docs/docs + python update_documentation.py + - name: Publish + uses: JamesIves/github-pages-deploy-action@v4 + with: + branch: gh-pages + folder: _docs/docs/LATEST/html diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml new file mode 100644 index 000000000..9a230cd27 --- /dev/null +++ b/.github/workflows/publish-package.yml @@ -0,0 +1,52 @@ +# This workflow publishes the package to pypi. +# For more details: +# https://docs.github.com/en/actions/guides/building-and-testing-python#publishing-to-package-registries +name: Publish to PyPi + +on: + release: + types: [created] + workflow_dispatch: + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + # fetch all tags so `versioneer` can properly determine current version + with: + fetch-depth: 0 + - name: Check if current commit is tagged + # fails and cancels release if the current commit is not tagged + run: | + git describe --exact-match --tags + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + if [ -f requirements-ml.txt ]; then pip install -r requirements-ml.txt; fi + if [ -f requirements-reports.txt ]; then pip install -r requirements-reports.txt; fi + pip install setuptools wheel twine + - name: Build + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_REPOSITORY: pypi + run: | + python setup.py sdist bdist_wheel + - name: Test build + # fails and cancels release if the built package fails to import + run: | + pip install dist/*.whl + python -c 'import dataprofiler; print(dataprofiler.__version__)' + - name: Publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_REPOSITORY: pypi + run: | + twine upload dist/* diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml deleted file mode 100644 index 75b9a41e2..000000000 --- a/.github/workflows/publish-python-package.yml +++ /dev/null @@ -1,38 +0,0 @@ - -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: 
https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -name: Publish Python Package - -on: - release: - types: [created] - branches: - - 'release/*' - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - if [ -f requirements-ml.txt ]; then pip install -r requirements-ml.txt; fi - if [ -f requirements-reports.txt ]; then pip install -r requirements-reports.txt; fi - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - TWINE_REPOSITORY: pypi - run: | - python setup.py sdist bdist_wheel - twine upload dist/* diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-package.yml similarity index 88% rename from .github/workflows/test-python-package.yml rename to .github/workflows/test-package.yml index fa84b3d3a..47416a938 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-package.yml @@ -7,8 +7,6 @@ on: pull_request: branches: - 'main' - - 'feature/**' - - 'dev' jobs: build: @@ -16,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8, 3.9, "3.10"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v4 @@ -38,4 +36,4 @@ jobs: pre-commit run --all-files - name: Test with pytest run: | - DATAPROFILER_SEED=0 pytest --forked --cov=dataprofiler --cov-fail-under=80 + DATAPROFILER_SEED=0 pytest --cov=dataprofiler --cov-fail-under=80 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7baeb59ec..666cde4b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,7 @@ repos: rev: 22.3.0 hooks: - id: black + exclude: (versioneer.py|dataprofiler/_version.py|_docs/) types: [file, python] language_version: python3 # Isort: sort import statements @@ -15,6 +16,7 @@ repos: rev: 5.12.0 hooks: - id: isort + exclude: _docs/ language_version: python3 # Flake8: complexity and style checking # https://flake8.pycqa.org/en/latest/user/using-hooks.html @@ -23,7 +25,7 @@ repos: hooks: - id: flake8 additional_dependencies: [flake8-docstrings] - exclude: (^docs/|^dataprofiler/tests/|^.*/__init__.py) + exclude: (^docs/|^dataprofiler/tests/|^.*/__init__.py|_docs/) language_version: python3 # General fixers: format files for white spaces and trailing new lines, warn on debug statements # https://github.com/pre-commit/pre-commit-hooks#hooks-available @@ -31,31 +33,31 @@ repos: rev: v4.0.1 hooks: - id: trailing-whitespace - exclude: (^dataprofiler/tests/data/|^dataprofiler/tests/speed_tests/data/) + exclude: (^dataprofiler/tests/data/|^dataprofiler/tests/speed_tests/data/|_docs/) - id: debug-statements - id: end-of-file-fixer - exclude: (^dataprofiler/tests/data/) + exclude: (^dataprofiler/tests/data/|_docs/) # Mypy: Optional static type checking # https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.982 hooks: - id: mypy - exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/) + exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/|versioneer.py|dataprofiler/_version.py|_docs/) language_version: python3 additional_dependencies: # Keep up-to-date with the respective requirement 
files [ # requirements.txt h5py>=2.10.0, wheel>=0.33.1, - numpy>=1.22.0, + numpy<2.0.0, pandas>=1.1.2, python-dateutil>=2.7.5, pytz>=2020.1, pyarrow>=1.0.1, chardet>=3.0.4, fastavro>=1.0.0.post1, - python-snappy>=0.5.4, + python-snappy>=0.7.1, charset-normalizer>=1.3.6, psutil>=4.0.0, scipy>=1.4.1, @@ -80,7 +82,7 @@ repos: # requirements-ml.txt scikit-learn>=0.23.2, - 'keras>=2.4.3,<3.0.0', + 'keras>=2.4.3,<=3.4.0', rapidfuzz>=2.6.1, "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'", "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'", @@ -93,7 +95,6 @@ repos: # requirements-test.txt coverage>=5.0.1, - dask>=2.29.0, fsspec>=0.3.3, pytest>=6.0.1, pytest-cov>=2.8.1, @@ -108,7 +109,7 @@ repos: rev: "0.48" hooks: - id: check-manifest - additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas', + additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] @@ -118,11 +119,13 @@ repos: hooks: - id: pyupgrade args: ["--py38-plus"] + exclude: (versioneer.py|dataprofiler/_version.py| _docs/) # Autoflake - cleanup unused variables and imports - repo: https://github.com/PyCQA/autoflake rev: v2.0.0 hooks: - id: autoflake + exclude: _docs/ args: - "--in-place" - "--ignore-pass-statements" diff --git a/.whitesource b/.whitesource new file mode 100644 index 000000000..37dfa8e25 --- /dev/null +++ b/.whitesource @@ -0,0 +1,3 @@ +{ + "settingsInheritedFrom": "capitalone/whitesource-config" +} diff --git a/MANIFEST.in b/MANIFEST.in index 12480abd8..3f426b7bb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ global-exclude .DS_Store +global-exclude */__pycache__/* include *.txt include CODEOWNERS @@ -16,4 +17,17 @@ recursive-include resources *.json recursive-include resources *.pb recursive-include resources *.py -recursive-include dataprofiler/labelers/embeddings/ *.txt +recursive-include dataprofiler/labelers/embeddings *.txt +include versioneer.py +include dataprofiler/_version.py +include .whitesource + +recursive-exclude _docs *.html +recursive-exclude _docs *.cfg +exclude _docs/LICENSE +recursive-exclude _docs *.md +recursive-exclude _docs *.nojekyll +recursive-exclude _docs *.png +recursive-exclude _docs *.py +recursive-exclude _docs *.rst +recursive-exclude _docs Makefile diff --git a/_docs/README.md b/_docs/README.md new file mode 100644 index 000000000..0f925ac58 --- /dev/null +++ b/_docs/README.md @@ -0,0 +1,59 @@ +Visit our [documentation page.](https://capitalone.github.io/DataProfiler) + +### How to properly write documentation: + +#### Packages +In any package directory, overall package comments can be made in the +\_\_init\_\_.py of the directory. At the top of the \_\_init\_\_.py, +include your comments in between triple quotations. + +#### Classes +In any class file, include overall class comments at the top of the file +in between triple quotes and/or in the init function. + +#### Functions +reStructuredText Docstring Format is the standard. 
Here is an example: + + def format_data(self, predictions, verbose=False): + """ + Formats word level labeling of the Unstructured Data Labeler as you want + + :param predictions: A 2D list of word level predictions/labeling + :type predictions: Dict + :param verbose: A flag to determine verbosity + :type verbose: Bool + :return: JSON structure containing specified formatted output + :rtype: JSON + + :Example: + Look at this test. Don't forget the double colons to make a code block:: + This is a codeblock + Type example code here + """ + +### How to update the documentation: + + +1. Set up your local environment +```bash +# install sphinx requirements +# install the requirements from the feature branch +pip install pandoc && +pip install -r requirements.txt && +pip install -r requirements-ml.txt && +pip install -r requirements-reports.txt && +pip install -r requirements-docs.txt && +pip install -e . + +``` +2. And finally, from the root of `DataProfiler`, run the following commands to generate the sphinx documentation: +```bash +cd _docs/docs +python update_documentation.py + +``` + +3. View new docs +```bash +open index.html +``` diff --git a/_docs/docs/Makefile b/_docs/docs/Makefile new file mode 100644 index 000000000..81ca02cf5 --- /dev/null +++ b/_docs/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = buildcode + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/_docs/docs/make.bat b/_docs/docs/make.bat new file mode 100644 index 000000000..6247f7e23 --- /dev/null +++ b/_docs/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/_docs/docs/source/API.rst b/_docs/docs/source/API.rst new file mode 100644 index 000000000..fdbf2242b --- /dev/null +++ b/_docs/docs/source/API.rst @@ -0,0 +1,16 @@ +.. _API: + +API +*** + +The API is split into 4 main components: Profilers, Labelers, Data Readers, and +Validators. + +.. 
toctree:: + :maxdepth: 1 + :caption: Contents: + + dataprofiler.data_readers + dataprofiler.profilers + dataprofiler.labelers + dataprofiler.validators \ No newline at end of file diff --git a/_docs/docs/source/DL-Flowchart.png b/_docs/docs/source/DL-Flowchart.png new file mode 100644 index 000000000..696eeb5dc Binary files /dev/null and b/_docs/docs/source/DL-Flowchart.png differ diff --git a/_docs/docs/source/_static/custom.css b/_docs/docs/source/_static/custom.css new file mode 100644 index 000000000..8a7c7cb54 --- /dev/null +++ b/_docs/docs/source/_static/custom.css @@ -0,0 +1,50 @@ +/* + the ipython3 code blocks coming from the notebooks + were not getting the dark theme styles applied, so + manually overriding them +*/ +@media (prefers-color-scheme: dark) { + .highlight-ipython3 { + border: none !important; + border-radius: 2px !important; + background: #202020 !important; + color: #d0d0d0 !important; + } +} + +@media (prefers-color-scheme: dark) { + tr:nth-child(odd) { + background-color: #202020 !important; + } +} + +@media (prefers-color-scheme: dark) { + .dataframe { + color: white !important; + } +} + +.hidden { + display: none; +} + +.version { + text-align: right; + font-size: 24px; + margin-top: -47px; + margin-right: 3px; +} + +.sidebar-brand { + margin-bottom: -10px; + margin-top: 10px; +} + +/* unknown warning was showing, manually hiding */ +#Visualizing-Logged-Dataframes .admonition.warning { + display: none; +} + +div.output_area.stderr { + display: none; +} diff --git a/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png b/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png new file mode 100644 index 000000000..a339e0f6a Binary files /dev/null and b/_docs/docs/source/_static/images/DataProfilerDarkLogoLong.png differ diff --git a/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png b/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files /dev/null and b/_docs/docs/source/_static/images/DataProfilerLogoLightTheme.png differ diff --git a/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png b/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png new file mode 100644 index 000000000..ca86fe167 Binary files /dev/null and b/_docs/docs/source/_static/images/DataProfilerLogoLightThemeLong.png differ diff --git a/_docs/docs/source/_static/images/branching_workflow_diagram.png b/_docs/docs/source/_static/images/branching_workflow_diagram.png new file mode 100644 index 000000000..60a9515d0 Binary files /dev/null and b/_docs/docs/source/_static/images/branching_workflow_diagram.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_0.png b/_docs/docs/source/_static/images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_0.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_1.png b/_docs/docs/source/_static/images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_1.png differ diff --git a/_docs/docs/source/_static/images/histogram_example_2.png b/_docs/docs/source/_static/images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/_docs/docs/source/_static/images/histogram_example_2.png differ diff --git 
a/_docs/docs/source/_static/images/missing_value_barchart_example_0.png b/_docs/docs/source/_static/images/missing_value_barchart_example_0.png new file mode 100644 index 000000000..33cb7afd2 Binary files /dev/null and b/_docs/docs/source/_static/images/missing_value_barchart_example_0.png differ diff --git a/_docs/docs/source/_static/images/missing_value_matrix_example_0.png b/_docs/docs/source/_static/images/missing_value_matrix_example_0.png new file mode 100644 index 000000000..21799cddf Binary files /dev/null and b/_docs/docs/source/_static/images/missing_value_matrix_example_0.png differ diff --git a/_docs/docs/source/add_new_model_to_data_labeler.nblink b/_docs/docs/source/add_new_model_to_data_labeler.nblink new file mode 100644 index 000000000..4c5fe646a --- /dev/null +++ b/_docs/docs/source/add_new_model_to_data_labeler.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/add_new_model_to_data_labeler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/architecture.rst b/_docs/docs/source/architecture.rst new file mode 100644 index 000000000..469308993 --- /dev/null +++ b/_docs/docs/source/architecture.rst @@ -0,0 +1,48 @@ +.. _architecture: + +Architecture & Design Overview +****************************** + +This section describes the design rationale, algorithmic choices, assumptions, testing strategy, and contribution process used in the DataProfiler library. + +Overview +-------- + +DataProfiler computes numeric statistics (e.g., mean, variance, skewness, kurtosis) using **streaming algorithms** that allow efficient, incremental updates without recomputing from raw data. Approximate quantile metrics like the median are calculated using histogram-based estimation, making the system scalable for large or streaming datasets. + +Additionally, DataProfiler uses a **Convolutional Neural Network (CNN)** to detect and label entities (e.g., names, emails, credit cards) in unstructured text. This supports critical tasks such as **PII detection**, **schema inference**, and **data quality analysis** across structured and unstructured data. + +Algorithm Rationale +------------------- + +The algorithms used are designed for **speed, scalability, and flexibility**: + +- **Streaming numeric methods** (e.g., Welford's algorithm, moment-based metrics, histogram binning) efficiently summarize data without full recomputation. +- **CNNs for entity detection** are fast, high-throughput, and well-suited for sequence labeling tasks in production environments. + +These choices align with the tool's goal of delivering fast, accurate data profiling with minimal configuration. + +Assumptions & Limitations +------------------------- + +- **Consistent formatting** of sensitive entities is assumed (e.g., standardized credit card or SSN formats). +- **Overlapping entity types** (e.g., phone vs. SSN) may lead to misclassification without context. +- **Synthetic training data** may not fully capture real-world diversity, reducing model accuracy on natural or unstructured text. +- **Quantile estimation** (e.g., median) is approximate and based on binning rather than exact sorting. + +Testing & Validation +-------------------- + +- Comprehensive **unit testing** is performed across Python 3.9, 3.10, and 3.11. +- Tests are executed on every pull request targeting `dev` or `main` branches. +- All pull requests require **two code reviewer approvals** before merging. +- Testing includes correctness, performance, and compatibility checks to ensure production readiness. 
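As a concrete illustration of the streaming approach described under Algorithm Rationale, the sketch below shows Welford-style incremental mean and variance updates. It is a minimal, generic sketch of the technique, not DataProfiler's internal implementation; the class and variable names are hypothetical.

.. code-block:: python

    class StreamingStats:
        """Welford-style running statistics: O(1) update per value."""

        def __init__(self) -> None:
            self.count = 0
            self.mean = 0.0
            self._m2 = 0.0  # running sum of squared deviations from the mean

        def update(self, value: float) -> None:
            # Incorporate one new observation without revisiting prior data.
            self.count += 1
            delta = value - self.mean
            self.mean += delta / self.count
            self._m2 += delta * (value - self.mean)

        @property
        def variance(self) -> float:
            # Sample variance; defined only once two observations are seen.
            return self._m2 / (self.count - 1) if self.count > 1 else 0.0


    stats = StreamingStats()
    for batch in ([1.0, 2.0, 3.5], [10.0, 2.2]):  # data arriving in batches
        for value in batch:
            stats.update(value)
    print(stats.count, stats.mean, stats.variance)

The same philosophy applies to the approximate quantiles noted above: histogram bin counts can be updated incrementally and the median read off the binned distribution, rather than re-sorting the raw values.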
+ +Versioning & Contributions +-------------------------- + +- Versioning and development are managed via **GitHub**. +- Future changes must follow the guidelines in `CONTRIBUTING.md`, including: + - Forking the repo and branching from `dev` or an active feature branch. + - Ensuring **80%+ unit test coverage** for all new functionality. + - Opening a PR and securing **two approvals** prior to merging. diff --git a/_docs/docs/source/column_name_labeler_example.nblink b/_docs/docs/source/column_name_labeler_example.nblink new file mode 100644 index 000000000..c39e674fb --- /dev/null +++ b/_docs/docs/source/column_name_labeler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/column_name_labeler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/conf.py b/_docs/docs/source/conf.py new file mode 100644 index 000000000..80168effd --- /dev/null +++ b/_docs/docs/source/conf.py @@ -0,0 +1,84 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +import re + +sys.path.insert(0, os.path.abspath(f'../../../')) + +# -- Project information ----------------------------------------------------- + +project = 'Data Profiler' +copyright = '2024, Jeremy Goodsitt, Austin Walters, Anh Truong, Grant Eden, and Chris Wallace' +author = 'Jeremy Goodsitt, Austin Walters, Anh Truong, Grant Eden, and Chris Wallace' + +# The full version, including alpha/beta/rc tags +# release = '21.01.20' +from dataprofiler import __version__ as version # noqa F401 + + +version_clip = re.search(r'\s*([\d.]+)', version).group(1) +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', + 'nbsphinx', + 'nbsphinx_link', +] + +# Don't execute the notebook cells when generating the documentation +# This can be configured on a per notebook basis as well +# See: https://nbsphinx.readthedocs.io/en/0.2.15/never-execute.html#Explicitly-Dis-/ +nbsphinx_execute = "never" +nbsphinx_prolog = """ +`View this notebook on GitHub `_ +""" + +autoclass_content = 'both' +autodoc_default_options = { + 'members': True, + 'member-order': 'bysource', + 'undoc-members': True, + 'exclude-members': '__weakref__', + 'inherited-members': True, +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "furo" +html_title = f"
v{version_clip}
" +html_static_path = ["_static"] +html_css_files = ["custom.css"] +html_favicon = "_static/images/DataProfilerLogoLightTheme.png" +html_theme_options = { + "light_logo": "images/DataProfilerLogoLightThemeLong.png", + "dark_logo": "images/DataProfilerDarkLogoLong.png", +} diff --git a/_docs/docs/source/data_labeling.rst b/_docs/docs/source/data_labeling.rst new file mode 100644 index 000000000..db76fe791 --- /dev/null +++ b/_docs/docs/source/data_labeling.rst @@ -0,0 +1,365 @@ +.. _data_labeling: + +Labeler (Sensitive Data) +************************ + +In this library, the term *data labeling* refers to entity recognition. + +Builtin to the data profiler is a classifier which evaluates the complex data types of the dataset. +For structured data, it determines the complex data type of each column. When +running the data profile, it uses the default data labeling model builtin to the +library. However, the data labeler allows users to train their own data labeler +as well. + +*Data Labels* are determined per cell for structured data (column/row when +the *profiler* is used) or at the character level for unstructured data. This +is a list of the default labels. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Identify Entities in Structured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Makes predictions and identifying labels: + +.. code-block:: python + + import dataprofiler as dp + + # load data and data labeler + data = dp.Data("your_data.csv") + data_labeler = dp.DataLabeler(labeler_type='structured') + + # make predictions and get labels per cell + predictions = data_labeler.predict(data) + +Identify Entities in Unstructured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Predict which class characters belong to in unstructured text: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured') + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Prediction what class each character belongs to + model_predictions = data_labeler.predict( + sample, predict_options=dict(show_confidences=True)) + + # Predictions / confidences are at the character level + final_results = model_predictions["pred"] + final_confidences = model_predictions["conf"] + +It's also possible to change output formats, output similar to a **SpaCy** format: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True) + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. 
I'm a BAN: 000043219499392912.\n"] + + # Set the output to the NER format (start position, end position, label) + data_labeler.set_params( + { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } + ) + + results = data_labeler.predict(sample) + + print(results) + +Train a New Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for training your own data labeler on their own set of structured data +(tabular): + +.. code-block:: python + + import dataprofiler as dp + + # Will need one column with a default label of UNKNOWN + data = dp.Data("your_file.csv") + + data_labeler = dp.train_structured_labeler( + data=data, + save_dirpath="/path/to/save/labeler", + epochs=2 + ) + + data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse + +Load an Existing Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for loading an existing data_labeler: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler( + labeler_type='structured', dirpath="/path/to/my/labeler") + + # get information about the parameters/inputs/output formats for the DataLabeler + data_labeler.help() + +Extending a Data Labeler with Transfer Learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Extending or changing labels of a data labeler w/ transfer learning: +Note: By default, **a labeler loaded will not be trainable**. In order to load a +trainable DataLabeler, the user must set `trainable=True` or load a labeler +using the `TrainableDataLabeler` class. + +The following illustrates how to change the labels: + +.. code-block:: python + + import dataprofiler as dp + + labels = ['label1', 'label2', ...] # new label set can also be an encoding dict + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will use transfer learning to retrain the data labeler on your new + # dataset and labels. + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2, labels=labels) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + +The following illustrates how to extend the labels: + +.. code-block:: python + + import dataprofiler as dp + + new_labels = ['label1', 'label2', ...] 
+ data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will maintain current labels and model weights, but extend the model's + # labels + for label in new_labels: + data_labeler.add_label(label) + + # NOTE: a user can also add a label which maps to the same index as an existing + # label + # data_labeler.add_label(label, same_as='') + + # For a trainable model, the user must then train the model to be able to + # continue using the labeler since the model's graph has likely changed + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + + +Changing pipeline parameters: + +.. code-block:: python + + import dataprofiler as dp + + # load default Data Labeler + data_labeler = dp.DataLabeler(labeler_type='structured') + + # change parameters of specific component + data_labeler.preprocessor.set_params({'param1': 'value1'}) + + # change multiple simultaneously. + data_labeler.set_params({ + 'preprocessor': {'param1': 'value1'}, + 'model': {'param2': 'value2'}, + 'postprocessor': {'param3': 'value3'} + }) + + +Build Your Own Data Labeler +=========================== + +The DataLabeler has 3 main components: preprocessor, model, and postprocessor. +To create your own DataLabeler, each one would have to be created or an +existing component can be reused. + +Given a set of the 3 components, you can construct your own DataLabeler: + +.. code-block:: python + from dataprofiler.labelers.base_data_labeler import BaseDataLabeler, \ + TrainableDataLabeler + from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + # load a non-trainable data labeler + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = BaseDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + + # load trainable data labeler + data_labeler = TrainableDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + +Option for swapping out specific components of an existing labeler. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.labelers.character_level_cnn_model import \ + CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) 
+ + data_labeler = dp.DataLabeler(labeler_type='structured') + data_labeler.set_preprocessor(preprocessor) + data_labeler.set_model(model) + data_labeler.set_postprocessor(postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + +Model Component +~~~~~~~~~~~~~~~ + +In order to create your own model component for data labeling, you can utilize +the `BaseModel` class from `dataprofiler.labelers.base_model` and +overriding the abstract class methods. + +Reviewing `CharacterLevelCnnModel` from +`dataprofiler.labelers.character_level_cnn_model` illustrates the functions +which need an override. + +#. `__init__`: specifying default parameters and calling base `__init__` +#. `_validate_parameters`: validating parameters given by user during setting +#. `_need_to_reconstruct_model`: flag for when to reconstruct a model (i.e. + parameters change or labels change require a model reconstruction) +#. `_construct_model`: initial construction of the model given the parameters +#. `_reconstruct_model`: updates model architecture for new label set while + maintaining current model weights +#. `fit`: mechanism for the model to learn given training data +#. `predict`: mechanism for model to make predictions on data +#. `details`: prints a summary of the model construction +#. `save_to_disk`: saves model and model parameters to disk +#. `load_from_disk`: loads model given a path on disk + + +Preprocessor Component +~~~~~~~~~~~~~~~~~~~~~~ + +In order to create your own preprocessor component for data labeling, you can +utilize the `BaseDataPreprocessor` class +from `dataprofiler.labelers.data_processing` and override the abstract class +methods. + +Reviewing `StructCharPreprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the user data and converts it into an digestible, + iterable format for the model +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor + +Postprocessor Component +~~~~~~~~~~~~~~~~~~~~~~~ + +The postprocessor is nearly identical to the preprocessor except it handles +the output of the model for processing. In order to create your own +postprocessor component for data labeling, you can utilize the +`BaseDataPostprocessor` class from `dataprofiler.labelers.data_processing` +and override the abstract class methods. + +Reviewing `StructCharPostprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the output of the model and processes for output to + the user +#. 
`set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor diff --git a/_docs/docs/source/data_reader.nblink b/_docs/docs/source/data_reader.nblink new file mode 100644 index 000000000..8d7215f46 --- /dev/null +++ b/_docs/docs/source/data_reader.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/data_readers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/data_readers.rst b/_docs/docs/source/data_readers.rst new file mode 100644 index 000000000..877ea56dd --- /dev/null +++ b/_docs/docs/source/data_readers.rst @@ -0,0 +1,184 @@ +.. _data_readers: + +Data Readers +************ + +The `Data` class itself will identify then output one of the following `Data` class types. +Using the data reader is easy, just pass it through the Data object. + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("your_file.csv") + +The supported file types are: + +* CSV file (or any delimited file) +* JSON object +* Avro file +* Parquet file +* Graph data file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + +It's also possible to specifically call one of the data classes such as the following command: + +.. code-block:: python + + from dataprofiler.data_readers.csv_data import CSVData + data = CSVData("your_file.csv", options={"delimiter": ","}) + +Additionally any of the data classes can be loaded using a URL: + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"}) + +Below are descriptions of the various `Data` classes and the available options. + +CSVData +======= + +Data class for loading datasets of type CSV. Can be specified by passing +in memory data or via a file path. Options pertaining the CSV may also +be specified using the options dict parameter. + +`CSVData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - Must be a string, for example `"delimiter": ","` +* data_format - Must be a string, possible choices: "dataframe", "records" +* selected_columns - Columns being selected from the entire dataset, must be a + list `["column 1", "ssn"]` +* sample_nrows - Reservoir sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +JSONData +======== + +Data class for loading datasets of type JSON. Can be specified by +passing in memory data or via a file path. Options pertaining the JSON +may also be specified using the options dict parameter. JSON data can be +accessed via the "data" property, the "metadata" property, and the +"data_and_metadata" property. 
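For illustration, the sketch below exercises those three properties. It assumes `JSONData` can be imported from `dataprofiler.data_readers.json_data`, mirroring the `CSVData` import shown earlier; the file name and option value are placeholders (the available options are listed below).

.. code-block:: python

    from dataprofiler.data_readers.json_data import JSONData

    # Placeholder file; "flattened_dataframe" is one of the data_format
    # choices described in the options below.
    data = JSONData("your_file.json",
                    options={"data_format": "flattened_dataframe"})

    payload = data.data                # the payload flattened into a DataFrame
    meta = data.metadata               # non-payload keys (e.g., a "response" field)
    combined = data.data_and_metadata  # both views together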
+ +`JSONData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* payload_keys - The dictionary keys for the payload of the JSON, typically called "data" + or "payload". Defaults to ["data", "payload", "response"]. + + +AVROData +======== + +Data class for loading datasets of type AVRO. Can be specified by +passing in memory data or via a file path. Options pertaining the AVRO +may also be specified using the options dict parameter. + +`AVROData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "avro", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for AVROs with a JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` + +ParquetData +=========== + +Data class for loading datasets of type PARQUET. Can be specified by +passing in memory data or via a file path. Options pertaining the +PARQUET may also be specified using the options dict parameter. + +`ParquetData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json" +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* sample_nrows - Random sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. + +GraphData +========= + +Data Class for loading datasets of graph data. Currently takes CSV format, +further type formats will be supported. Can be specified by passing +in memory data (NetworkX Graph) or via a file path. Options pertaining the CSV file may also +be specified using the options dict parameter. Loads data from CSV into memory +as a NetworkX Graph. + +`GraphData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - must be a string, for example `"delimiter": ","` +* data_format - must be a string, possible choices: "graph", "dataframe", "records" +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +TextData +======== + +Data class for loading datasets of type TEXT. Can be specified by +passing in memory data or via a file path. Options pertaining the TEXT +may also be specified using the options dict parameter. + +`TextData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format: user selected format in which to return data. Currently only supports "text". +* samples_per_line - chunks by which to read in the specified dataset + + +Data Using a URL +================ + +Data class for loading datasets of any type using a URL. Specified by passing in +any valid URL that points to one of the valid data types. Options pertaining the +URL may also be specified using the options dict parameter. 
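As an illustrative sketch, the example below loads a sample CSV hosted in the DataProfiler repository; the returned object is expected to resolve to the CSV data class because the type is inferred from the content behind the URL, and the `verify_ssl` option is described below.

.. code-block:: python

    import dataprofiler as dp

    # Sample CSV from the DataProfiler repository; any supported file type
    # behind a URL should load the same way.
    url = ("https://raw.githubusercontent.com/capitalone/DataProfiler/main/"
           "dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv")

    data = dp.Data(url, options={"verify_ssl": "True"})
    print(type(data).__name__)  # expected: the CSV data class
    print(data.data.head())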
+ +`Data(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* verify_ssl: must be a boolean string, choices: "True", "False". Set to "True" by default. + +Data Using an AWS S3 URI +======================== + +Data class for loading datasets from AWS S3 URI. Specified by passing in +any valid bucket path that points to one of the valid data types. + +`Data('s3a://my-bucket/file_name.txt')` + +Possible `options`: + +* `storage_options`: must be a dictionary where the keys for boto3 initialization are set + If `storage_options` is provided in `options`, the below variables are retrieved from the dictionary provided. Otherwise, will retrieve from `environment variables `_. + + * `AWS_ACCESS_KEY_ID` + * `AWS_SECRET_ACCESS_KEY` + * `AWS_SESSION_TOKEN` + * `AWS_REGION` (default `us-east-1`) diff --git a/_docs/docs/source/examples.rst b/_docs/docs/source/examples.rst new file mode 100644 index 000000000..3637da6ac --- /dev/null +++ b/_docs/docs/source/examples.rst @@ -0,0 +1,24 @@ +.. _examples: + +Examples +******** + +These examples provide a more in-depth look into the details of the ``Data Profiler`` library. + +Basics +------ + +.. toctree:: + :maxdepth: 0 + + Overview of Data Profiler + Data Reader + Structured Profiler + Unstructured Profiler + Graph Profiler + Labeler + Adding Models to a Labeler Pipeline + Creating a Regex Labeler + Creating a ColumnName Labeler + Merge Profile List + Dataloader with Popmon Reports diff --git a/_docs/docs/source/graph_data_demo.nblink b/_docs/docs/source/graph_data_demo.nblink new file mode 100644 index 000000000..40408c3ae --- /dev/null +++ b/_docs/docs/source/graph_data_demo.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/graph_data_demo.ipynb" +} diff --git a/_docs/docs/source/graphs.rst b/_docs/docs/source/graphs.rst new file mode 100644 index 000000000..23c2d316b --- /dev/null +++ b/_docs/docs/source/graphs.rst @@ -0,0 +1,196 @@ +.. _reports: + +Graphs +****** + +Graph Your Data +=============== + +We can plot some of our data as seaborn histogram plots. Below will demonstrate how to do so and provide examples. + +The following plots are currently available to work directly with your profilers: + + * histogram (numeric columns only) + * missing values matrix + +Below shows how to do so with examples. + +What we need to import +~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: python + + from dataprofiler.reports import graphs + +The main functions that is used to plot histograms are in graphs. **You will also need the `dataprofiler[reports]` requirement to be installed**: + +.. code-block:: console + + pip install 'dataprofiler[reports]' + +Plotting from a StructuredProfiler class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms. + +.. code-block:: python + + graphs.plot_histograms(profiler, column_names, column_inds) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **columns** - (Optional) The list of IntColumn or FloatColumn *names* we want to specifically plot. If specified, `column_inds` cannot be specified. + * **column_inds** - (Optional) The list of IntColumn or FloatColumn *indexes* we want to specifically plot. If specified, `column_names` cannot be specified. + + +Additionally, we can also plot the missing values matrix for a StructuredProfiler: + +.. 
code-block:: python + + graphs.plot_missing_values_matrix(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. + * **title** - (Optional) The title of the axes we want to define. + + +Plotting an individual IntColumn or FloatColumn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a column's Int or Float profile, we can plot their respective histograms. + +.. code-block:: python + + graphs.plot_col_histogram(column, axes, title) + +These are what the variables mean: + + * **column** - The IntColumn or FloatColumn we want to plot + * **axes** - (Optional) The MatPlotLib Axes to plot the histogram within. + * **title** - (Optional) The title of the axes we want to define. + + +Additionally, we can also plot the missing values bargraph for any column profile: + +.. code-block:: python + + graphs.plot_col_missing_values(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - The StructuredColProfiler we want to plot + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. + * **title** - (Optional) The title of the axes we want to define. + +Examples +~~~~~~~~ + +Histograms +---------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = [[1, 'a', 1.0], + [2, 'b', 2.2], + [3, 'c', 3.5], + [None, 'd', 10.0]] + profiler = dp.StructuredProfiler(data) + + # This will plot all IntColumn and FloatColumn as histograms (The first and last column). + fig = graphs.plot_histograms(profiler) + fig.show() + + # This will only plot the specified column, 0. + columns_names = [0] + fig = graphs.plot_histograms(profiler, columns_names) + fig.show() + +.. image:: _static/images/histogram_example_0.png + :alt: First Histogram Example Image + +.. image:: _static/images/histogram_example_1.png + :alt: Second Histogram Example Image + +2. This example demonstrates how we can plot a low level profiler. + +.. code-block:: python + + import pandas as pd + + from dataprofiler.profilers import IntColumn + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3], dtype=str) + profiler = IntColumn('example') + profiler.update(data) + + # Plot the axes + ax = graphs.plot_col_histogram(profiler) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/histogram_example_2.png + :alt: Histogram Column Only Example Image + + +Missing Values Matrix +--------------------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot a missing values matrix. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = pd.DataFrame( + [[None, '', 1.0, '1/2/2021'], + [3, None, 3.5, ''], + [1, None, 1.0, '2/5/2020'], + [None, 1, 10.0, '3/5/2020']], + columns=['integer', 'str', 'float', 'datetime'], + dtype=object + ) + profiler = dp.StructuredProfiler(data) + + # This will plot the missing values matrix for all columns. + fig = graphs.plot_missing_values_matrix(profiler) + fig.show() + +.. image:: _static/images/missing_value_matrix_example_0.png + :alt: Missing Values Matrix Example Image + +2. This example demonstrates how we can plot barchart of a column's missing values. + +.. 
code-block:: python + + import pandas as pd + + from dataprofiler.profilers.profile_builder import StructuredColProfiler + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3, None, None, 4], name='example', dtype=str) + profiler = StructuredColProfiler(data) + + # Plot the axes, can be a list of multiple columns + ax = graphs.plot_col_missing_values([profiler]) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/missing_value_barchart_example_0.png + :alt: Missing Values Column Only Example Image \ No newline at end of file diff --git a/_docs/docs/source/index.rst b/_docs/docs/source/index.rst new file mode 100644 index 000000000..8225be28f --- /dev/null +++ b/_docs/docs/source/index.rst @@ -0,0 +1,479 @@ +.. _Data Profiler: + +==================================== +Data Profiler | What's in your data? +==================================== + +Purpose +======= + +The DataProfiler is a Python library designed to make data analysis, monitoring and **sensitive data detection** easy. + +Loading **Data** with a single command, the library automatically formats & loads files into a DataFrame. **Profiling** the Data, the library identifies the schema, statistics, entities and more. Data Profiles can then be used in downstream applications or reports. + +The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify **sensitive data** (or **PII**). If customization is needed, it's easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition. + +The best part? Getting started only takes a few lines of code (`Example CSV`_): + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + readable_report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(readable_report, indent=4)) + + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +If you have suggestions or find a bug, please open an `issue`_. + +Visit the :ref:`API` to explore Data Profiler's terminology. + + +What is a Data Profile? +======================= + +In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are "global statistics" or `global_stats`, which contain dataset level data and there are "column/row level statistics" or `data_stats` (each column is a new key-value entry). + +The format for a structured profile is below: + +.. 
code-block:: python + + "global_stats": { + "samples_used": int, + "column_count": int, + "row_count": int, + "row_has_null_ratio": float, + "row_is_null_ratio": float, + "unique_row_ratio": float, + "duplicate_row_count": int, + "file_type": string, + "encoding": string, + "correlation_matrix": list[list[int]], (*) + "chi2_matrix": list[list[float]], + "profile_schema": dict[string, list[int]] + }, + "data_stats": [ + { + "column_name": string, + "data_type": string, + "data_label": string, + "categorical": bool, + "order": string, + "samples": list[str], + "statistics": { + "sample_size": int, + "null_count": int, + "null_types": list[string], + "null_types_index": dict[string, list[int]], + "data_type_representation": dict[string, list[string]], + "min": [null, float], + "max": [null, float], + "sum": float, + "mode": list[float], + "median": float, + "median_absolute_deviation": float, + "mean": float, + "variance": float, + "stddev": float, + "skewness": float, + "kurtosis": float, + "num_zeros": int, + "num_negatives": int, + "histogram": { + "bin_counts": list[int], + "bin_edges": list[float], + }, + "quantiles": { + int: float + }, + "vocab": list[char], + "avg_predictions": dict[string, float], + "data_label_representation": dict[string, float], + "categories": list[str], + "unique_count": int, + "unique_ratio": float, + "categorical_count": dict[string, int], + "gini_impurity": float, + "unalikeability": float, + "precision": { + 'min': int, + 'max': int, + 'mean': float, + 'var': float, + 'std': float, + 'sample_size': int, + 'margin_of_error': float, + 'confidence_level': float + }, + "times": dict[string, float], + "format": string + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + ] + +(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enable option set to True. + +The format for an unstructured profile is below: + +.. code-block:: python + + "global_stats": { + "samples_used": int, + "empty_line_count": int, + "file_type": string, + "encoding": string, + "memory_size": float, # in MB + }, + "data_stats": { + "data_label": { + "entity_counts": { + "word_level": dict[string, int], + "true_char_level": dict[string, int], + "postprocess_char_level": dict[string, int] + }, + "entity_percentages": { + "word_level": dict[string, float], + "true_char_level": dict[string, float], + "postprocess_char_level": dict[string, float] + }, + "times": dict[string, float] + }, + "statistics": { + "vocab": list[char], + "vocab_count": dict[string, int], + "words": list[string], + "word_count": dict[string, int], + "times": dict[string, float] + } + } + +The format for a graph profile is below: + +.. code-block:: python + + "num_nodes": int, + "num_edges": int, + "categorical_attributes": list[string], + "continuous_attributes": list[string], + "avg_node_degree": float, + "global_max_component_size": int, + "continuous_distribution": { + "": { + "name": string, + "scale": float, + "properties": list[float, np.array] + }, + "": None, + }, + "categorical_distribution": { + "": None, + "": { + "bin_counts": list[int], + "bin_edges": list[float] + }, + }, + "times": dict[string, float] + +Supported Data Formats +~~~~~~~~~~~~~~~~~~~~~~ + +* Any delimited file (CSV, TSV, etc.) 
+* JSON object +* Avro file +* Parquet file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + + +Data Labels +~~~~~~~~~~~ + +*Data Labels* are determined per cell for structured data (column/row when the *profiler* is used) or at the character level for unstructured data. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Get Started +=========== + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributes, the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify it +explicitly; see the Data Readers section. + +Profile a File +~~~~~~~~~~~~~~ + +This example uses a CSV file, but CSV, JSON, Avro, Parquet or Text files should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Unstructured Profiler +~~~~~~~~~~~~~~~~~~~~~ + +In addition to the structured profiler, the Data Profiler provides unstructured +profiling for the TextData object or string. Unstructured profiling also works +with list(string), pd.Series(string) or pd.DataFrame(string) given profiler_type +option specified as `unstructured`. Below is an example of unstructured profile +with a text file. + +.. code-block:: python + + import dataprofiler as dp + import json + my_text = dp.Data('text_file.txt') + profile = dp.Profiler(my_text) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Another example of unstructured profile with pd.Series of string is given as below + +.. code-block:: python + + import dataprofiler as dp + import pandas as pd + import json + + text_data = pd.Series(['first string', 'second string']) + profile = dp.Profiler(text_data, profiler_type="unstructured") + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Graph Profiler +~~~~~~~~~~~~~~ + +DataProfiler also provides the ability to profile graph data from a csv file. Below is an example of the graph profiler with a graph data csv file: + +.. code-block:: python + + import dataprofiler as dp + import pprint + + my_graph = dp.Data('graph_file.csv') + profile = dp.Profiler(my_graph) + + # print the report using pretty print (json dump does not work on numpy array values inside dict) + report = profile.report() + printer = pprint.PrettyPrinter(sort_dicts=False, compact=True) + printer.pprint(report) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. code-block:: python + + import json + import os + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4)) + + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Getting Started: + + Intro + install.rst + data_readers.rst + profiler.rst + data_labeling.rst + graphs.rst + architecture.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: User Guide: + + examples.rst + API.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Community: + + roadmap.rst + Changelog + Feedback + GitHub + Contributing + +.. _Example CSV: https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv +.. 
_issue: https://github.com/capitalone/DataProfiler/issues/new/choose + + + diff --git a/_docs/docs/source/install.rst b/_docs/docs/source/install.rst new file mode 100644 index 000000000..bdf4c3bb4 --- /dev/null +++ b/_docs/docs/source/install.rst @@ -0,0 +1,145 @@ +.. _install: + +Install +******* + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install +tensorflow), you can install a slimmer package. The slimmer package disables +the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +Snappy Installation +=================== + +This is required to profile parquet/avro datasets + +MacOS (intel chip) with homebrew: + +.. code-block:: console + + brew install snappy && CPPFLAGS="-I/usr/local/include -L/usr/local/lib" pip install python-snappy + + +MacOS (apple chip) with homebrew: + +.. code-block:: console + + brew install snappy && CPPFLAGS="-I/opt/homebrew/include -L/opt/homebrew/lib" pip install python-snappy + + +Linux install: + +.. code-block:: console + + sudo apt-get -y install libsnappy-dev + + +Build From Scratch +================== + +NOTE: Installation for python3 + +virtualenv install: + +.. code-block:: console + + python3 -m pip install virtualenv + + +Setup virtual env: + +.. code-block:: console + + python3 -m virtualenv --python=python3 venv3 + source venv3/bin/activate + + +Install requirements: + +.. code-block:: console + + pip3 install -r requirements.txt + +Install labeler dependencies: + +.. code-block:: console + + pip3 install -r requirements-ml.txt + + +Install via the repo -- Build setup.py and install locally: + +.. code-block:: console + + python3 setup.py sdist bdist bdist_wheel + pip3 install dist/DataProfiler*-py3-none-any.whl + + +If you see: + +.. code-block:: console + + ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler') + +This means that you have multiple versions of the DataProfiler distribution +in the dist folder. +To resolve, either remove the older one or delete the folder and rerun the steps +above. + +Install via github: + +.. code-block:: console + + pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler + + + +Testing +======= + +For testing, install test requirements: + +.. code-block:: console + + pip3 install -r requirements-test.txt + + +To run all unit tests, use: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py" + + +To run file of unit tests, use form: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py + + +To run a file with Pytest use: + +.. code-block:: console + + DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v + + +To run individual of unit test, use form: + +.. 
code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler + + diff --git a/_docs/docs/source/labeler.nblink b/_docs/docs/source/labeler.nblink new file mode 100644 index 000000000..f862443fd --- /dev/null +++ b/_docs/docs/source/labeler.nblink @@ -0,0 +1,6 @@ +{ + "path": "../../../examples/labeler.ipynb", + "extra-media": [ + "../../../examples/DL-Flowchart.png" + ] +} \ No newline at end of file diff --git a/_docs/docs/source/merge_profile_list.nblink b/_docs/docs/source/merge_profile_list.nblink new file mode 100644 index 000000000..39102658b --- /dev/null +++ b/_docs/docs/source/merge_profile_list.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/merge_profile_list.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/modules.rst b/_docs/docs/source/modules.rst new file mode 100644 index 000000000..0593459df --- /dev/null +++ b/_docs/docs/source/modules.rst @@ -0,0 +1,7 @@ +dataprofiler +============ + +.. toctree:: + :maxdepth: 4 + + dataprofiler diff --git a/_docs/docs/source/overview.nblink b/_docs/docs/source/overview.nblink new file mode 100644 index 000000000..4c118878e --- /dev/null +++ b/_docs/docs/source/overview.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/intro_data_profiler.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/popmon_dp_loader_example.nblink b/_docs/docs/source/popmon_dp_loader_example.nblink new file mode 100644 index 000000000..1a288a318 --- /dev/null +++ b/_docs/docs/source/popmon_dp_loader_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/popmon_dp_loader_example.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/profiler.rst b/_docs/docs/source/profiler.rst new file mode 100644 index 000000000..56d16a274 --- /dev/null +++ b/_docs/docs/source/profiler.rst @@ -0,0 +1,965 @@ +.. _profiler: + +Profiler +******** + +Profile Your Data +================= + +Profiling your data is easy. Just use the data reader, send the data to the +profiler, and print out the report. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + + readable_report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(readable_report, indent=4)) + +If the data is structured, the profile will return global statistics as well as +column by column statistics. The vast amount of statistics are listed on the +intro page. + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributtes the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers. 
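If auto-detection does pick the wrong reader, another option is to construct the specific data reader class directly and pass it to the profiler. The snippet below is a minimal sketch assuming a hypothetical tab-delimited file named `your_file.tsv`; it uses the `CSVData` reader and the `options={"delimiter": ...}` parameter described in the Data Readers section.

.. code-block:: python

    from dataprofiler import Profiler
    from dataprofiler.data_readers.csv_data import CSVData

    # Explicitly read a tab-delimited file with the CSV data reader
    # (the file name and delimiter here are illustrative placeholders)
    data = CSVData("your_file.tsv", options={"delimiter": "\t"})

    # The explicitly constructed reader can be profiled like any other Data object
    profile = Profiler(data)

The same pattern applies to the other reader classes covered in the Data Readers section.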
+ +Profile a File +~~~~~~~~~~~~~~ + +This example uses a CSV file, but CSV, JSON, Avro or Parquet should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Profile Differences +~~~~~~~~~~~~~~~~~~~ + +Profile differences take two profiles and find the differences +between them. Create the difference report like this: + +.. code-block:: python + + from dataprofiler import Data, Profiler + + # Load a CSV file + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + diff_report = profile1.diff(profile2) + print(diff_report) + +The `.diff()` operation is available between two profiles, although there are different +outputs depending on the type of profile being differenced. For example, for numerical +column profiles (e.g., integers and floats), valuable calculations that +`.diff()` returns include `t-test`, `chi2-test`, and `psi` (Population Stability Index) +for understanding distributional changes. + +The difference report contains a dictionary that mirrors the profile report. +Each data type has its own difference: + +* **Int/Float** - One profile subtracts the value from the other. + +* **String** - The strings will be shown in a list: + + - [profile1 str, profile2 str] +* **List** - A list of 3 will be returned showing the unique values of + each profile and the shared values: + + - [profile 1 unique values, shared values, profile 2 unique values] +* **Dict** - Some dictionaries with varied keys will also return a list + of three in the format: + + - [profile 1 unique key-values, shared key differences, profile 2 unique key-values] + +Otherwise, when no differences occur: + +* **Any Type No Differences** - A string will report: "unchanged". + +Below is the structured difference report: + +.. 
code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'column_count': int, + 'row_count': int, + 'row_has_null_ratio': float, + 'row_is_null_ratio': float, + 'unique_row_ratio': float, + 'duplicate_row_count': int, + 'correlation_matrix': list[list[float]], + 'chi2_matrix': list[list[float]], + 'profile_schema': list[dict[str, int]] + }, + 'data_stats': [{ + 'column_name': str, + 'data_type': [str, str], + 'data_label': [list[str], list[str], list[str]], + 'categorical': [str, str], + 'order': [str, str], + 'statistics': { + 'min': float, + 'max': float, + 'sum': float, + 'mean': float, + 'median': float, + 'mode': [list[float], list[float], list[float]], + 'median_absolute_deviation': float, + 'variance': float, + 'stddev': float, + 't-test': { + 't-statistic': float, + 'conservative': {'deg_of_free': int, + 'p-value': float}, + 'welch': {'deg_of_free': float, + 'p-value': float}}, + 'psi': float, + "chi2-test": { + "chi2-statistic": float, + "deg_of_free": int, + "p-value": float + }, + 'unique_count': int, + 'unique_ratio': float, + 'categories': [list[str], list[str], list[str]], + 'gini_impurity': float, + 'unalikeability': float, + 'categorical_count': [dict[str, int], dict[str, int], dict[str, int]], + 'avg_predictions': [dict[str, float]], + 'label_representation': [dict[str, float]], + 'sample_size': int, + 'null_count': int, + 'null_types': [list[str], list[str], list[str]], + 'null_types_index': [dict[str, int], dict[str, int], dict[str, int]], + 'data_type_representation': [dict[str, float]] + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + +Below is the unstructured difference report: + +.. code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'empty_line_count': int, + 'memory_size': float + }, + 'data_stats': { + 'data_label': { + 'entity_counts': { + 'word_level': dict[str, int], + 'true_char_level': dict[str, int], + 'postprocess_char_level': dict[str, int] + }, + 'entity_percentages': { + 'word_level': dict[str, float], + 'true_char_level': dict[str, float], + 'postprocess_char_level': dict[str, float] + } + }, + 'statistics': { + 'vocab': [list[str], list[str], list[str]], + 'vocab_count': [dict[str, int], dict[str, int], dict[str, int]], + 'words': [list[str], list[str], list[str]], + 'word_count': [dict[str, int], dict[str, int], dict[str, int]] + } + } + } + + +Saving and Loading a Profile +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The profiles can easily be saved and loaded as shown below: + +**NOTE: Json saving and loading only supports Structured Profiles currently.** + +There are two save/load methods: + +* **Pickle save/load** + + * Save a profile as a `.pkl` file. + * Load a `.pkl` file as a profile object. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to pkl file + profile.save(filepath="my_profile.pkl") + + # load pkl file to structured profile + loaded_pkl_profile = dp.Profiler.load(filepath="my_profile.pkl") + + print(json.dumps(loaded_pkl_profile.report(report_options={"output_format": "compact"}), + indent=4)) + +* **Json save/load** + + * Save a profile as a human-readable `.json` file. + * Load a `.json` file as a profile object. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to json file + profile.save(filepath="my_profile.json", save_method="json") + + # load json file to structured profile + loaded_json_profile = Profiler.load(filepath="my_profile.json", load_method="json") + + print(json.dumps(loaded_json_profile.report(report_options={"output_format": "compact"}), + indent=4)) + + +Structured vs Unstructured Profiles +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using the profiler, the data profiler will automatically infer whether to +create the structured profile or the unstructured profile. However, you can be +explicit as shown below: + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Creating a structured profile + data1 = Data("normal_csv_file.csv") + structured_profile = Profiler(data1, profiler_type="structured") + + structured_report = structured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(structured_report, indent=4)) + + # Creating an unstructured profile + data2 = Data("normal_text_file.txt") + unstructured_profile = Profiler(data2, profiler_type="unstructured") + + unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(unstructured_report, indent=4)) + + +Setting the Sample Size +~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways to set the sample size in a profile: samples_per_update and +min_true_samples. samples_per_update takes an integer as the exact number of samples that +will be used, while min_true_samples sets the minimum number of samples that +are not null. For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, None, 2.0] + profile = Profiler(sample_array, samples_per_update=2) + +The first two samples (1.0 and None) are used for the statistical analysis. + +In contrast, if we also set min_true_samples to 2, then the Data Reader will +continue to read until the minimum number of true samples is found for the given column. +For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, None, 2.0] + profile = Profiler(sample_array, samples_per_update=2, min_true_samples=2) + +This will use all samples in the statistical analysis until the number of "true" +(non-null) values is reached. Both the min_true_samples and +samples_per_update conditions must be met. In this case, the profile will grab +the first two samples (1.0 and None) to satisfy samples_per_update, and then +it will grab the first two VALID samples (1.0 and 2.0) to satisfy +min_true_samples. + +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4)) + +Setting Profiler Seed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a seed for reproducibility. + +.. code-block:: python + + import dataprofiler as dp + + # Set seed to non-negative integer value or None + dp.set_seed(0) + + +Profile Statistic Descriptions +============================== + +Structured Profile +~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* column_count - the number of columns contained in the input dataset +* row_count - the number of rows contained in the input dataset +* row_has_null_ratio - the proportion of rows that contain at least one null value to the total number of rows +* row_is_null_ratio - the proportion of rows that are fully comprised of null values (null rows) to the total number of rows +* unique_row_ratio - the proportion of distinct rows in the input dataset to the total number of rows +* duplicate_row_count - the number of rows that occur more than once in the input dataset +* file_type - the format of the file containing the input dataset (ex: .csv) +* encoding - the encoding of the file containing the input dataset (ex: UTF-8) +* correlation_matrix - matrix of shape `column_count` x `column_count` containing the correlation coefficients between each column in the dataset +* chi2_matrix - matrix of shape `column_count` x `column_count` containing the chi-square statistics between each column in the dataset +* profile_schema - a description of the format of the input dataset labeling each column and its index in the dataset + * string - the label of the column in question and its index in the profile schema +* times - the duration of time it took to generate the global statistics for this dataset in milliseconds + +**data_stats**: + +* column_name - the label/title of this column in the input dataset +* data_type - the primitive python data type that is contained within this column +* data_label - the label/entity of the data in this column as determined by the Labeler component +* categorical - 'true' if this column contains categorical data +* order - the way in which the data in this column is ordered, if any, otherwise “random†+* samples - a small subset of data entries from this column +* statistics - statistical information on the column + * sample_size - number of input data samples used to generate this profile + * null_count - the number of null entries in the sample + * null_types - a list of the different null types present within this sample + * null_types_index - a dict containing each null type and a respective list of the indicies that it is present within this sample + * data_type_representation - the percentage of samples used identifying as each data_type + * min - minimum value in the sample + * max - maximum value in the sample + * mode - mode of the entries in the sample + * median - median of the entries in the sample + * median_absolute_deviation - the median absolute deviation of the entries in the sample + * sum - the total of all sampled values 
from the column + * mean - the average of all entries in the sample + * variance - the variance of all entries in the sample + * stddev - the standard deviation of all entries in the sample + * skewness - the statistical skewness of all entries in the sample + * kurtosis - the statistical kurtosis of all entries in the sample + * num_zeros - the number of entries in this sample that have the value 0 + * num_negatives - the number of entries in this sample that have a value less than 0 + * histogram - contains histogram relevant information + * bin_counts - the number of entries within each bin + * bin_edges - the thresholds of each bin + * quantiles - the value at each percentile in the order they are listed based on the entries in the sample + * vocab - a list of the characters used within the entries in this sample + * avg_predictions - average of the data label prediction confidences across all data points sampled + * categories - a list of each distinct category within the sample if `categorial` = 'true' + * unique_count - the number of distinct entries in the sample + * unique_ratio - the proportion of the number of distinct entries in the sample to the total number of entries in the sample + * categorical_count - number of entries sampled for each category if `categorical` = 'true' + * gini_impurity - measure of how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset + * unalikeability - a value denoting how frequently entries differ from one another within the sample + * precision - a dict of statistics with respect to the number of digits in a number for each sample + * times - the duration of time it took to generate this sample's statistics in milliseconds + * format - list of possible datetime formats +* null_replication_metrics - statistics of data partitioned based on whether column value is null (index 1 of lists referenced by dict keys) or not (index 0) + * class_prior - a list containing probability of a column value being null and not null + * class_sum - a list containing sum of all other rows based on whether column value is null or not + * class_mean - a list containing mean of all other rows based on whether column value is null or not + +Unstructured Profile +~~~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* empty_line_count - the number of empty lines in the input data +* file_type - the file type of the input data (ex: .txt) +* encoding - file encoding of the input data file (ex: UTF-8) +* memory_size - size of the input data in MB +* times - duration of time it took to generate this profile in milliseconds + +**data_stats**: + +* data_label - labels and statistics on the labels of the input data + * entity_counts - the number of times a specific label or entity appears inside the input data + * word_level - the number of words counted within each label or entity + * true_char_level - the number of characters counted within each label or entity as determined by the model + * postprocess_char_level - the number of characters counted within each label or entity as determined by the postprocessor + * entity_percentages - the percentages of each label or entity within the input data + * word_level - the percentage of words in the input data that are contained within each label or entity + * true_char_level - the percentage of characters in the input data that are contained within each label or 
entity as determined by the model + * postprocess_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the postprocessor + * times - the duration of time it took for the data labeler to predict on the data +* statistics - statistics of the input data + * vocab - a list of each character in the input data + * vocab_count - the number of occurrences of each distinct character in the input data + * words - a list of each word in the input data + * word_count - the number of occurrences of each distinct word in the input data + * times - the duration of time it took to generate the vocab and words statistics in milliseconds + +Graph Profile +~~~~~~~~~~~~~~~~~~ + +* num_nodes - number of nodes in the graph +* num_edges - number of edges in the graph +* categorical_attributes - list of categorical edge attributes +* continuous_attributes - list of continuous edge attributes +* avg_node_degree - average degree of nodes in the graph +* global_max_component_size: size of the global max component + +**continuous_distribution**: + +* : name of N-th edge attribute in list of attributes + * name - name of distribution for attribute + * scale - negative log likelihood used to scale and compare distributions + * properties - list of statistical properties describing the distribution + * [shape (optional), loc, scale, mean, variance, skew, kurtosis] + +**categorical_distribution**: + +* : name of N-th edge attribute in list of attributes + * bin_counts: counts in each bin of the distribution histogram + * bin_edges: edges of each bin of the distribution histogram + +* times - duration of time it took to generate this profile in milliseconds + +Profile Options +=============== + +The data profiler accepts several options to toggle on and off +features. The 8 columns (int options, float options, datetime options, +text options, order options, category options, data labeler options) can be +enabled or disabled. By default, all options are toggled on. Below is an example +of how to alter these options. Options shared by structured and unstructured options +must be specified as structured or unstructured when setting (ie. datalabeler options). + + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler, ProfilerOptions + + # Load and profile a CSV file + data = Data("your_file.csv") + profile_options = ProfilerOptions() + + #All of these are different examples of adjusting the profile options + + # Options can be toggled directly like this: + profile_options.structured_options.text.is_enabled = False + profile_options.structured_options.text.vocab.is_enabled = True + profile_options.structured_options.int.variance.is_enabled = True + profile_options.structured_options.data_labeler.data_labeler_dirpath = \ + "Wheres/My/Datalabeler" + profile_options.structured_options.data_labeler.is_enabled = False + + # A dictionary can be sent in to set the properties for all the options + profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False}) + + # Specific columns can be set/disabled/enabled in the same way + profile_options.structured_options.text.set({"max.is_enabled":True, + "variance.is_enabled": True}) + + # numeric stats can be turned off/on entirely + profile_options.set({"is_numeric_stats_enabled": False}) + profile_options.set({"int.is_numeric_stats_enabled": False}) + + profile = Profiler(data, options=profile_options) + + # Print the report using json to prettify. 
+ report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Below is an breakdown of all the options. + +* **ProfilerOptions** - The top-level options class that contains options for the Profiler class + + * **presets** - A pre-configured mapping of a string name to group of options: + + * **default is None** + + * **"complete"** + + .. code-block:: python + + options = ProfilerOptions(presets="complete") + + * **"data_types"** + + .. code-block:: python + + options = ProfilerOptions(presets="data_types") + + * **"numeric_stats_disabled"** + + .. code-block:: python + + options = ProfilerOptions(presets="numeric_stats_disabled") + + * **"lower_memory_sketching"** + + .. code-block:: python + + options = ProfilerOptions(presets="lower_memory_sketching") + + * **structured_options** - Options responsible for all structured data + + * **multiprocess** - Option to enable multiprocessing. If on, multiprocessing is toggled on if the dataset contains more than 750,000 rows or more than 20 columns. + Automatically selects the optimal number of pooling processes to utilize based on system constraints when toggled on. + + * is_enabled - (Boolean) Enables or disables multiprocessing + + * **sampling_ratio** - A percentage, as a decimal, ranging from greater than 0 to less than or equal to 1 indicating how much input data to sample. Default value set to 0.2. + + * **int** - Options for the integer columns + + * is_enabled - (Boolean) Enables or disables the integer operations + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. 
+ methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **float** - Options for the float columns + + * is_enabled - (Boolean) Enables or disables the float operations + * precision - Finds the precision (significant figures) within the column + + * is_enabled - (Boolean) Enables or disables precision + * sample_ratio - (Float) The ratio of 0 to 1 how much data (identified as floats) to utilize as samples in determining precision + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **text** - Options for the text columns + + * is_enabled - (Boolean) Enables or disables the text operations + * vocab - Finds all the unique characters used in a column + + * is_enabled - (Boolean) Enables or disables vocab + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **datetime** - Options for the datetime columns + + * is_enabled - (Boolean) Enables or disables the datetime operations + * **order** - Options for the order columns + + * is_enabled - (Boolean) Enables or disables the order operations + * **category** - Options for the category columns + + * is_enabled - (Boolean) Enables or disables the category operations + * top_k_categories - (int) Number of categories to be displayed when reporting + * max_sample_size_to_check_stop_condition - (int) The maximum sample size before categorical stop conditions are checked + * stop_condition_unique_value_ratio - (float) The highest ratio of unique values to dataset size that is to be considered a categorical type + * cms - (Boolean) Enables or Disables the use of count min sketch / heavy hitters for approximate frequency counts + * cms_confidence - (float) Defines the number of hashes used in CMS, default 0.95 + * cms_relative_error - (float) Defines the number of buckets used in CMS, default 0.01 + * cms_max_num_heavy_hitters - (int) The value used to define the threshold for minimum frequency required by a category to be counted + * **data_labeler** - Options for the data labeler columns + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + * **correlation** - Option set for correlation profiling + * is_enabled - (Boolean) Enables or disables performing correlation profiling + * columns - Columns considered to calculate correlation + * **row_statistics** - (Boolean) Option to enable/disable row statistics calculations + + * unique_count - (UniqueCountOptions) Option to enable/disable unique row count calculations + + * is_enabled - (Bool) Enables or disables options for unique row count + * hashing_method - (String) Property to specify row hashing method ("full" | "hll") + * hll - (HyperLogLogOptions) Options for alternative method of estimating unique row count (activated when `hll` is the selected hashing_method) + + * seed - (Int) Used to set HLL hashing function seed + * register_count - (Int) Number of registers is equal to 2^register_count + + * null_count - (Boolean) Option to enable/disable functionalities for row_has_null_ratio and row_is_null_ratio + * **chi2_homogeneity** - Options for the chi-squared test matrix + + * is_enabled - (Boolean) Enables or disables performing chi-squared tests for homogeneity between the categorical columns of the dataset. 
+ * **null_replication_metrics** - Options for calculating null replication metrics + + * is_enabled - (Boolean) Enables or disables calculation of null replication metrics + * **unstructured_options** - Options responsible for all unstructured data + + * **text** - Options for the text profile + + * is_case_sensitive - (Boolean) Specify whether the profile is case sensitive + * stop_words - (List of Strings) List of stop words to be removed when profiling + * top_k_chars - (Int) Number of top characters to be retrieved when profiling + * top_k_words - (Int) Number of top words to be retrieved when profiling + * vocab - Options for vocab count + + * is_enabled - (Boolean) Enables or disables the vocab stats + * words - Options for word count + + * is_enabled - (Boolean) Enables or disables the word stats + * **data_labeler** - Options for the data labeler + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + + + +Statistical Dependency on Order of Updates +========================================== + +Some profile features/statistics are dependent on the order in which the profiler +is updated with new data. + +Order Profile +~~~~~~~~~~~~~ + +The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order. + +For instance, a dataset to be predicted as ascending would require the following +batch data update to be ascending and its first value `>=` than that of the +previous batch of data. + +Ex. of ascending: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [3, 4, 5] + +Ex. of random: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch + + +Reporting Structure +=================== + +For every profile, we can provide a report and customize it with a couple optional parameters: + +* output_format (string) + + * This will allow the user to decide the output format for report. + + * Options are one of [pretty, compact, serializable, flat]: + + * Pretty: floats are rounded to four decimal places, and lists are shortened. + * Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc. + * Serializable: Output is json serializable and not prettified + * Flat: Nested output is returned as a flattened dictionary +* num_quantile_groups (int) + + * You can sample your data as you like! With a minimum of one and a maximum of 1000, you can decide the number of quantile groups! + +.. 
code-block:: python + + report = profile.report(report_options={"output_format": "pretty"}) + report = profile.report(report_options={"output_format": "compact"}) + report = profile.report(report_options={"output_format": "serializable"}) + report = profile.report(report_options={"output_format": "flat"}) diff --git a/_docs/docs/source/profiler_example.nblink b/_docs/docs/source/profiler_example.nblink new file mode 100644 index 000000000..142ebd97f --- /dev/null +++ b/_docs/docs/source/profiler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/structured_profilers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/regex_labeler_from_scratch.nblink b/_docs/docs/source/regex_labeler_from_scratch.nblink new file mode 100644 index 000000000..3d98c5f1e --- /dev/null +++ b/_docs/docs/source/regex_labeler_from_scratch.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/regex_labeler_from_scratch/DataLabeler_from_scratch.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/source/roadmap.rst b/_docs/docs/source/roadmap.rst new file mode 100644 index 000000000..93886690b --- /dev/null +++ b/_docs/docs/source/roadmap.rst @@ -0,0 +1,58 @@ +.. _roadmap: + +Roadmap +******* + +For more detailed tasks, check out the repo's GitHub issues page here: +`Github Issues `_. + + +Data Reader Updates +=================== +- Read data from an S3 bucket + - Within the current `dp.Data()` API paradigm, we want to enable passing an S3 bucket file path to read in data from AWS S3. +- Pass a list of data file paths to the data reader +- Pass a list of data frames to the data reader + +New Model +========= +- Transformer model for sensitive data detection + +Historical Profiles +=================== +- Some open questions about Historical Profiles remain; we need to step back and rethink the design to start: + - Meta profile on top? + - Stored windowed info inside? Etc. +- Branch with the current state of Historical Profiles +- Two example notebooks of the current state: + - Notebook example `one `_. + - Notebook example `two `_. + + +Conditional Report Metric +========================= +- Based on which other metrics are populated in the report, also populate "secondary" / "derivative" metrics of that number (or of that number in conjunction with another number) in the report. +- For example, if null_count is not None, then populate a null_percent key with the value of (null_count / sample_count). + +Space / Time Testing +==================== +- Automatic comparison testing for space and time analysis on PRs + - Standardize a report for space/time analysis for future comparisons (create baseline numbers) + - Include those in integration tests that will automatically run on code when it is changed in PRs +- This could be an optional test, run when there is concern that a change may degrade library performance + +Testing Suite Upgrades +====================== +- Add mocking to unit tests where mocking is not utilized +- Integration testing separated out from the unit testing suite. 
Determine how to only run remotely during PRs +- Backward compatibility testing along with informative warnings and errors when a user is utilizing incompatible versions of the library and saved profile object + +Historical Versions +=================== +- Legacy version upgrades to enable patches to prior versions of the Data Profiler + +Miscellaneous +============== +- Refact/or Pandas to Polars DataFrames +- Spearman correlation calculation +- Workflow Profiles diff --git a/_docs/docs/source/unstructured_profiler_example.nblink b/_docs/docs/source/unstructured_profiler_example.nblink new file mode 100644 index 000000000..5b6829754 --- /dev/null +++ b/_docs/docs/source/unstructured_profiler_example.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/unstructured_profilers.ipynb" +} \ No newline at end of file diff --git a/_docs/docs/update_documentation.py b/_docs/docs/update_documentation.py new file mode 100644 index 000000000..7be79612d --- /dev/null +++ b/_docs/docs/update_documentation.py @@ -0,0 +1,87 @@ +#!/usr/bin/python +"""Script which auto updates the github pages documentation.""" +import os +import subprocess +import sys + +sys.path.insert(0, os.path.abspath(f'../../')) + +from dataprofiler import __version__ as version # noqa F401 + +# Make the rst files from the current repo +subprocess.run( + [ + "sphinx-apidoc", + "--templatedir=./source/_templates/", + "-f", + "-e", + "-o", + "../docs/source", + f"../../dataprofiler", + f"../../dataprofiler/tests/", + ] +) + +update_index_rst = True + +if not version: + Exception("There must be a valid version argument.") + +# Check if the source index file has already been updated +source_index = open("source/index.rst", "r+") +source_index_lines = source_index.readlines() +source_index.close() +for sentence in source_index_lines: + if sentence.startswith("* `" + version): + update_index_rst = False + +# Update the index file if needed +version_reference = "" +if update_index_rst: + buffer = 0 + source_index = open("source/index.rst", "w") + for sentence in source_index_lines: + if sentence.startswith("Documentation for"): + doc_version = "Documentation for " + version + "\n" + source_index.write(doc_version) + elif sentence.startswith("Versions"): + source_index.write("Versions\n") + source_index.write("========\n") + version_tag = "* `" + version + "`_\n" + source_index.write(version_tag) + version_reference = ( + ".. 
_" + version + ": ../../" + version + "/html/index.html\n\n" + ) + buffer = 1 + else: + if buffer == 0: + source_index.write(sentence) + else: + buffer = buffer - 1 + source_index.write(version_reference) +source_index.close() + +# Make the html files + +build_directory = "BUILDDIR= LATEST" +subprocess.run(["make", "html", build_directory]) + +# update the index file to redirect to the most current version of documentation +index_file = open("../index.html", "w") +redirect_link = ( + '' +) +index_file.write(redirect_link) +index_file.close() + +# update the profiler_options.html file to redirect to detailed options docs +index_file = open("../profiler_options.html", "w") +redirect_link = ( + '' +) +index_file.write(redirect_link) +index_file.close() diff --git a/_docs/index.html b/_docs/index.html new file mode 100644 index 000000000..fb51eaca9 --- /dev/null +++ b/_docs/index.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/_docs/profiler_options.html b/_docs/profiler_options.html new file mode 100644 index 000000000..831f653ff --- /dev/null +++ b/_docs/profiler_options.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/_docs/setup.cfg b/_docs/setup.cfg new file mode 100644 index 000000000..c9c21e52f --- /dev/null +++ b/_docs/setup.cfg @@ -0,0 +1,7 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 + +[isort] +multi_line_output=3 +profile=black diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index 2e89d3e2b..ed54be241 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -1,5 +1,6 @@ """Package for dataprofiler.""" from . import settings +from ._version import get_versions from .data_readers.data import Data from .dp_logging import get_logger, set_verbosity from .labelers.data_labelers import ( @@ -18,23 +19,8 @@ from .profilers.profiler_options import ProfilerOptions from .reports import graphs from .validators.base_validators import Validator -from .version import __version__ -try: - import snappy -except ImportError: - import warnings - - warnings.warn( - "Snappy must be installed to use parquet/avro datasets." - "\n\n" - "For macOS use Homebrew:\n" - "\t`brew install snappy`" - "\n\n" - "For linux use apt-get:\n`" - "\tsudo apt-get -y install libsnappy-dev`\n", - ImportWarning, - ) +__version__ = get_versions()["version"] def set_seed(seed=None): diff --git a/dataprofiler/_version.py b/dataprofiler/_version.py new file mode 100644 index 000000000..669959883 --- /dev/null +++ b/dataprofiler/_version.py @@ -0,0 +1,524 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "dataprofiler-" + cfg.versionfile_source = "dataprofiler/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} # type: ignore +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print(f"unable to find command, tried {commands}") + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs) + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
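+ For example (an illustrative note, assuming the empty tag_prefix configured + in get_config above): a 'git describe' output such as "0.10.0-12-gabc1234-dirty" + would be parsed into pieces roughly of the form {"closest-tag": "0.10.0", + "distance": 12, "short": "abc1234", "dirty": True}. 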
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
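+ # Illustrative note: versionfile_source is "dataprofiler/_version.py", so + # the loop below calls os.path.dirname() once per path component, walking + # from .../dataprofiler/_version.py up to the repository root. 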
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index b168e9234..a4a44e03a 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -237,7 +237,8 @@ def _construct_model(self) -> None: model_loc = self._parameters["model_path"] self._model: tf.keras.Model = tf.keras.models.load_model(model_loc) - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + self._model = tf.keras.Model(self._model.inputs, self._model.outputs) + softmax_output_layer_name = self._model.output_names[0] softmax_layer_ind = cast( int, labeler_utils.get_tf_layer_index_from_name( @@ -252,21 +253,28 @@ def _construct_model(self) -> None: num_labels, activation="softmax", name="softmax_output" )(self._model.layers[softmax_layer_ind - 1].output) - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(new_softmax_layer) + # Add argmax layer to get labels directly as an output + argmax_layer = tf.keras.ops.argmax(new_softmax_layer, axis=2) argmax_outputs = [new_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + self._model = tf.keras.Model(self._model.inputs, self._model.outputs) # Compile the model w/ metrics - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" ) - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + metrics = { + softmax_output_layer_name: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -294,30 +302,33 @@ def _reconstruct_model(self) -> None: num_labels = self.num_labels default_ind = self.label_mapping[self._parameters["default_label"]] - # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') - for _ in range(2): - self._model.layers.pop() - # Add the final Softmax layer to the previous spot + # self._model.layers[-2] to skip: original softmax final_softmax_layer = tf.keras.layers.Dense( num_labels, activation="softmax", name="softmax_output" - )(self._model.layers[-4].output) + )(self._model.layers[-2].output) - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + # Add argmax layer to get labels directly as an output + argmax_layer = tf.keras.ops.argmax(final_softmax_layer, axis=2) argmax_outputs = [final_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] losses = 
{softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" ) - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + metrics = { + softmax_output_layer_name: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -370,7 +381,7 @@ def fit( f1_report: dict = {} self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] start_time = time.time() batch_id = 0 diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 3194a2616..2cbb7051a 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -74,6 +74,133 @@ def create_glove_char(n_dims: int, source_file: str = None) -> None: file.write(word + " " + " ".join(str(num) for num in embd) + "\n") +@tf.keras.utils.register_keras_serializable(package="CharacterLevelCnnModel") +class ThreshArgMaxLayer(tf.keras.layers.Layer): + """Keras layer applying a thresholded argmax.""" + + def __init__( + self, threshold_: float, num_labels_: int, default_ind: int = 1, *args, **kwargs + ) -> None: + """Apply a minimum threshold to the argmax value. + + When below this threshold the index will be the default. + + :param num_labels: number of entities + :type num_labels: int + :param threshold: default set to 0 so all confidences pass. + :type threshold: float + :param default_ind: default index + :type default_ind: int + :return: final argmax threshold layer for the model + :return : tensor containing argmax thresholded integers, labels out + :rtype: tf.Tensor + """ + super().__init__(*args, **kwargs) + self._threshold_ = threshold_ + self._num_labels_ = num_labels_ + self._default_ind = default_ind + thresh_init = tf.constant_initializer(threshold_) + self.thresh_vec = tf.Variable( + name="ThreshVec", + initial_value=thresh_init(shape=[num_labels_]), + trainable=False, + ) + + def get_config(self): + """Return a serializable config for saving the layer.""" + config = super().get_config().copy() + config.update( + { + "threshold_": self._threshold_, + "num_labels_": self._num_labels_, + "default_ind": self._default_ind, + } + ) + return config + + def call(self, argmax_layer: tf.Tensor, confidence_layer: tf.Tensor) -> tf.Tensor: + """Apply the threshold argmax to the input tensor.""" + threshold_at_argmax = tf.gather(self.thresh_vec, argmax_layer) + + confidence_max_layer = tf.keras.backend.max(confidence_layer, axis=2) + + # Check if the confidences meet the threshold minimum. 
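+ # Illustrative note: with the default threshold of 0.0 every confidence + # passes, so the mask below is all ones and the argmax labels pass through + # unchanged; any position whose confidence falls below a nonzero threshold + # is replaced with the default (background) index instead. 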
+ argmax_mask = tf.keras.backend.cast( + tf.keras.backend.greater_equal(confidence_max_layer, threshold_at_argmax), + dtype=argmax_layer.dtype, + ) + + # Create a vector the same size as the batch_size which + # represents the background label + bg_label_tf = tf.keras.backend.constant( + self._default_ind, dtype=argmax_layer.dtype + ) + + # Generate the final predicted output using the function: + final_predicted_layer = tf.add( + bg_label_tf, + tf.multiply(tf.subtract(argmax_layer, bg_label_tf), argmax_mask), + name="ThreshArgMax", + ) + # final_predicted_layer.set_shape(argmax_layer.shape) + return final_predicted_layer + + +@tf.keras.utils.register_keras_serializable(package="CharacterLevelCnnModel") +class EncodingLayer(tf.keras.layers.Layer): + """Encodes strings to integers.""" + + def __init__( + self, max_char_encoding_id: int, max_len: int, *args, **kwargs + ) -> None: + """ + Encode characters for the list of sentences. + + :param max_char_encoding_id: Maximum integer value for encoding the + input + :type max_char_encoding_id: int + :param max_len: Maximum char length in a sample + :type max_len: int + """ + super().__init__(*args, **kwargs) + self.max_char_encoding_id = max_char_encoding_id + self.max_len = max_len + + def get_config(self): + """Return a serializable config for saving the layer.""" + config = super().get_config().copy() + config.update( + { + "max_char_encoding_id": self.max_char_encoding_id, + "max_len": self.max_len, + } + ) + return config + + def call(self, input_str_tensor: tf.Tensor) -> tf.Tensor: + """ + Encode characters for the list of sentences. + + :param input_str_tensor: input list of sentences converted to tensor + :type input_str_tensor: tf.tensor + :return : tensor containing encoded list of input sentences + :rtype: tf.Tensor + """ + # convert characters to indices + input_str_flatten = tf.reshape(input_str_tensor, [-1]) + sentences_encode = tf.strings.unicode_decode( + input_str_flatten, input_encoding="UTF-8" + ) + sentences_encode = tf.add(tf.cast(1, tf.int32), sentences_encode) + sentences_encode = tf.math.minimum( + sentences_encode, self.max_char_encoding_id + 1 + ) + + # padding + sentences_encode_pad = sentences_encode.to_tensor(shape=[None, self.max_len]) + return sentences_encode_pad + + class CharacterLevelCnnModel(BaseTrainableModel, metaclass=AutoSubRegistrationMeta): """Class for training char data labeler.""" @@ -280,7 +407,7 @@ def save_to_disk(self, dirpath: str) -> None: labels_dirpath = os.path.join(dirpath, "label_mapping.json") with open(labels_dirpath, "w") as fp: json.dump(self.label_mapping, fp) - self._model.save(os.path.join(dirpath)) + self._model.save(os.path.join(dirpath, "model.keras")) @classmethod def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel: @@ -301,15 +428,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel: with open(labels_dirpath) as fp: label_mapping = json.load(fp) - # use f1 score metric - custom_objects = { - "F1Score": labeler_utils.F1Score( - num_classes=max(label_mapping.values()) + 1, average="micro" - ), - "CharacterLevelCnnModel": cls, - } - with tf.keras.utils.custom_object_scope(custom_objects): - tf_model = tf.keras.models.load_model(dirpath) + tf_model = tf.keras.models.load_model(os.path.join(dirpath, "model.keras")) loaded_model = cls(label_mapping, parameters) loaded_model._model = tf_model @@ -333,35 +452,6 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel: ] return loaded_model - @staticmethod - def _char_encoding_layer( - 
input_str_tensor: tf.Tensor, max_char_encoding_id: int, max_len: int - ) -> tf.Tensor: - """ - Encode characters for the list of sentences. - - :param input_str_tensor: input list of sentences converted to tensor - :type input_str_tensor: tf.tensor - :param max_char_encoding_id: Maximum integer value for encoding the - input - :type max_char_encoding_id: int - :param max_len: Maximum char length in a sample - :type max_len: int - :return : tensor containing encoded list of input sentences - :rtype: tf.Tensor - """ - # convert characters to indices - input_str_flatten = tf.reshape(input_str_tensor, [-1]) - sentences_encode = tf.strings.unicode_decode( - input_str_flatten, input_encoding="UTF-8" - ) - sentences_encode = tf.add(tf.cast(1, tf.int32), sentences_encode) - sentences_encode = tf.math.minimum(sentences_encode, max_char_encoding_id + 1) - - # padding - sentences_encode_pad = sentences_encode.to_tensor(shape=[None, max_len]) - return sentences_encode_pad - @staticmethod def _argmax_threshold_layer( num_labels: int, threshold: float = 0.0, default_ind: int = 1 @@ -383,47 +473,7 @@ def _argmax_threshold_layer( """ # Initialize the thresholds vector variable and create the threshold # matrix. - class ThreshArgMaxLayer(tf.keras.layers.Layer): - def __init__(self, threshold_: float, num_labels_: int) -> None: - super().__init__() - thresh_init = tf.constant_initializer(threshold_) - self.thresh_vec = tf.Variable( - name="ThreshVec", - initial_value=thresh_init(shape=[num_labels_]), - trainable=False, - ) - - def call( - self, argmax_layer: tf.Tensor, confidence_layer: tf.Tensor - ) -> tf.Tensor: - threshold_at_argmax = tf.gather(self.thresh_vec, argmax_layer) - - confidence_max_layer = tf.keras.backend.max(confidence_layer, axis=2) - - # Check if the confidences meet the threshold minimum. 
- argmax_mask = tf.keras.backend.cast( - tf.keras.backend.greater_equal( - confidence_max_layer, threshold_at_argmax - ), - dtype=argmax_layer.dtype, - ) - - # Create a vector the same size as the batch_size which - # represents the background label - bg_label_tf = tf.keras.backend.constant( - default_ind, dtype=argmax_layer.dtype - ) - - # Generate the final predicted output using the function: - final_predicted_layer = tf.add( - bg_label_tf, - tf.multiply(tf.subtract(argmax_layer, bg_label_tf), argmax_mask), - name="ThreshArgMax", - ) - - return final_predicted_layer - - return ThreshArgMaxLayer(threshold, num_labels) + return ThreshArgMaxLayer(threshold, num_labels, default_ind) def _construct_model(self) -> None: """ @@ -449,17 +499,13 @@ def _construct_model(self) -> None: max_length = self._parameters["max_length"] max_char_encoding_id = self._parameters["max_char_encoding_id"] - # Encoding layer - def encoding_function(input_str: tf.Tensor) -> tf.Tensor: - char_in_vector = CharacterLevelCnnModel._char_encoding_layer( - input_str, max_char_encoding_id, max_length - ) - return char_in_vector - self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string)) self._model.add( - tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length])) + EncodingLayer( + max_char_encoding_id=max_char_encoding_id, + max_len=max_length, + ), ) # Create a pre-trained weight matrix @@ -474,7 +520,6 @@ def encoding_function(input_str: tf.Tensor) -> tf.Tensor: ) embedding_dict = build_embd_dictionary(embed_file) - input_shape = tuple([max_length]) # Fill in the weight matrix: let pad and space be 0s for ascii_num in range(max_char_encoding_id): if chr(ascii_num) in embedding_dict: @@ -485,7 +530,6 @@ def encoding_function(input_str: tf.Tensor) -> tf.Tensor: max_char_encoding_id + 2, self._parameters["dim_embed"], weights=[embedding_matrix], - input_length=input_shape[0], trainable=True, ) ) @@ -502,8 +546,7 @@ def encoding_function(input_str: tf.Tensor) -> tf.Tensor: ) if self._parameters["dropout"]: self._model.add(tf.keras.layers.Dropout(self._parameters["dropout"])) - # Add batch normalization, set fused = True for compactness - self._model.add(tf.keras.layers.BatchNormalization(fused=False, scale=True)) + self._model.add(tf.keras.layers.BatchNormalization(scale=True)) # Add the fully connected layers for size in self._parameters["size_fc"]: @@ -514,29 +557,35 @@ def encoding_function(input_str: tf.Tensor) -> tf.Tensor: # Add the final Softmax layer self._model.add(tf.keras.layers.Dense(num_labels, activation="softmax")) - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(self._model.output) + # Add argmax layer to get labels directly as an output + argmax_layer = tf.keras.ops.argmax(self._model.outputs[0], axis=2) # Create confidence layers - final_predicted_layer = CharacterLevelCnnModel._argmax_threshold_layer( - num_labels, threshold=0.0, default_ind=default_ind + final_predicted_layer = ThreshArgMaxLayer( + threshold_=0.0, num_labels_=num_labels, default_ind=default_ind ) argmax_outputs = self._model.outputs + [ argmax_layer, - final_predicted_layer(argmax_layer, self._model.output), + final_predicted_layer(argmax_layer, self._model.outputs[0]), ] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score 
metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" ) - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + metrics = { + softmax_output_layer_name: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -564,22 +613,18 @@ def _reconstruct_model(self) -> None: num_labels = self.num_labels default_ind = self.label_mapping[self._parameters["default_label"]] - # Remove the 3 output layers (dense_2', 'tf_op_layer_ArgMax', - # 'thresh_arg_max_layer') - for _ in range(3): - self._model.layers.pop() - # Add the final Softmax layer to the previous spot + # self._model.layers[-3] to skip: thresh and original softmax final_softmax_layer = tf.keras.layers.Dense( num_labels, activation="softmax", name="dense_2" - )(self._model.layers[-4].output) + )(self._model.layers[-3].output) - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + # Add argmax layer to get labels directly as an output + argmax_layer = tf.keras.ops.argmax(final_softmax_layer, axis=2) # Create confidence layers - final_predicted_layer = CharacterLevelCnnModel._argmax_threshold_layer( - num_labels, threshold=0.0, default_ind=default_ind + final_predicted_layer = ThreshArgMaxLayer( + threshold_=0.0, num_labels_=num_labels, default_ind=default_ind ) argmax_outputs = [final_softmax_layer] + [ @@ -589,14 +634,20 @@ def _reconstruct_model(self) -> None: self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" ) - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + metrics = { + softmax_output_layer_name: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } self._model.compile(loss=losses, optimizer="adam", metrics=metrics) self._epoch_id = 0 @@ -648,7 +699,7 @@ def fit( f1_report: dict = {} self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] + softmax_output_layer_name = self._model.output_names[0] start_time = time.time() batch_id = 0 @@ -729,7 +780,9 @@ def _validate_training( for x_val, y_val in val_data: y_val_pred.append( self._model.predict( - x_val, batch_size=batch_size_test, verbose=verbose_keras + tf.convert_to_tensor(x_val), + batch_size=batch_size_test, + verbose=verbose_keras, )[1] ) y_val_test.append(np.argmax(y_val, axis=-1)) diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index 7172e7472..a6d9932b7 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -141,11 +141,11 @@ def load_from_library(cls, name: str, trainable: bool = False) -> BaseDataLabele :type trainable: bool :return: DataLabeler class """ + for labeler_name, labeler_class_obj in cls.labeler_classes.items(): + if name == labeler_name: + name = labeler_class_obj._default_model_loc if trainable: return TrainableDataLabeler.load_from_library(name) - for _, labeler_class_obj in cls.labeler_classes.items(): - if name in labeler_class_obj._default_model_loc: - return labeler_class_obj() return BaseDataLabeler.load_from_library(name) @classmethod diff --git 
a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index b6070ff72..3a24886f3 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -358,7 +358,7 @@ def __init__( def _zero_wt_init(name: str) -> tf.Variable: return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + name=name, shape=self.init_shape, initializer="zeros", dtype=self.dtype ) self.true_positives = _zero_wt_init("true_positives") @@ -435,11 +435,6 @@ def get_config(self) -> dict: base_config = super().get_config() return {**base_config, **config} - def reset_state(self) -> None: - """Reset state.""" - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - @protected_register_keras_serializable() class F1Score(FBetaScore): diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index fbfde0c49..c6d70f740 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -272,7 +272,7 @@ def test_fit_and_predict(self, *mocks): ) # predict after fitting on just the text - model.predict(data_gen[0][0]) + model.predict([data_gen[0][0]]) @mock.patch("os.makedirs", return_value=None) def test_validation_evaluate_and_classification_report(self, *mocks): diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index ad549cc53..e120a9754 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -9,7 +9,10 @@ import pkg_resources import tensorflow as tf -from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel +from dataprofiler.labelers.character_level_cnn_model import ( + CharacterLevelCnnModel, + EncodingLayer, +) _file_dir = os.path.dirname(os.path.abspath(__file__)) _resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") @@ -272,7 +275,7 @@ def test_fit_and_predict_with_new_labels(self): ) # predict after fitting on just the text - cnn_model.predict(data_gen[0][0]) + cnn_model.predict([data_gen[0][0]]) def test_fit_and_predict_with_new_labels_set_via_method(self): # Initialize model @@ -301,7 +304,7 @@ def test_fit_and_predict_with_new_labels_set_via_method(self): history, f1, f1_report = cnn_model.fit(data_gen, cv_gen) # test predict on just the text - cnn_model.predict(data_gen[0][0]) + cnn_model.predict([data_gen[0][0]]) def test_validation(self): @@ -368,9 +371,8 @@ def test_input_encoding(self): max_char_encoding_id = 127 max_len = 10 - encode_output = cnn_model._char_encoding_layer( - input_str_tensor, max_char_encoding_id, max_len - ).numpy()[0] + encode_layer = EncodingLayer(max_char_encoding_id, max_len) + encode_output = encode_layer.call(input_str_tensor).numpy()[0] expected_output = [117, 102, 116, 117, 0, 0, 0, 0, 0, 0] self.assertCountEqual(encode_output, expected_output) @@ -464,7 +466,6 @@ def test_model_construct(self): "dense_1", "dropout_5", "dense_2", - "tf_op_layer_ArgMax", "thresh_arg_max_layer", ] model_layers = [layer.name for layer in cnn_model._model.layers] diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index f59a43e3f..c14fca54f 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ 
b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,6 +1,6 @@ import logging +import tempfile import unittest -from unittest import mock import numpy as np import pandas as pd @@ -235,9 +235,7 @@ def test_verbose(self): self.assertIn("f1-score ", log_output) self.assertIn("F1 Score: ", log_output) - @mock.patch("dataprofiler.labelers.labeler_utils.classification_report") - @mock.patch("pandas.DataFrame") - def test_save_conf_mat(self, mock_dataframe, mock_report): + def test_save_conf_mat(self): # ideally mock out the actual contents written to file, but # would be difficult to get this completely worked out. @@ -248,28 +246,25 @@ def test_save_conf_mat(self, mock_dataframe, mock_report): [0, 1, 2], ] ) - expected_row_col_names = dict( - columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"], - index=["true:PAD", "true:UNKNOWN", "true:OTHER"], - ) - mock_instance_df = mock.Mock(spec=pd.DataFrame)() - mock_dataframe.return_value = mock_instance_df - - # still omit bc confusion mat should include all despite omit - f1, f1_report = labeler_utils.evaluate_accuracy( - self.y_pred, - self.y_true, - self.num_labels, - self.reverse_label_mapping, - omitted_labels=["PAD"], - verbose=False, - confusion_matrix_file="test.csv", - ) + expected_columns = ["pred:PAD", "pred:UNKNOWN", "pred:OTHER"] + expected_index = ["true:PAD", "true:UNKNOWN", "true:OTHER"] - self.assertTrue((mock_dataframe.call_args[0][0] == expected_conf_mat).all()) - self.assertDictEqual(expected_row_col_names, mock_dataframe.call_args[1]) + with tempfile.NamedTemporaryFile() as tmpFile: + # still omit bc confusion mat should include all despite omit + f1, f1_report = labeler_utils.evaluate_accuracy( + self.y_pred, + self.y_true, + self.num_labels, + self.reverse_label_mapping, + omitted_labels=["PAD"], + verbose=False, + confusion_matrix_file=tmpFile.name, + ) - mock_instance_df.to_csv.assert_called() + df1 = pd.read_csv(tmpFile.name, index_col=0) + self.assertListEqual(list(df1.columns), expected_columns) + self.assertListEqual(list(df1.index), expected_index) + np.testing.assert_array_equal(df1.values, expected_conf_mat) class TestTFFunctions(unittest.TestCase): diff --git a/dataprofiler/tests/profilers/test_histogram_utils.py b/dataprofiler/tests/profilers/test_histogram_utils.py index 3be8cdcae..10c88b344 100644 --- a/dataprofiler/tests/profilers/test_histogram_utils.py +++ b/dataprofiler/tests/profilers/test_histogram_utils.py @@ -32,7 +32,7 @@ def mock_sqrt_return_nan(profile): return float("nan") -class TestColumn(NumericStatsMixin): +class MockColumn(NumericStatsMixin): def __init__(self): NumericStatsMixin.__init__(self) self.times = defaultdict(float) @@ -75,7 +75,7 @@ def test_ptp(self): def test_calc_doane_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin.stddev", new_callable=mock_stddev @@ -177,7 +177,7 @@ def test_calc_doane_bin_width_from_profile(self): def test_calc_rice_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -230,7 +230,7 @@ def test_calc_rice_bin_width_from_profile(self): def test_calc_sturges_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -283,7 +283,7 @@ def 
test_calc_sturges_bin_width_from_profile(self): def test_calc_sqrt_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: min, max, and match_count are set expected_dataset_size = profile.match_count @@ -336,7 +336,7 @@ def test_calc_sqrt_bin_width_from_profile(self): def test_calc_fd_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin._get_percentile", @@ -359,7 +359,7 @@ def test_calc_fd_bin_width_from_profile(self): def test_calc_auto_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.histogram_utils._calc_fd_bin_width_from_profile" @@ -396,7 +396,7 @@ def test_calc_auto_bin_width_from_profile(self): def test_calc_scott_bin_width_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() with mock.patch( "dataprofiler.profilers.NumericStatsMixin.stddev", new_callable=mock_stddev @@ -418,7 +418,7 @@ def test_calc_scott_bin_width_from_profile(self): def test_calculate_bins_from_profile(self): # Initial setup of profile - profile = TestColumn() + profile = MockColumn() # Case 1: bin method not in set of valid bin methods with self.assertRaises(ValueError): @@ -457,7 +457,7 @@ def test_calculate_bins_from_profile(self): dataprofiler.profilers.histogram_utils._hist_bin_width_selectors_for_profile, {"sqrt": mock_sqrt_return_none}, ): - profile = TestColumn() + profile = MockColumn() actual = histogram_utils._calculate_bins_from_profile(profile, "sqrt") self.assertEqual(1, actual) @@ -466,6 +466,6 @@ def test_calculate_bins_from_profile(self): dataprofiler.profilers.histogram_utils._hist_bin_width_selectors_for_profile, {"sqrt": mock_sqrt_return_nan}, ): - profile = TestColumn() + profile = MockColumn() actual = histogram_utils._calculate_bins_from_profile(profile, "sqrt") self.assertEqual(1, actual) diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index e112781ab..cac04bfc9 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -18,7 +18,7 @@ test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -class TestColumn(NumericStatsMixin): +class MockColumn(NumericStatsMixin): def __init__(self): NumericStatsMixin.__init__(self) self.match_count = 0 @@ -31,7 +31,7 @@ def _filter_properties_w_options(self, calculations, options): pass -class TestColumnWProps(TestColumn): +class MockColumnWProps(MockColumn): # overrides the property func median = None mode = None @@ -117,9 +117,9 @@ def test_check_int(self): def test_hist_loss_on_merge(self): # Initial setup of profiles - profile3 = TestColumn() - profile1 = TestColumn() - profile2 = TestColumn() + profile3 = MockColumn() + profile1 = MockColumn() + profile2 = MockColumn() mock_histogram1 = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([2, 4, 6, 8, 10]), @@ -161,7 +161,7 @@ def test_update_variance(self): Checks update variance :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # test update variance data1 = [-3.0, 2.0, 11.0] @@ -209,7 +209,7 @@ def test_update_variance_with_varying_data_length(self): data1 = [] mean1, var1, count1 = 0, np.nan, 0 - num_profiler = TestColumn() + num_profiler = 
MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean1, var1, count1 ) @@ -221,7 +221,7 @@ def test_update_variance_with_varying_data_length(self): data2 = [5.0] mean2, var2, count2 = 5.0, 0, 1 - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean2, var2, count2 ) @@ -239,7 +239,7 @@ def test_update_variance_with_varying_data_length(self): + (-11.0 - mean3) ** 2 ) / 3 - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._biased_variance = num_profiler._update_variance( mean3, var3 * 3 / 4, count3 ) @@ -252,7 +252,7 @@ def test_update_variance_with_empty_data(self): Checks update variance :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() data1 = [-3.0, 2.0, 11.0] mean1 = (-3.0 + 2.0 + 11.0) / 3 @@ -284,7 +284,7 @@ def test_timeit_merge(self): Checks profiles have been merged and timed :return: """ - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() mock_histogram = { "bin_counts": np.array([1, 1, 1, 1]), "bin_edges": np.array([2.0, 5.25, 8.5, 11.75, 15.0]), @@ -331,7 +331,7 @@ def test_timeit(self): Checks stat properties have been timed :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make min call prev_dependent_properties = { @@ -402,8 +402,8 @@ def test_from_dict_helper(self): fake_profile_name = "Fake profile name" # Build expected CategoricalColumn - actual_profile = TestColumn() - expected_profile = TestColumn() + actual_profile = MockColumn() + expected_profile = MockColumn() mock_saved_profile = dict( { "quantiles": None, @@ -429,7 +429,7 @@ def test_from_dict_helper(self): test_utils.assert_profiles_equal(expected_profile, actual_profile) def test_histogram_bin_error(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data for calculating bin error num_profiler._stored_histogram = { @@ -475,7 +475,7 @@ def test_histogram_bin_error(self): assert sum_error == np.inf def test_get_best_histogram_profile(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._histogram_for_profile = mock.MagicMock( side_effect=[("hist_1", 3), ("hist_2", 2), ("hist_3", 1)] @@ -509,7 +509,7 @@ def test_get_best_histogram_profile(self): assert best_histogram == "hist_3" def test_get_best_histogram_profile_infinite_loss(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler._histogram_for_profile = mock.MagicMock(return_value=("hist_1", 3)) @@ -529,7 +529,7 @@ def test_get_best_histogram_profile_infinite_loss(self): assert best_histogram == "hist_1" def test_get_percentile_median(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data for calculating bin error num_profiler._stored_histogram = { "histogram": { @@ -541,7 +541,7 @@ def test_get_percentile_median(self): self.assertListEqual([10, 10], median) def test_num_zeros(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make num_zeros call prev_dependent_properties = {"mean": 0} @@ -568,7 +568,7 @@ def test_num_zeros(self): self.assertEqual(subset_properties["num_zeros"], 4) def test_num_negatives(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make num_negatives call prev_dependent_properties = {"mean": 0} @@ -595,7 +595,7 @@ def test_num_negatives(self): self.assertEqual(subset_properties["num_negatives"], 4) def 
test_fold_histogram(self): - num_profiler = TestColumn() + num_profiler = MockColumn() # the break point is at the mid point of a bin bin_counts = np.array([1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6]) @@ -670,7 +670,7 @@ def test_timeit_num_zeros_and_negatives(self): Checks num_zeros and num_negatives have been timed :return: """ - num_profiler = TestColumn() + num_profiler = MockColumn() # Dummy data to make min call prev_dependent_properties = {"mean": 0} @@ -702,14 +702,14 @@ def test_merge_num_zeros_and_negatives(self): Checks num_zeros and num_negatives can be merged :return: """ - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() other1.num_zeros, other1.num_negatives = 3, 1 other2.num_zeros, other2.num_negatives = 7, 1 num_profiler._add_helper(other1, other2) self.assertEqual(num_profiler.num_zeros, 10) self.assertEqual(num_profiler.num_negatives, 2) - num_profiler, other1, other2 = TestColumn(), TestColumn(), TestColumn() + num_profiler, other1, other2 = MockColumn(), MockColumn(), MockColumn() other1.num_zeros, other1.num_negatives = 0, 0 other2.num_zeros, other2.num_negatives = 0, 0 num_profiler._add_helper(other1, other2) @@ -717,7 +717,7 @@ def test_merge_num_zeros_and_negatives(self): self.assertEqual(num_profiler.num_negatives, 0) def test_profile(self): - num_profiler = TestColumn() + num_profiler = MockColumn() mock_profile = dict( min=1.0, @@ -815,7 +815,7 @@ def test_report(self): self.assertIn(disabled_key, report_keys) def test_report_no_numerical_options(self): - num_profiler = TestColumn() + num_profiler = MockColumn() num_profiler.match_count = 0 num_profiler.times = defaultdict(float) @@ -833,7 +833,7 @@ def test_diff(self): Checks _diff_helper() works appropriately. 
""" - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -881,7 +881,7 @@ def test_diff(self): self.assertDictEqual(expected_diff, difference) # Invalid statistics - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = np.nan # NaN variance @@ -931,7 +931,7 @@ def test_diff(self): self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all()) # Insufficient match count - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -980,7 +980,7 @@ def test_diff(self): self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all()) # Constant values - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 0 # constant value has 0 variance @@ -1028,7 +1028,7 @@ def test_diff(self): self.assertDictEqual(expected_diff, difference) # Small p-value - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.min = 3 other1.max = 4 other1._biased_variance = 1 @@ -1075,11 +1075,11 @@ def test_diff(self): other1.diff("Inproper input") self.assertEqual( str(exc.exception), - "Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'", + "Unsupported operand type(s) for diff: 'MockColumnWProps' and" " 'str'", ) # PSI same distribution test - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 55 other1._stored_histogram = { "total_loss": 0, @@ -1112,7 +1112,7 @@ def test_diff(self): self.assertEqual(expected_psi_value, psi_value) # PSI min_min_edge == max_max_edge - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 10 other1._stored_histogram = { "total_loss": 0, @@ -1139,7 +1139,7 @@ def test_diff(self): self.assertEqual(expected_psi_value, psi_value) # PSI regen other / not self - other1, other2 = TestColumnWProps(), TestColumnWProps() + other1, other2 = MockColumnWProps(), MockColumnWProps() other1.match_count = 55 other1._stored_histogram = { "total_loss": 0, diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py index ef7664cea..5dd12e69d 100644 --- a/dataprofiler/tests/test_data_profiler.py +++ b/dataprofiler/tests/test_data_profiler.py @@ -29,8 +29,6 @@ def setUpClass(cls): def test_set_seed(self): import dataprofiler as dp - self.assertEqual(dp.settings._seed, None) - dp.set_seed(5) self.assertEqual(dp.settings._seed, 5) @@ -56,46 +54,6 @@ def test_data_profiling(self): self.assertIsNotNone(profile.profile) self.assertIsNotNone(profile.report()) - def test_no_snappy(self): - import importlib - import sys - import types - - orig_import = __import__ - # necessary for any wrapper around the library to test if snappy caught - # as an issue - - def reload_data_profiler(): - """Recursively reload modules.""" - sys_modules = sys.modules.copy() - for module_name, module in sys_modules.items(): - # Only reload top level of the dataprofiler - if "dataprofiler" in module_name and len(module_name.split(".")) < 3: - if isinstance(module, 
types.ModuleType): - importlib.reload(module) - - def import_mock(name, *args, **kwargs): - if name == "snappy": - raise ImportError("test") - return orig_import(name, *args, **kwargs) - - with mock.patch("builtins.__import__", side_effect=import_mock): - with self.assertWarns(ImportWarning) as w: - import dataprofiler - - reload_data_profiler() - - self.assertEqual( - str(w.warning), - "Snappy must be installed to use parquet/avro datasets." - "\n\n" - "For macOS use Homebrew:\n" - "\t`brew install snappy`" - "\n\n" - "For linux use apt-get:\n`" - "\tsudo apt-get -y install libsnappy-dev`\n", - ) - def test_no_tensorflow(self): import sys diff --git a/dataprofiler/tests/test_dp_logging.py b/dataprofiler/tests/test_dp_logging.py index 7f78903ee..99496e314 100644 --- a/dataprofiler/tests/test_dp_logging.py +++ b/dataprofiler/tests/test_dp_logging.py @@ -22,12 +22,6 @@ def tearDownClass(cls): root_logger.removeHandler(dp_logging.get_logger()) dp_logging._dp_logger = None - def test_default_verbosity(self, mock_stdout): - # Ensure that default effective level is INFO - self.assertEqual( - logging.INFO, logging.getLogger("DataProfiler").getEffectiveLevel() - ) - def test_set_verbosity(self, mock_stdout): from dataprofiler import dp_logging diff --git a/dataprofiler/version.py b/dataprofiler/version.py deleted file mode 100644 index 1136efae1..000000000 --- a/dataprofiler/version.py +++ /dev/null @@ -1,13 +0,0 @@ -"""File contains the version number for the package.""" - -MAJOR = 0 -MINOR = 11 -MICRO = 0 -POST = None # otherwise None - -VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO) - -_post_str = "" -if POST: - _post_str = f".post{POST}" -__version__ = VERSION + _post_str diff --git a/requirements-dev.txt b/requirements-dev.txt index cff8f51a0..8c7c78684 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,9 @@ -check-manifest>=0.48 +check-manifest>=0.50 black>=24.3.0 isort==5.12.0 pre-commit==2.19.0 tox==3.25.1 +tox-conda==0.10.2 types-setuptools==67.7.0.1 types-python-dateutil==2.8.19.12 types-requests==2.30.0.0 diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 000000000..36517b45f --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,7 @@ +Sphinx>=5.0.0 +sphinx-rtd-theme +nbsphinx +furo +nbsphinx-link +pre-commit +tornado diff --git a/requirements-ml.txt b/requirements-ml.txt index ff525fec1..31f9ca633 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -1,7 +1,7 @@ scikit-learn>=0.23.2 -keras>=2.4.3,<3.0.0 +keras<=3.4.0 rapidfuzz>=2.6.1 -tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin' -tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64' -tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64' +tensorflow>=2.16.0; sys.platform != 'darwin' +tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64' +tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64' tqdm>=4.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 6c981cf9c..725b23849 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ coverage>=5.0.1 -dask>=2.29.0,<2024.2.0 +dask[dask-expr,dataframe]>=2024.4.1 fsspec>=0.3.3 pytest>=6.0.1 pytest-cov>=2.8.1 diff --git a/requirements.txt b/requirements.txt index a45dc34ae..e32f32851 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,22 @@ h5py>=2.10.0 wheel>=0.33.1 -numpy>=1.22.0 +numpy<2.0.0 pandas>=1.1.2 python-dateutil>=2.7.5 pytz>=2020.1 pyarrow>=1.0.1 
chardet>=3.0.4 -fastavro>=1.0.0.post1 -python-snappy>=0.5.4 +fastavro>=1.1.0 +python-snappy>=0.7.1 charset-normalizer>=1.3.6 psutil>=4.0.0 scipy>=1.10.0 -requests>=2.28.1 +requests>=2.32.4 networkx>=2.5.1 typing-extensions>=3.10.0.2 HLL>=2.0.3 datasketches>=4.1.0 packaging>=23.0 -boto3>=1.28.61 +boto3>=1.37.15 +urllib3>=2.5.0 +versioneer diff --git a/resources/labelers/structured_model/keras_metadata.pb b/resources/labelers/structured_model/keras_metadata.pb deleted file mode 100644 index dcc84a213..000000000 --- a/resources/labelers/structured_model/keras_metadata.pb +++ /dev/null @@ -1,29 +0,0 @@
[29 deleted lines of serialized binary Keras metadata omitted. The removed legacy-SavedModel `keras_metadata.pb` described the "functional_1" character-level CNN labeler: string input, Lambda character-encoding layer (max length 3400), Embedding(input_dim=129, output_dim=64), four Conv1D(filters=48, kernel_size=13, ReLU) blocks each followed by Dropout(0.073) and BatchNormalization, two Dense(96, ReLU) layers with Dropout, and a Dense(24, softmax) head feeding ArgMax and ThreshArgMaxLayer outputs, plus per-layer specs and the training config (Adam at lr=0.001, categorical_crossentropy, accuracy and F1Score metrics, Keras 2.6.0).]
\ No newline at end of file diff --git a/resources/labelers/structured_model/variables/variables.data-00000-of-00001 b/resources/labelers/structured_model/model.keras similarity index 88% rename from resources/labelers/structured_model/variables/variables.data-00000-of-00001 rename to resources/labelers/structured_model/model.keras index 95732bf16..795d637da 100644 Binary files a/resources/labelers/structured_model/variables/variables.data-00000-of-00001 and b/resources/labelers/structured_model/model.keras differ diff --git a/resources/labelers/structured_model/saved_model.pb b/resources/labelers/structured_model/saved_model.pb deleted file mode 100644 index 76274cae0..000000000 Binary files a/resources/labelers/structured_model/saved_model.pb and /dev/null differ diff --git a/resources/labelers/structured_model/variables/variables.index b/resources/labelers/structured_model/variables/variables.index deleted file mode 100644 index 627e9a577..000000000 Binary files a/resources/labelers/structured_model/variables/variables.index and /dev/null differ diff --git a/resources/labelers/unstructured_model/keras_metadata.pb b/resources/labelers/unstructured_model/keras_metadata.pb deleted file mode 100644 index dcc84a213..000000000 --- a/resources/labelers/unstructured_model/keras_metadata.pb +++ /dev/null @@ -1,29 +0,0 @@
[29 deleted lines omitted: this unstructured_model/keras_metadata.pb blob (index dcc84a213) is byte-for-byte identical to the structured_model/keras_metadata.pb content summarized above.]
\ No newline at end of file diff --git a/resources/labelers/unstructured_model/variables/variables.data-00000-of-00001 b/resources/labelers/unstructured_model/model.keras similarity index 88% rename from resources/labelers/unstructured_model/variables/variables.data-00000-of-00001 rename to resources/labelers/unstructured_model/model.keras index 95732bf16..795d637da 100644 Binary files a/resources/labelers/unstructured_model/variables/variables.data-00000-of-00001 and b/resources/labelers/unstructured_model/model.keras differ diff --git a/resources/labelers/unstructured_model/saved_model.pb b/resources/labelers/unstructured_model/saved_model.pb deleted file mode 100644 index 76274cae0..000000000 Binary files a/resources/labelers/unstructured_model/saved_model.pb and /dev/null differ diff --git a/resources/labelers/unstructured_model/variables/variables.index b/resources/labelers/unstructured_model/variables/variables.index deleted file mode 100644 index 627e9a577..000000000 Binary files a/resources/labelers/unstructured_model/variables/variables.index and /dev/null differ diff --git a/setup.cfg b/setup.cfg index dd0e2235f..8ad51bd86 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,10 +1,19 @@ +[versioneer] +vcs = git +style = pep440 +versionfile_source = dataprofiler/_version.py +versionfile_build = dataprofiler/_version.py +tag_prefix = "" +parentdir_prefix = dataprofiler- [flake8]
max-line-length = 88 extend-ignore = E203 +exclude = versioneer.py, dataprofiler/_version.py [isort] + multi_line_output=3 -skip=dataprofiler/tests/data/,venv/ +skip=dataprofiler/tests/data/,venv/, versioneer.py, dataprofiler/_version.py profile=black include_trailing_comma=True force_grid_wrap=0 diff --git a/setup.py b/setup.py index f8b5eaf8e..f1f799446 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ from setuptools import find_packages, setup # Load package version -from dataprofiler.version import __version__ +import versioneer here = path.abspath(path.dirname(__file__)) @@ -53,8 +53,9 @@ setup( name="DataProfiler", - version=__version__, - python_requires=">=3.8", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + python_requires=">=3.10", description=DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", diff --git a/tox.ini b/tox.ini index 55fa50147..caf70d437 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37, py38, py39, py310, docs, pypi-description, manifest, precom +envlist = py39, py310, py311, pypi-description, manifest, precom [testenv] @@ -16,32 +16,37 @@ deps = -rrequirements-reports.txt -rrequirements-test.txt commands = - python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:dist/coverage.xml --forked + python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:coverage.xml +# add "docs" to `envlist` to run the docs build #[testenv:docs] #extras = docs #changedir = docs #commands = sphinx-build -b html source _build [testenv:pypi-description] -skip_install = true deps = + {[testenv]deps} twine wheel - pip >= 19.0.0 +skip_install = true commands = python setup.py sdist bdist_wheel twine check dist/* [testenv:manifest] -deps = check-manifest +deps = + {[testenv]deps} + check-manifest skip_install = true commands = check-manifest # skip isort for infinite loop issues between tox and top level settings [testenv:precom] skip_install = true -deps = pre-commit +deps = + {[testenv]deps} + pre-commit commands = pre-commit run black --all-files --verbose # if you use the walrus operator on Python 3.8 disable the flake8 check diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 000000000..fcbc15bd1 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1741 @@ +# Version: 0.19 + +"""The Versioneer - like a rocketeer, but for versions. +The Versioneer +============== +* like a rocketeer, but for versions! +* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. 
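As a quick orientation for the `setup.py` and `setup.cfg` hunks above, here is a minimal sketch of how the versioneer-managed version is meant to be consumed; the version strings are illustrative, and `dataprofiler.__version__` is assumed to be populated from the generated `dataprofiler/_version.py` as the docstring below describes.

    # build time: setup.py delegates versioning to the vendored versioneer.py
    import versioneer

    version = versioneer.get_version()    # e.g. "0.10.0+2.g1076c97.dirty" (illustrative)
    cmdclass = versioneer.get_cmdclass()  # build_py/sdist/version commands that freeze _version.py

    # run time (assumption): the generated dataprofiler/_version.py backs the package version
    import dataprofiler
    print(dataprofiler.__version__)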
+## Quick Install +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` +## Version Identifiers +Source trees come from a variety of places: +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). +The version identifier is used for multiple purposes: +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball +## Theory of Operation +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. +## Installation +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. +## Version-String Flavors +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. 
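As a concrete, hypothetical illustration of the call described above (field values are placeholders; the keys are enumerated next):

    import versioneer

    info = versioneer.get_versions()
    # e.g. {"version": "0.11+2.g1076c97.dirty",
    #       "full-revisionid": "1076c978a8d3cfc70f408fe5974aa6c092c949ac",
    #       "dirty": True,
    #       "error": None,
    #       "date": "2015-10-05T12:00:00-0400"}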
+Both functions return a dictionary with different flavors of version +information: +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions +## Styles +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. +## Debugging +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). +## Known Limitations +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/python-versioneer/python-versioneer/issues). +### Subprojects +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). 
The are +two common reasons why `setup.py` might not be in the root: +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). +`pip install --editable .` should work correctly. `setup.py install` might +work too. +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. +### Editable installs with setuptools <= 18.5 +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. +## Updating Versioneer +To upgrade your project to a new release of Versioneer, do the following: +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files +## Future Directions +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . 
The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. +## Similar projects +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer +## License +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . +[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg +[pypi-url]: https://pypi.python.org/pypi/versioneer/ +[travis-image]: +https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg +[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer +""" + +import configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. 
+ me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . + setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.ConfigParser() + with open(setup_cfg) as f: + parser.read_file(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print(f"unable to find command, tried {commands}") + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY[ + "git" +] = r''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. +# This file is released into the public domain. 
Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) +"""Git implementation of _version.py.""" +import errno +import os +import re +import subprocess +import sys +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" +LONG_VERSION_PY = {} +HANDLERS = {} +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + # now we have TAG-NUM-gHEX or HEX + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + # commit: short hex revision ID + pieces["short"] = mo.group(3) + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + return pieces +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + The ".dev0" means dirty. + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + if not style or style == "default": + style = "pep440" # the default + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + cfg = get_config() + verbose = cfg.verbose + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs) + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
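+        # Hypothetical example: refs {"HEAD", "master", "1.2.3"} keeps only {"1.2.3"}
+        # here, since only refnames containing a digit are treated as version tags.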
+ tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
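+            # Hypothetical example: a parseable describe is TAG-NUM-gHEX, e.g.
+            # "1.2.3-4-gabcdef0" -> tag "1.2.3", distance 4, short hash "abcdef0";
+            # reaching this branch means describe_out did not match that pattern.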
+ pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except OSError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.19) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+import json +version_json = ''' +%s +''' # END VERSION_JSON +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except OSError: + raise NotThisMethod("unable to read _version.py") + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) + if not mo: + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set {} to '{}'".format(filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + The ".dev0" means dirty. + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
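+    # Hypothetical trace of the fallback order implemented below: expanded
+    # git-archive keywords, then a static _version.py, then "git describe"
+    # via the VCS handlers, then the parent directory name.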
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print(f"got version from file {versionfile_abs} {ver}") + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(cmdclass=None): + """Get the custom setuptools/distutils subclasses used by Versioneer. + If the package uses a different cmdclass (e.g. one from numpy), it + should be provide as an argument. + """ + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? + + # we override different "build_py" commands for both environments + if "build_py" in cmds: + _build_py = cmds["build_py"] + elif "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_py"] = cmd_build_py + + if "setuptools" in sys.modules: + from setuptools.command.build_ext import build_ext as _build_ext + else: + from distutils.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_ext"] = cmd_build_ext + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if "py2exe" in sys.modules: # py2exe enabled? 
+ from py2exe.distutils_buildexe import py2exe as _py2exe + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "sdist" in cmds: + _sdist = cmds["sdist"] + elif "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- +You will also need to edit your setup.py to use the results: + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy) as f: + old = f.read() + except OSError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in) as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except OSError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)
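The versioneer.py added above is easiest to follow with a few concrete sketches. Starting with the rendering helpers: below is a minimal, self-contained mirror of the render_git_describe() logic from this file; the function name and the sample pieces values are illustrative only and are not part of the diff.

# Mirrors render_git_describe() above: TAG[-DISTANCE-gHEX][-dirty]
def demo_render_git_describe(pieces):
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"]:
            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # no tags reachable: bare short hash, no 'g' prefix
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered

if __name__ == "__main__":
    # a hypothetical checkout 3 commits past tag v0.10.3 with local edits
    sample = {"closest-tag": "v0.10.3", "distance": 3,
              "short": "abc1234", "dirty": True}
    print(demo_render_git_describe(sample))  # -> v0.10.3-3-gabc1234-dirty

render(pieces, "git-describe") wraps the same string in a dict together with the full revision id, dirty flag, and date, which is what get_versions() ultimately returns.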
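get_version() and get_cmdclass() are the two entry points a project is expected to call from its setup.py, as the CONFIG_ERROR help text spells out. A sketch of that wiring, with a placeholder project name:

import versioneer
from setuptools import setup

setup(
    name="myproject",                    # placeholder name
    version=versioneer.get_version(),    # e.g. "0.10.3+3.gabc1234" under the pep440 style
    cmdclass=versioneer.get_cmdclass(),  # registers the version/build_py/build_ext/sdist overrides
)

With this in place, `python setup.py version` runs the cmd_version command defined in get_cmdclass() and reports the rendered version, full revision id, dirty flag, and date.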
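cmd_build_py, cmd_build_ext, and cmd_sdist all follow the same pattern: subclass the stock command, let it run, then rewrite the generated _version.py in the output tree. A stripped-down sketch of that pattern, using a hypothetical package name and stamp file rather than versioneer's own write_to_version_file():

import os
from setuptools.command.build_py import build_py as _build_py

class build_py_with_stamp(_build_py):
    def run(self):
        _build_py.run(self)  # copy the package sources into self.build_lib first
        # then overwrite a generated module in the built tree (hypothetical path)
        target = os.path.join(self.build_lib, "myproject", "_stamp.py")
        print("UPDATING %s" % target)
        with open(target, "w") as f:
            f.write("STAMP = 'written after build_py ran'\n")

Because get_cmdclass() starts from a copy of whatever cmdclass the caller passes in, a project can layer its own command subclasses on top of these.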
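do_setup() reads its settings from the [versioneer] section of setup.cfg via configparser. The sketch below shows such a section being parsed; the values are the placeholders from the CONFIG_ERROR template above, not this repository's actual configuration.

import configparser

SETUP_CFG = """
[versioneer]
VCS = git
style = pep440
versionfile_source = src/myproject/_version.py
versionfile_build = myproject/_version.py
tag_prefix =
parentdir_prefix = myproject-
"""

parser = configparser.ConfigParser()
parser.read_string(SETUP_CFG)
print(parser.get("versioneer", "style"))       # -> pep440
print(parser.get("versioneer", "tag_prefix"))  # -> "" (empty prefix)

Running `python versioneer.py setup` (the __main__ branch above) then writes the configured _version.py, appends INIT_PY_SNIPPET to the package __init__.py if it is missing, and adds the required `include` lines to MANIFEST.in.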