From 51822e9f145fb4dfd5b4415de8d50192cdc55a3c Mon Sep 17 00:00:00 2001 From: Bobronium Date: Mon, 1 Sep 2025 00:00:49 +0200 Subject: [PATCH] Add duper.deepcopy, duper.reprx, BytecodeBuilder, mypyc, rework the API and more --- .github/workflows/build.yml | 42 ++ .github/workflows/publish.yml | 49 ++ .github/workflows/style.yml | 20 +- .github/workflows/test.yml | 30 +- .github/workflows/types.yml | 30 +- .gitignore | 1 + .pre-commit-config.yaml | 34 ++ LICENSE | 2 +- LICENSES/MPL-2.0.txt | 373 ++++++++++++ README.md | 112 +++- data/__init__.py | 470 +++++++++++++++ duper/__about__.py | 2 +- duper/__init__.py | 269 ++------- duper/_compile_support.py | 78 +++ duper/_compile_test.py | 5 + duper/_compile_test_2.py | 5 + duper/_empty_module.py | 3 + duper/_msg.py | 9 +- duper/builders/__init__.py | 7 + duper/builders/_helpers.py | 70 +++ duper/builders/_source.py | 62 ++ duper/builders/_tracking.py | 72 +++ duper/builders/ast.py | 1012 +++++++++++++++++++++++++++++++++ duper/builders/bytecode.py | 917 +++++++++++++++++++++++++++++ duper/builders/runtime.py | 331 +++++++++++ duper/constants.py | 112 +++- duper/copy.py | 235 ++++++++ duper/debug.py | 249 ++++++++ duper/factories/__init__.py | 3 - duper/factories/ast.py | 372 ------------ duper/factories/runtime.py | 121 ---- duper/fastast.py | 320 +++++++++-- pyproject.toml | 274 +++++---- showcase.py | 29 + tests/__init__.py | 4 +- tests/comparer.py | 579 +++++++++++++++++++ tests/test_copy.py | 346 ++++++----- tests/test_factories.py | 269 +++++++++ timesup.py | 293 ++++++++++ 39 files changed, 6088 insertions(+), 1123 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .pre-commit-config.yaml create mode 100644 LICENSES/MPL-2.0.txt create mode 100644 data/__init__.py create mode 100644 duper/_compile_support.py create mode 100644 duper/_compile_test.py create mode 100644 duper/_compile_test_2.py create mode 100644 duper/_empty_module.py create 
mode 100644 duper/builders/__init__.py create mode 100644 duper/builders/_helpers.py create mode 100644 duper/builders/_source.py create mode 100644 duper/builders/_tracking.py create mode 100644 duper/builders/ast.py create mode 100644 duper/builders/bytecode.py create mode 100644 duper/builders/runtime.py create mode 100644 duper/copy.py create mode 100644 duper/debug.py delete mode 100644 duper/factories/__init__.py delete mode 100644 duper/factories/ast.py delete mode 100644 duper/factories/runtime.py create mode 100644 showcase.py create mode 100644 tests/comparer.py create mode 100644 tests/test_factories.py create mode 100644 timesup.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..a71bbfe --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,42 @@ +name: Wheels + +on: + push: + branches: [ main, poc, pub ] + pull_request: + branches: [ main, poc, pub ] + +jobs: + build-test-wheels: + # Separate native runners so mac gets native arm64 & x86_64 (universal2), Windows gets MSVC, etc. 
+ strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest, ubuntu-24.04-arm, windows-latest, windows-11-arm, macos-13, macos-14 ] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + + - name: Set up QEMU + if: runner.os == 'Linux' && runner.arch == 'X64' + uses: docker/setup-qemu-action@v3 + with: + platforms: all + + - name: Setup uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Build wheels + uses: pypa/cibuildwheel@v3.1.3 + env: + CIBW_BUILD_VERBOSITY: 1 + CIBW_ARCHS_LINUX: ${{ runner.arch == 'X64' && 'auto ppc64le s390x' || 'auto' }} + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..7f17935 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,49 @@ +name: Publish + +on: + push: + tags: [ "*" ] + + +jobs: + build: + uses: ./.github/workflows/build.yml + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + - name: Build sdist + run: uv build --sdist --no-sources + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: sdist + path: dist/*.tar.gz + + publish: + needs: [ build, sdist ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + - name: Download wheels + uses: actions/download-artifact@v4 + with: + pattern: wheels-* + merge-multiple: true + path: dist + - name: Download sdist + uses: actions/download-artifact@v4 + with: + name: sdist + path: dist + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index fd9bf28..d8fed60 100644 --- a/.github/workflows/style.yml 
+++ b/.github/workflows/style.yml @@ -2,9 +2,8 @@ name: style on: push: - branches: [main, poc, pub] + branches: [ main, poc, pub ] pull_request: - branches: [main, poc, pub] concurrency: group: style-${{ github.head_ref }} @@ -21,13 +20,18 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python "3.11" + - name: Set up Python "3.13" uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.13" - - name: Install Hatch - run: pip install --upgrade hatch + - name: Setup uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Run ruff + run: uv run --python-preference system --extra style ruff check duper - - name: Run check - run: hatch run style:check + - name: Run ruff format + run: uv run --python-preference system --extra style ruff format duper \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 424a242..5d31753 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,17 +2,13 @@ name: test on: push: - branches: [main, poc, pub] + branches: [ main, poc, pub ] pull_request: - branches: [main, poc, pub] concurrency: group: test-${{ github.head_ref }} cancel-in-progress: true -env: - PYTHONUNBUFFERED: "1" - FORCE_COLOR: "1" jobs: cov: @@ -21,19 +17,21 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.9', '3.10', '3.11'] + os: [ ubuntu-latest, windows-latest, macos-14 ] + python-version: [ '3.10', '3.11', '3.12', '3.13', '3.14.0-rc.2' ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - - name: Install Hatch - run: pip install --upgrade hatch + - name: Setup uv + uses: astral-sh/setup-uv@v6 
+ with: + enable-cache: true - - name: Run tests - run: hatch run test:cov + - name: Run tests + run: uv run --python-preference system --extra test pytest tests --cov-report=term-missing --cov-config=pyproject.toml --cov=duper --cov=tests diff --git a/.github/workflows/types.yml b/.github/workflows/types.yml index 8dc4789..96876c4 100644 --- a/.github/workflows/types.yml +++ b/.github/workflows/types.yml @@ -2,17 +2,13 @@ name: types on: push: - branches: [main, poc, pub] + branches: [ main, poc, pub ] pull_request: - branches: [main, poc, pub] concurrency: group: types-${{ github.head_ref }} cancel-in-progress: true -env: - PYTHONUNBUFFERED: "1" - FORCE_COLOR: "1" jobs: check: @@ -22,19 +18,21 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.9', '3.10', '3.11'] + os: [ ubuntu-latest, windows-latest, macos-14 ] + python-version: [ '3.10', '3.11', '3.12', '3.13', '3.14.0-rc.2'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - - name: Install Hatch - run: pip install --upgrade hatch + - name: Setup uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true - - name: Run tests - run: hatch run types:check + - name: Run mypy + run: uv run --python-preference system --extra types mypy duper diff --git a/.gitignore b/.gitignore index 2db9270..3aee05f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.crush/ cmake-build-*/ *.iws out/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8af4e49 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +repos: + - repo: local + hooks: + - id: ruff-format + name: ruff format + entry: ruff format + 
language: system + types: [ python ] + + - id: ruff-check + name: ruff check --fix + entry: ruff check --fix + language: system + types: [ python ] + + - id: reuse + name: reuse lint + entry: reuse annotate --license MPL-2.0 --copyright "Arseny Boykov (Bobronium) " -y 2023-present + types: [ python ] + language: system + + - id: mypy + name: mypy + entry: mypy duper + language: system + types: [ python ] + stages: [ push ] + + - id: pytest + name: pytest + entry: pytest tests + language: system + pass_filenames: false + stages: [ push ] \ No newline at end of file diff --git a/LICENSE b/LICENSE index 293ba74..43ee59d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2023, Bobronium +Copyright (c) 2023, Arseny Boykov All code in this repository except where explicitly noted otherwise is released under the Mozilla Public License v 2.0. You can obtain a copy at https://mozilla.org/MPL/2.0/. diff --git a/LICENSES/MPL-2.0.txt b/LICENSES/MPL-2.0.txt new file mode 100644 index 0000000..ee6256c --- /dev/null +++ b/LICENSES/MPL-2.0.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. 
"Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. 
For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at https://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/README.md b/README.md index 74f3c4d..3cf9aad 100644 --- a/README.md +++ b/README.md @@ -2,40 +2,61 @@ 20-50x faster than `copy.deepcopy()` on mutable objects. -Aims to fill the gaps in performance and obscurity between copy, pickle, json and other serialization libraries, becoming the go-to library for copying objects within the same Python process. +Aims to fill the gaps in performance and obscurity between copy, pickle, json and other serialization libraries, +becoming the go-to library for copying objects within the same Python process. ```shell pip install duper ``` -[Skip to FAQ](#faq)... +[Skip to FAQ](#faq)... -Note: In its current implementation, duper.deepdups(x) might be 2-5 times slower than copy.deepcopy() for a single operation. It's when you need to create many identical copies of the same object, using duper.deepdups(x) is going to be advantageous due to its specific design. +Note: In its current implementation, duper.deepdups(x) might be 2-5 times slower than copy.deepcopy() for a single +operation. It's when you need to create many identical copies of the same object, using duper.deepdups(x) is going to be +advantageous due to its specific design. -If you have any feedback or ideas, please [open an issue on GitHub](https://github.com/Bobronium/duper/issues) or reach out via [bobronium@gmail.com](mailto:bobronium@gmail.com) or [Telegram](https://t.me/Bobronium). +If you have any feedback or ideas, please [open an issue on GitHub](https://github.com/Bobronium/duper/issues) or reach +out via [bobronium@gmail.com](mailto:bobronium@gmail.com) or [Telegram](https://t.me/Bobronium). 
--- ### Showcase -##### Using unreleased [timesup](https://github.com/Bobronium/timesup) library \o/. I've planned to release it soon after this one, but had to spend my *time* elswhere and put Open Source on pause. Hopefully, I'll make a first release later this year. + +##### Using unreleased [timesup](https://github.com/Bobronium/timesup) library \o/. I've planned to release it soon after this one, but had to spend my + +*time* elsewhere. Hopefully, I'll make a first release later this year. ```py -import duper import copy +from decimal import Decimal + +import duper from timesup import timesup -@timesup(number=100000, repeats=3) +@timesup(number=1000, profile=not duper.COMPILED, repeats=1) def reconstruction(): - x = {"a": 1, "b": [(1, 2, 3), (4, 5, 6)], "c": [object(), object(), object()]} # i - - copy.deepcopy(x) # ~0.00576 ms (deepcopy) - dup = duper.deepdups(x) # ~0.03131 ms (duper_build) - dup() # ~0.00013 ms (duper_dup): 45.18 times faster than deepcopy + x = {"a": 1, "b": [(1, 2, 3), (4, 5, 6)], "c": [object(), Decimal("3.14"), {()}]} + + copy.deepcopy(x) # ~0.01271 ms (deepcopy) + duper.deepcopy(x) # ~0.00354 ms (duper_deepcopy): 3.59 times faster than deepcopy + + [copy.deepcopy(x) for _ in range(10)] # ~0.12629 ms (deepcopy_10) + [copy.deepcopy(x) for _ in range(100)] # ~1.26483 ms (deepcopy_100) + [copy.deepcopy(x) for _ in range(1000)] # ~12.53537 ms (deepcopy_1000) + + duper.replicate(x, 10) # ~0.03589 ms (replicate_10): 3.52 times faster than deepcopy_10 + duper.replicate(x, 100) # ~0.07625 ms (replicate_100): 16.59 times faster than deepcopy_100 + duper.replicate(x, 1000) # ~0.38839 ms (replicate_1000): 32.28 times faster than deepcopy_1000 + + replicate_x = duper.reconstructs(x) # ~0.04128 ms (build_reconstructor): 3.25 times slower than deepcopy + replicate_x() # ~0.00026 ms (duper_deepcopies): 48.85 times faster than deepcopy ``` ### Real use case + #### Pydantic +
Models definition @@ -62,13 +83,12 @@ class User(BaseModel): } - @wraps(Field) def FastField(default, *args, **kwargs): """ Overrides the fields that need to be copied to have default_factories - """ - default_factory = duper.deepdups(default) + """ + default_factory = duper.reconstructs(default) field_info: FieldInfo = Field(*args, default_factory=default_factory, **kwargs) return field_info @@ -94,68 +114,102 @@ class FastUser(BaseModel): ```py @timesup(number=100000, repeats=3) def pydantic_defaults(): - User(id=42) # ~0.00935 ms (with_deepcopy) + User(id=42) # ~0.00935 ms (with_deepcopy) FastUser(id=1337) # ~0.00292 ms (with_duper): 3.20 times faster than with_deepcopy ``` ### FAQ + #### What's wrong with `copy.deepcopy()`? -Well, it's slow. [Extremely slow](https://stackoverflow.com/questions/24756712/deepcopy-is-extremely-slow), in fact. This has been noted by many, but [no equally powerful alternatives](https://stackoverflow.com/questions/1410615/copy-deepcopy-vs-pickle) were suggested. + +Well, it's slow. [Extremely slow](https://stackoverflow.com/questions/24756712/deepcopy-is-extremely-slow), in fact. +This has been noted by many, +but [no equally powerful alternatives](https://stackoverflow.com/questions/1410615/copy-deepcopy-vs-pickle) were +suggested. #### Why not just rewrite it in C or Rust? -`deepcopy()` needs to examine an arbitrary Python object each time the copy is needed. I figured that this must be quite wasteful, regardless of whether the code that executes this algorithm is compiled or not, since interacting with Python objects inevitably invokes the slow Python interpreter. -When I had a proof of concept, I discovered [gh-72793: C implementation of parts of copy.deepcopy](https://github.com/python/cpython/pull/91610), which further confirmed my assumptions. +`deepcopy()` needs to examine an arbitrary Python object each time the copy is needed. 
I figured that this must be quite +wasteful, regardless of whether the code that executes this algorithm is compiled or not, since interacting with Python +objects inevitably invokes the slow Python interpreter. + +When I had a proof of concept, I +discovered [gh-72793: C implementation of parts of copy.deepcopy](https://github.com/python/cpython/pull/91610), which +further confirmed my assumptions. #### How is `duper` so fast without even being compiled? -Instead of interacting with slow Python objects for each copy, it compiles concrete instructions to reproduces the object. There is still an interpreter overhead when reconstructing the object, but now it already knows the exact actions that are needed and just executes them. + +Instead of interacting with slow Python objects for each copy, it compiles concrete instructions to reproduce the +object. There is still an interpreter overhead when reconstructing the object, but now it already knows the exact +actions that are needed and just executes them. Interestingly, I learned that this approach has a lot in common with how `pickle` and `marshal` work. #### How is it different from `pickle` or `marshal`? -Both are designed for `serialization`, so they need to dump objects to `bytes` that can be stored on disk and then used to reconstruct the object, even in a different Python process. + +Both are designed for `serialization`, so they need to dump objects to `bytes` that can be stored on disk and then used +to reconstruct the object, even in a different Python process. This creates many constraints on the data they can serialize, as well as the speed of reconstruction. -`duper`, however, is not constrained by these problems. It only needs to guarantee that the object can be recreated within the same Python process, and it can use that to its advantage. +`duper`, however, is not constrained by these problems. 
It only needs to guarantee that the object can be recreated +within the same Python process, and it can use that to its advantage. #### Are there any drawbacks to this approach? + Perhaps the only drawback is that it's non-trivial to implement. When it comes to using it, I can't see any fundamental drawbacks, only advantages. -However, there are drawbacks to the current implementation. The approach itself boils down to getting a set of minimal instructions that will produce the needed object. But there are different ways to obtain this set of instructions. The fastest way would be to compile the instructions on the fly while deconstructing the object. However, for the sake of simplicity, I used a slower approach of building an AST that compiles to the desired bytecode. Removing this intermediate step should increase the performance of the initial construction by 20-50 times. +However, there are drawbacks to the current implementation. The approach itself boils down to getting a set of minimal +instructions that will produce the needed object. But there are different ways to obtain this set of instructions. The +fastest way would be to compile the instructions on the fly while deconstructing the object. However, for the sake of +simplicity, I used a slower approach of building an AST that compiles to the desired bytecode. Removing this +intermediate step should increase the performance of the initial construction by 20-50 times. #### Is this a drop-in replacement for `deepcopy`? -Not quite yet, but it aims to be. + +Not quite yet, but it aims to be. #### How should I use it? + `duper` shines when you need to make multiple copies of the same object. 
Here's an example where duper can help the most: + ```python import copy + + data = {"a": 1, "b": [[1, 2, 3], [4, 5, 6]]} copies = [copy.deepcopy(data) for _ in range(10000)] ``` -By pre-compiling instructions in a separate one-time step, we eliminate all of the overhead from the copying phase: + +By pre-compiling instructions in a separate one-time step, we eliminate all of the overhead from the copying phase: + ```python import duper + + data = {"a": 1, "b": [[1, 2, 3], [4, 5, 6]]} reconstruct_data = duper.deepdups(data) copies = [reconstruct_data() for _ in range(10000)] ``` #### Is it production ready? + [Hell no!](#-project-is-in-poc-state) ### 🚧 Project is in a PoC state + Current priorities + - [x] Support for immutable types - [x] Support for builtin types - [x] Support for arbitrary types -- [x] Partial support for `__deepcopy__` and `__copy__` overrides (memo is not respected) -- [ ] Support for recursive structures -- [ ] Find quirky corner cases +- [x] Partial support for `__deepcopy__` and `__copy__` overrides (memo is not fully supported, there are safeguards) +- [x] Support for recursive structures +- [ ] Find quirky corner cases - [ ] Make initial construction faster (potentially 30-50 times faster than current implementation) - [ ] Support memo in `__deepcopy__` and `__copy__` overrides -The project will be ready for release when `duper.deepdups(x)()` behaves the same as `copy.deepcopy()` and is at least as fast, if not faster. +The project will be ready for release when `duper.deepdups(x)()` behaves the same as `copy.deepcopy()` and is at least +as fast, if not faster. 
diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000..455ea64 --- /dev/null +++ b/data/__init__.py @@ -0,0 +1,470 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +""" +Case naming scheme (pytest -k friendly): +- atom:* – small atomic/builtin scalars & singletons +- container:* – plain containers (lists/tuples/sets/dicts/fsets/range/slice) +- alias:* – shared-identity graphs (no true cycles) +- reflexive:* – true cycles / self or mutual refs +- proto:* – protocol-driven objects (__deepcopy__, __getnewargs*__, __reduce*__, __get/setstate__) +- func:* – functions, code objects, bound methods, closures, descriptors +- stdlib:* – stdlib classes, enums, dataclasses, typing, regex, exceptions, types.* +- numeric:* – fractions/decimal/array/memoryview/float edge cases +- path:* – pathlib paths +- time:* – datetime family (aware/naive), timedelta +- large:* – larger graphs meant for perf/bench +- thirdparty::* – optional cases gated by import success +- xfail:pickle:* – commonly non-picklable objects worth exercising + +Export: +- Case +- OBJECTS (tuple[Case, ...]) +""" + +import array +import dataclasses as dc +import datetime as dt +import decimal +import enum +import fractions +import inspect +import math +import re +import traceback +import types +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any, NamedTuple, TypedDict + + +@dataclass +class Case: + name: str + obj: Any + + +def _reflexive_self_list(): + x: list[Any] = [] + x.append(x) + return x + + +def _reflexive_self_dict(): + d: dict[str, Any] = {} + d["self"] = d + return d + + +def _reflexive_mutual_lists(): + a: list[Any] = [] + b: list[Any] = [] + a.append(b) + b.append(a) + return a # (b is reachable via a[0]) + + +def _reflexive_tuple_list(): + t = ([],) + t[0].append(t) + return t + + +def _reflexive_dict_list_cross(): + lst: 
list[Any] = [] + d: dict[str, Any] = {"l": lst} + lst.append(d) + return d + + +def _alias_shared_list_pair(): + shared = ["A", "B"] + return [shared, shared] + + +def _alias_mixed_combo(): + shared = ["X", {"k": [1, 2]}] + return [shared, {"again": shared}, (shared,)] + + +def _alias_deep_shared_with_cycle(): + base: list[Any] = [] + pair = [base, base] + d = {"a": base, "b": pair} + base.append(d) # cycle through base -> d -> base + return d + + +def _large_deep_graph(depth: int = 6, leaf_len: int = 64): + leaf = {"payload": list(range(leaf_len))} + node = leaf + for i in range(depth): + node = {"d": i, "pair": [node, {"wrap": node}]} + root = {"root": node} + root["alias1"] = root["root"] + root["alias2"] = root["root"]["pair"][0] + return root + + +def _bound_method_holder(): + class BM: + def m(self) -> int: + return 1 + + inst = BM() + inst.bound = inst.m # attribute holding a bound method + return inst + + +def _closure_func(): + cap = [1] + + def inner(y: int) -> int: + cap.append(y) + return sum(cap) + + return inner + + +def _mappingproxy(): + return types.MappingProxyType({"a": [1, 2], "b": {"k": 3}}) + + +# ----------------------------- Protocol classes ----------------------------- + + +class ProtoDeepCopy: + def __init__(self, xs) -> None: + self.xs = xs + + def __deepcopy__(self, memo=None): + import copy + + cls = type(self) + obj = cls(copy.deepcopy(self.xs, memo)) + return obj + + def __eq__(self, other): # pragma: no cover (equality helper) + return isinstance(other, ProtoDeepCopy) and self.xs == other.xs + + +class ProtoGetNewArgs(int): + def __new__(cls, payload): + self = int.__new__(cls, 7) + self.payload = payload + return self + + def __getnewargs__(self): + return (self.payload,) + + +class ProtoGetNewArgsEx(int): + def __new__(cls, *, data): + self = int.__new__(cls, 9) + self.data = data + return self + + def __getnewargs_ex__(self): + return (), {"data": self.data} + + +class ProtoReduce: + def __init__(self, a, b) -> None: + self.a, 
self.b = a, b + + def __reduce__(self): + def _rebuild(a, b): + obj = ProtoReduce.__new__(ProtoReduce) + obj.a, obj.b = a, b + return obj + + return (_rebuild, (self.a, self.b)) + + +class ProtoGetStateSetState: + def __init__(self, foo) -> None: + self.foo = foo + + def __getstate__(self): + return {"foo": self.foo} + + def __setstate__(self, st): + self.__dict__.update(st) + + +class SlotClass: + __slots__ = ("a", "b") + + def __init__(self, a, b) -> None: + self.a, self.b = a, b + + def __eq__(self, other: object) -> bool: # pragma: no cover + return isinstance(other, SlotClass) and (self.a, self.b) == (other.a, other.b) + + +# ----------------------------- Enums / typing-ish --------------------------- + + +class Animal(enum.Enum): + CAT = 1 + DOG = 2 + + +class Point(NamedTuple): + x: int + y: int + meta: Any = None + + +class UserTD(TypedDict): + id: int + name: str + + +# ----------------------------- Standard library objects --------------------- + + +@dc.dataclass +class UserDC: + name: str + tags: list[str] + props: dict[str, Any] + + +@dc.dataclass(frozen=True) +class FrozenPair: + x: int + y: tuple[int, ...] + + +# pattern, traceback, signature, etc. 
+def _make_traceback_exception(): + try: + 1 / 0 + except Exception as e: + return traceback.TracebackException.from_exception(e) + + +# ----------------------------- Third-party (optional) ----------------------- + + +def _thirdparty_cases(): + out: list[Case] = [] + # numpy + try: + import numpy as np + + def _np_object_alias_array(): + shared = {"k": [1, 2]} + arr = np.empty(3, dtype=object) + arr[:] = [shared, shared, shared] + return arr + + out += [ + Case("thirdparty:numpy:array_i32_2x3", np.arange(6, dtype=np.int32).reshape(2, 3)), + Case( + "thirdparty:numpy:array_f64_nan_inf", np.array([0.0, -0.0, np.nan, np.inf, -np.inf]) + ), + Case( + "thirdparty:numpy:array_structured", + np.array([(1, 2.0), (3, 4.5)], dtype=[("a", "i4"), ("b", "f8")]), + ), + Case("thirdparty:numpy:array_object_alias", _np_object_alias_array()), + ] + except Exception: + pass + # pandas + try: + import pandas as pd + + out += [ + Case("thirdparty:pandas:series_Int64_na", pd.Series([1, None, 3], dtype="Int64")), + Case( + "thirdparty:pandas:dataframe_categorical", + pd.DataFrame({"a": [1, 2, 1], "b": pd.Categorical(["x", "y", "x"])}), + ), + Case( + "thirdparty:pandas:dataframe_dt_tz", + pd.DataFrame( + {"ts": pd.to_datetime(["2025-08-30T12:00:00Z", "2025-08-31T00:00:00Z"])} + ), + ), + ] + except Exception: + pass + # pydantic (v1/v2 compatible imports) + try: + from pydantic import BaseModel, Field + + class UserModel(BaseModel): + id: int + name: str = "Alice" + tags: list[str] = Field(default_factory=lambda: ["x"]) + + shared = ["tag", {"k": [1, 2]}] + model = UserModel(id=1, name="Bob", tags=shared) + out += [Case("thirdparty:pydantic:basemodel_with_alias", {"model": model, "alias": shared})] + except Exception: + pass + # attrs + try: + import attrs + + @attrs.define + class A: + x: int + y: list[int] + + out += [Case("thirdparty:attrs:define_instance", A(1, [2, 3]))] + except Exception: + pass + # msgspec + try: + import msgspec + + @msgspec.struct + class S: + x: int + y: 
list[int] + + out += [Case("thirdparty:msgspec:struct_instance", S(1, [2, 3]))] + except Exception: + pass + # Pillow + try: + from PIL import Image + + out += [ + Case("thirdparty:pillow:image_rgba_2x2", Image.new("RGBA", (2, 2), (255, 0, 0, 128))) + ] + except Exception: + pass + # torch + try: + import torch + + out += [Case("thirdparty:torch:tensor_long_2x3", torch.arange(6).reshape(2, 3))] + except Exception: + pass + # sympy + try: + import sympy as sp + + x = sp.Symbol("x") + out += [Case("thirdparty:sympy:expr_sin2_plus_cos2", sp.sin(x) ** 2 + sp.cos(x) ** 2)] + except Exception: + pass + # networkx + try: + import networkx as nx + + g = nx.DiGraph() + g.add_edge("a", "b") + g.add_edge("b", "a") + out += [Case("thirdparty:networkx:digraph_2cycle", g)] + except Exception: + pass + return tuple(out) + + +# ----------------------------- OBJECTS -------------------------------------- + +OBJECTS: tuple[Case, ...] = ( + # --- atom --- + Case("atom:none", None), + Case("atom:NoneType", type(None)), + Case("atom:bool_true", True), + Case("atom:bool_true", bool), + Case("atom:int_big", 2**120), + Case("atom:float_pi", 3.1415926535), + Case("atom:complex_unit_imag", 1j), + Case("atom:str_unicode", "hello\u1234"), + Case("atom:bytes_small", b"bytes"), + Case("atom:bytearray_small", bytearray(b"ba")), + Case("atom:uuid4", uuid.uuid4()), + Case("atom:ellipsis", ...), + Case("atom:not_implemented", NotImplemented), + # --- numeric --- + Case("numeric:fraction_355_113", fractions.Fraction(355, 113)), + Case("numeric:decimal_pi", decimal.Decimal("3.1415926535")), + Case("numeric:array_i", array.array("i", [1, 2, 3, 4])), + Case("numeric:float_edges", [0.0, -0.0, math.inf, -math.inf, math.nan]), + Case("numeric:memoryview_bytes", memoryview(b"\x00\x01\xfe\xff")), + # --- time --- + Case("time:date_2025_08_30", dt.date(2025, 8, 30)), + Case("time:time_12_34_56_789", dt.time(12, 34, 56, 789)), + Case("time:datetime_naive", dt.datetime(2025, 8, 30, 12, 34, 56, 789)), + 
Case("time:timedelta_3d7s", dt.timedelta(days=3, seconds=7)), + # zoneinfo if available + *( + lambda: ( + Case( + "time:datetime_tz_zoneinfo_utc", + dt.datetime(2025, 8, 30, 12, 0, tzinfo=__import__("zoneinfo").ZoneInfo("UTC")), + ), + ) + if __import__("importlib").util.find_spec("zoneinfo") + else () + )(), + # --- path --- + Case("path:relative", Path("some/relative/path.txt")), + Case("path:absolute", Path("/tmp/example.bin")), + # --- container (plain, immutable or no alias) --- + Case("container:list_simple", [1, 2, 3, 43]), + Case("container:tuple_simple", (1, 2, 3)), + Case("container:dict_mixed_keys", {"a": True, 42: "answer", (1, 2): "tuple-key"}), + Case("container:set_small", {3, 2, 1}), + Case("container:frozenset_small", frozenset({1, 2, 3})), + Case("container:slice_1_10_2", slice(1, 10, 2)), + Case("container:range_5_50_5", range(5, 50, 5)), + # --- alias (shared identity, no cycles) --- + Case("alias:list_shared_pair", _alias_shared_list_pair()), + Case("alias:mixed_shared_combo", _alias_mixed_combo()), + Case( + "alias:repeated_dataclass", + (lambda inst: [inst, inst, {"again": inst}])( + (lambda: dc.make_dataclass("D", [("x", int)])(7))() + ), + ), + # --- reflexive (true cycles) --- + Case("reflexive:self_list", _reflexive_self_list()), + Case("reflexive:self_dict", _reflexive_self_dict()), + Case("reflexive:mutual_lists", _reflexive_mutual_lists()), + Case("reflexive:tuple_list", _reflexive_tuple_list()), + Case("reflexive:dict_list_cross", _reflexive_dict_list_cross()), + Case("reflexive:deep_shared_subgraph", _alias_deep_shared_with_cycle()), + # --- func / code / descriptors --- + Case("func:builtin_max", max), + Case("func:code_object", (lambda: None).__code__), + Case("func:plain_function", (lambda x, y=3: x + y)), # top-level lambda literal + Case("func:bound_method_attr", _bound_method_holder()), + Case("func:property_descriptor", property()), + # likely non-picklable but interesting to deepcopy + Case("xfail:pickle:closure_function", 
_closure_func()), + Case("xfail:pickle:mappingproxy", _mappingproxy()), + # --- proto (copy/pickle protocols) --- + Case("proto:__deepcopy__", ProtoDeepCopy([1, [2, 3]])), + Case("proto:__getnewargs__", ProtoGetNewArgs([1, 2, 3])), + Case("proto:__getnewargs_ex__", ProtoGetNewArgsEx(data={"k": [1, 2]})), + Case("proto:__reduce__", ProtoReduce(a=[1, 2], b={"k": [3]})), + Case("proto:getstate_setstate", ProtoGetStateSetState([42])), + Case("proto:slots_class", SlotClass([1, 2], {"k": [3]})), + # --- stdlib miscellany --- + Case("stdlib:enum_animal_cat", Animal.CAT), + Case("stdlib:namedtuple_point", Point(1, 2, {"k": [3, 4]})), + Case("stdlib:typeddict_user", (lambda: {"id": 7, "name": "Alice"})()), + Case("stdlib:dataclass_user", UserDC("alice", ["x", "y"], {"score": [1, 2, 3]})), + Case("stdlib:dataclass_frozen_pair", FrozenPair(1, (2, 3, 4))), + Case( + "stdlib:regex_word_icase_multiline", re.compile(r"\w+", flags=re.IGNORECASE | re.MULTILINE) + ), + Case("stdlib:inspect_signature", inspect.signature(lambda a, b: (a, b))), + Case("stdlib:traceback_exception", _make_traceback_exception()), + Case("stdlib:types_simple_namespace", types.SimpleNamespace(a=[1, 2], b={"k": 3})), + # --- large / perf --- + Case("large:deep_graph_d6_leaf64", _large_deep_graph(35, 64)), +) + _thirdparty_cases() diff --git a/duper/__about__.py b/duper/__about__.py index 3ceefc4..160b2ad 100644 --- a/duper/__about__.py +++ b/duper/__about__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023 Bobronium +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) # # SPDX-License-Identifier: MPL-2.0 diff --git a/duper/__init__.py b/duper/__init__.py index 03e3e5d..19aec6a 100644 --- a/duper/__init__.py +++ b/duper/__init__.py @@ -1,215 +1,64 @@ -# SPDX-FileCopyrightText: 2023 Bobronium +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) # # SPDX-License-Identifier: MPL-2.0 from __future__ import annotations -import copy -from collections import OrderedDict # noqa -from 
collections.abc import Callable -from collections.abc import Iterable -from functools import partial -from typing import Any -from typing import NoReturn -from typing import TypeVar -from typing import cast - -from duper import _msg -from duper.constants import BUILTIN_COLLECTIONS -from duper.constants import BUILTIN_MUTABLE -from duper.constants import IMMUTABLE_NON_COLLECTIONS -from duper.constants import IMMUTABLE_TYPES -from duper.constants import BuiltinCollectionType -from duper.constants import BuiltinMutableType -from duper.factories.ast import ast_factory -from duper.factories.runtime import debunk_reduce -from duper.factories.runtime import get_reduce -from duper.factories.runtime import reconstruct_copy -from duper.factories.runtime import returns - - -T = TypeVar("T") - - -Constructor = Callable[[], T] -Factory = Callable[[T], Constructor[T]] - - -class Error(copy.Error, TypeError): - """ - Copy module can rise either copy.Error or any other exception that happens during copying - Duper will do its best effort to always rise only duper.Error subclasses - """ - - -def warn( - obj: T, memo: Any, factory: Callable[[T], Callable[[], T]], error: Exception -) -> Callable[[], T]: - import warnings - - warnings.warn( - f"Can't use `{_msg.repr(deepdups)}(..., factory={_msg.repr(factory)})` to copy this {_msg.repr(obj)}:" - "\n" + " " * (len(_msg.repr(Error)) + 3) + f"{error!r}" - f"\nFalling back to builtin copy.deepcopy()" - f"\nNote: such fallbacks may be slow, if they happen too often, consider using copy.deepcopy() directly", - RuntimeWarning, - stacklevel=3, - ) - return partial(copy.deepcopy, obj, memo) - - -def fail(obj: T, _: Any, factory: Callable[[T], Callable[[], T]], error: Exception) -> NoReturn: - __tracebackhide__ = True - - raise Error( - f"Can't use `{_msg.repr(deepdups)}(..., factory={_msg.repr(factory)})` to copy this {_msg.repr(obj)}:" - "\n" + " " * (len(_msg.repr(Error)) + 3) + f"{error!r}" - f"\n\nTip: `{_msg.repr(deepdups)}(..., 
fallback={_msg.repr(warn)})` will fallback to standard deepcopy on errors" - ) from error - - -def deepdups( - obj: T, - /, - *, - factory: Callable[[T], Callable[[], T]] = ast_factory, - fallback: Callable[..., Callable[[], T]] = fail, - check: bool = True, -) -> Callable[[], T]: - """ - Finds the fastest way of deep-copying an object. - - If object is immutable, it will be returned as is. - If it's an empty builtin collection, it will return its class (list, dict, etc.) - - If obj is non-empty builtin collection, it will check if all values - - Then it will check for __deepcopy__ method and will use it, if it's defined. - - Constructs a factory that knows how to reconstruct an object _fast_. - - :param obj: object to reconstruct - :param factory: an internal factory that will do the work if we - :param fallback: - :param check: - """ - if (cls := cast(type[Any], type(obj))) in IMMUTABLE_NON_COLLECTIONS or issubclass(cls, type): - return partial(returns, obj) - # special case for empty collections. 
should also work for empty tuples since they are constant - if (builtin := cls in BUILTIN_COLLECTIONS) and not obj: - return cls - - if builtin: - if cls is dict: - container: Iterable[Any] = cast("dict[Any, Any] | OrderedDict[Any, Any]", obj).values() - else: - container = cast(BuiltinCollectionType, obj) - - if all(type(v) in IMMUTABLE_NON_COLLECTIONS for v in container): - if cls in BUILTIN_MUTABLE: - return cast(Callable[[], T], cast(BuiltinMutableType, obj).copy().copy) - return partial(returns, obj) # it's a shallow tuple or frozenset - else: - # seems like we can't speed things up here, unfortunately - # being consistent with builtin deepcopy is better - # than being just faster - if (cp := getattr(obj, "__deepcopy__", None)) is not None: - return partial(cp({}).__deepcopy__, {}) - - try: - compiled = factory(obj) - if not check: - return compiled - try: - compiled() - except Exception as e: - raise Error("Cannot reconstruct this object, see details above") from e - return compiled - except Exception as e: - return fallback(obj, None, factory, e) - - -def deepdupe( - obj: T, - memo: Any = None, - *, - factory: Factory[T] = ast_factory, - fallback: Callable[[T, Any, Factory[T], Exception], Constructor[T]] = fail, -) -> T: - """ - Mirrors interface of copy.deepcopy. Mostly here for test and research purposes. - Constructs a factory, calls it and throws it away, returning the result. - - It's generally going to be slower than deepcopy if used that way. - - If speed is important for your application, you should use `duper.depdupes` or `duper.Duper` instead. 
- - >>> o = {"a": {}} - >>> c = deepdupe() - >>> assert o == c - >>> assert o["a"] is not c["a"] - - - :return: - """ - if memo is not None: # error: Local variable "memo" has inferred type None; add an annotation - return fallback( - obj, - memo, - factory, - NotImplementedError("Usage of memo is not supported."), - )() - return deepdups(obj, factory=factory, fallback=fallback)() - - -def dups(obj: T) -> Callable[[], T]: - """ - Finds the fastest way to repeatedly copy an object and returns copy factory - """ - # handle two special cases when we don't need to build any fancy reconstructor - if (cls := cast(type[Any], type(obj))) in IMMUTABLE_TYPES or issubclass(cls, type): - return partial(returns, obj) # can just always return the same object - - if cls in BUILTIN_COLLECTIONS and not obj: - return cls # special case for empty collections - - if cls in BUILTIN_MUTABLE: - return cast(Callable[[], T], cast(BuiltinMutableType, obj).copy().copy) - if cp := getattr(obj, "__copy__", None): - return cast(Callable[[], T], cp().__copy__) - - rv = get_reduce(obj, cls) - if isinstance(rv, str): - return partial(returns, obj) - - func, args, kwargs, *rest = debunk_reduce(*rv) - - if any(r is not None for r in rest): - return partial(reconstruct_copy, func, args, kwargs, *rest) - - return partial(func, *args, **kwargs) - - -def dupe(obj: T) -> T: - """ - Mirrors interface of copy.copy. Mostly useful for testing purposes. - - Constructs a factory, calls it and throws it away, returning the result. - - It's generally going to be slower than deepcopy if used that way. - - If speed is important for your application, you should use `duper.depdupes` or `duper.Duper` instead. 
- - >>> o = {"a": {}} - >>> c = deepdupe() - >>> assert o == c - >>> assert o["a"] is not c["a"] - - - :param obj: - :param memo: - :param factory: - :param fallback: - :return: - """ - return dups(obj)() +import warnings +from typing import Any, TypeVar, TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable + from types import FunctionType, ModuleType + +from duper._compile_support import COMPILED # this import must stay above any other duper imports + + +def _replicate(factory: Callable[[], _T], n: int) -> list[_T]: + return [factory() for _ in range(n)] + + +from duper.copy import copy, copies, deepcopy, reconstructs, Error, warn, fail, replicate # noqa: E402 +from duper.builders.runtime import returns # noqa: E402 +from duper.builders.ast import reprx # noqa: E402 +from duper import builders # noqa: E402 + +_T = TypeVar("_T") +__all__ = [ + "COMPILED", + "Error", + "builders", + "copies", + "copy", + "deepcopy", + "fail", + "reconstructs", + "replicate", + "reprx", + "returns", + "warn", +] + +for _attr in __all__: + if hasattr(public_object := globals()[_attr], "__module__"): + public_object.__module__ = __name__ + + +_DEPRECATED_NAMES = { + "dups": copies, + "dupe": copy, + "deepdups": reconstructs, + "deepdupe": deepcopy, + "factories": builders, +} + + +def __getattr__(name: str) -> Any: + if name in _DEPRECATED_NAMES: + new_obj = _DEPRECATED_NAMES[name] + warnings.warn( + f"{__name__}.{name} is deprecated, use {__name__}.{new_obj.__name__} instead.", # type: ignore[attr-defined] + stacklevel=1, + ) + return new_obj + raise AttributeError(f"Module {__name__!r} doesn't have attributee {name}.") diff --git a/duper/_compile_support.py b/duper/_compile_support.py new file mode 100644 index 0000000..e11aec0 --- /dev/null +++ b/duper/_compile_support.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +import os +import shutil +import sys +import warnings +from pathlib 
import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import types + + +def check_compiled(*, recheck: bool = False) -> bool: + """Check if the package is running from compiled modules.""" + test_module: types.ModuleType + + if "duper._compile_test" in sys.modules: + import duper._compile_test_2 + + test_module = duper._compile_test_2 # noqa: SLF001 + elif "duper._compile_test_2" not in sys.modules: + import duper._compile_test + + test_module = duper._compile_test # noqa: SLF001 + else: + raise RuntimeError(f"{check_compiled.__name__} allowed be called only once.") + + compiled_test_module_file = test_module.__file__ or "" + compiled = compiled_test_module_file.endswith(".so") + + should_uncompile = os.getenv("DUPER_NO_COMPILED") + if should_uncompile and not compiled: + return False + if not should_uncompile and not compiled: + if not recheck and recompile(Path(compiled_test_module_file).parent): + return check_compiled(recheck=True) + if not os.getenv("DUPER_NO_COMPILED_WARNING"): + warnings.warn( + "duper is not compiled, duper.deepcopy won't be much faster " + "while duper.deepcopies will be significantly slower on compilation phase." 
+ " Reconstruction will still be much faster.", + stacklevel=4, + ) + return False + if not should_uncompile and compiled: + return True + + uncompile(Path(compiled_test_module_file).parent) + + return check_compiled(recheck=True) + + +def recompile(package_path: Path) -> bool: + moved_extensions_file = package_path / ".moved_extensions" + try: + moved: list[str] = moved_extensions_file.read_text().splitlines() + except FileNotFoundError: + return False + + for file in moved: + shutil.move(file, file.replace(".so.bak", ".so")) + moved_extensions_file.unlink() + return True + + +def uncompile(package_path: Path) -> None: + file: Path + moved: list[str] = [] + for file in package_path.rglob("*.so"): + moved.append(destination := str(file.parent / (file.name + ".bak"))) + shutil.move(file, destination) + (package_path / ".moved_extensions").write_text("\n".join(moved)) + + +COMPILED = check_compiled() diff --git a/duper/_compile_test.py b/duper/_compile_test.py new file mode 100644 index 0000000..6f01b8c --- /dev/null +++ b/duper/_compile_test.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +"""Dummy module""" diff --git a/duper/_compile_test_2.py b/duper/_compile_test_2.py new file mode 100644 index 0000000..6f01b8c --- /dev/null +++ b/duper/_compile_test_2.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +"""Dummy module""" diff --git a/duper/_empty_module.py b/duper/_empty_module.py new file mode 100644 index 0000000..d2466dd --- /dev/null +++ b/duper/_empty_module.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 diff --git a/duper/_msg.py b/duper/_msg.py index a5dfe5d..8684cb4 100644 --- a/duper/_msg.py +++ b/duper/_msg.py @@ -1,15 +1,14 @@ -# SPDX-FileCopyrightText: 2023 Bobronium +# SPDX-FileCopyrightText: 2023-present Arseny Boykov 
(Bobronium) # # SPDX-License-Identifier: MPL-2.0 import ast from typing import Any - _builtin_repr = repr -def repr(obj: Any) -> str: +def repr(obj: Any) -> str: # noqa: A001 """ >>> repr(1) '1' @@ -19,7 +18,7 @@ def repr(obj: Any) -> str: '(1, 2, 3)' >>> repr((1, 2, 3, object())) 'tuple(...)' - >>> import duper; repr(duper.ast_factory) + >>> import duper; repr(duper.build_reconstructor) 'duper.ast_factory' """ @@ -38,7 +37,7 @@ def repr(obj: Any) -> str: obj = type(obj) suffix = "(...)" - if name in duper.__dict__.keys(): + if name in duper.__dict__: # type: ignore[attr-defined] module = "duper." else: module = obj.__module__ + "." if obj.__module__ != "builtins" else "" diff --git a/duper/builders/__init__.py b/duper/builders/__init__.py new file mode 100644 index 0000000..5cd9f8a --- /dev/null +++ b/duper/builders/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from duper.builders.ast import build_reconstructor + +__all__ = ["build_reconstructor"] diff --git a/duper/builders/_helpers.py b/duper/builders/_helpers.py new file mode 100644 index 0000000..e6658bf --- /dev/null +++ b/duper/builders/_helpers.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from typing import Any +from typing import NoReturn +from typing import TypeVar + +from duper.builders._tracking import ObjectInfo + +T = TypeVar("T") + +OT = TypeVar("OT", bound=ObjectInfo) +_sentinel = object() + + +class SentinelMemo(dict[int, Any]): + def __init__( + self, + vid: int, + seen: dict[int, OT], + info_cls: type[OT], + ) -> None: + super().__init__() + self._vid = vid + self._seen = seen + self._info_cls = info_cls + + def _validate(self, key: int, value: Any = _sentinel) -> None: + existing = self._seen.get(key, None) + if existing is None: + if value is not _sentinel: + self._seen[key] = self._info_cls( + key, value, type(value), None, 
exclusive_for=self._vid + ) + return + + if self._vid not in (key, existing.exclusive_for): + # reusing these is not impossible, but tricky to implement + # it would require maintaining memo name that would collect + # all objects reconstructed via + # custom __deepcopy__ methods, as well as references to objects reconstructed outside of + # custom __deepcopy__, but later referenced there + raise NotImplementedError( + "Types with custom __deepcopy__ referencing previously " + "reconstructed objects are not supported yet. See the comment above." + ) + + def get(self, key: int, default: Any = None) -> Any: + self._validate(key) + return super().get(key, default) + + def __setitem__(self, key: int, value: Any) -> None: + self._validate(key, value) + return super().__setitem__(key, value) + + def __getitem__(self, key: int) -> Any: + self._validate(key) + return super().__getitem__(key) + + def __contains__(self, key: int) -> bool: # type: ignore[override] + self._validate(key) + return super().__contains__(key) + + +def forbid_references(value: Any) -> NoReturn: + raise NotImplementedError( + f"{value} was reconstructed by custom __deepcopy__ and then referenced in other object." + " This is not supported yet." + ) diff --git a/duper/builders/_source.py b/duper/builders/_source.py new file mode 100644 index 0000000..f0cc19b --- /dev/null +++ b/duper/builders/_source.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +import ast +import linecache +import threading +import weakref +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + from types import FunctionType + + +class LazySourceLines: + """ + A list-like, lazily materialized sequence of newline-terminated source lines. + Implements enough of the sequence protocol to satisfy inspect/linecache callers. 
+ """ + + __slots__ = ("_filename", "_fn_ref", "_lines", "_lock") + + def __init__(self, fn: FunctionType, filename: str) -> None: + self._fn_ref = weakref.ref(fn) + self._filename = filename + self._lock = threading.Lock() + self._lines: list[str] | None = None + + def _materialize(self) -> list[str]: + if self._lines is not None: + return self._lines + with self._lock: + if self._lines is None: + fn = self._fn_ref() + if fn is None: + self._lines = [] + linecache.cache.pop(self._filename) + else: + text = ast.unparse(fn.__ast__) # type: ignore[attr-defined] + self._lines = text.splitlines(keepends=True) + linecache.cache[self._filename] = (0, None, self._lines, self._filename) + return self._lines + + # -- list-like API -- + def __len__(self) -> int: + return len(self._materialize()) + + def __getitem__(self, i: int | slice) -> str | list[str]: + data = self._materialize() + return data[i] # slice returns list[str], int returns str + + def __iter__(self) -> Iterator[str]: + return iter(self._materialize()) + + def __repr__(self) -> str: + state = "materialized" if self._lines is not None else "lazy" + return ( + f"" + ) diff --git a/duper/builders/_tracking.py b/duper/builders/_tracking.py new file mode 100644 index 0000000..45ff13f --- /dev/null +++ b/duper/builders/_tracking.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +""" +Unified object tracking system for both AST and bytecode factories. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any + +if TYPE_CHECKING: + from duper.fastast import AST + + +class ObjectInfo: + """Base class for tracking object reconstruction state. + + Contains common fields that both factories need, keyed by object ID. + Replaces multiple vid-keyed dictionaries with a single lookup. 
+ """ + + def __init__( + self, + vid: int, + value: Any, + cls: type[Any], + external_value: Any = None, + exclusive_for: int = 0, + ) -> None: + self.vid = vid + self.value = value + self.cls = cls + self.name: str | None = None + self.is_reconstructing = False + self.exclusive_for = exclusive_for + self.external_value = external_value # from memo/keep_by_reference + + +class ASTObjectInfo(ObjectInfo): + """AST factory-specific object tracking.""" + + def __init__( + self, + vid: int, + value: Any, + cls: type[Any], + external_value: Any = None, + exclusive_for: int = 0, + ) -> None: + super().__init__(vid, value, cls, external_value, exclusive_for) + self.expression: AST | None = None + self.reflective_refs: list[AST] | None = None + + +class BytecodeObjectInfo(ObjectInfo): + """Bytecode factory-specific object tracking.""" + + def __init__( + self, + vid: int, + value: Any, + cls: type[Any], + external_value: Any = None, + exclusive_for: int = 0, + ) -> None: + super().__init__(vid, value, cls, external_value, exclusive_for) + self.var_name: str | None = None # stored_in_vars + self.first_construction_position: int | None = None + self.reflective_refs: list[str] | None = None diff --git a/duper/builders/ast.py b/duper/builders/ast.py new file mode 100644 index 0000000..408ff81 --- /dev/null +++ b/duper/builders/ast.py @@ -0,0 +1,1012 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +""" +Construct AST that creates a deep copy of a given object +""" + +from __future__ import annotations + +import ast +import builtins +import importlib +import linecache +import pickle +import sys +import types +import uuid +from enum import Enum +from types import FunctionType +from typing import TYPE_CHECKING +from typing import Any +from typing import Final +from typing import TypeVar +from typing import cast + +from duper import _empty_module +from duper.builders._helpers import SentinelMemo +from 
# Reconstructed header and first part of duper/builders/ast.py
# (constants plus ASTBuilder.__init__ .. _emit_parent_attribute).

"""
Construct AST that creates a deep copy of a given object
"""

import ast
import builtins
import importlib
import linecache
import pickle
import sys
import types
import uuid
from enum import Enum
from types import FunctionType
from typing import TYPE_CHECKING
from typing import Any
from typing import Final
from typing import TypeVar
from typing import cast

from duper import _empty_module
from duper.builders._helpers import SentinelMemo
from duper.builders._helpers import forbid_references
from duper.builders._source import LazySourceLines
from duper.builders._tracking import ASTObjectInfo
from duper.builders.runtime import debunk_reduce
from duper.builders.runtime import get_reduce
from duper.constants import CODE_ATTRIBUTES
from duper.constants import CONST_TYPES
from duper.constants import IMMORTAL_OBJECTS
from duper.constants import IMMUTABLE_CONSTANTS_OR_IMPORTABLE
from duper.constants import IMMUTABLE_NON_COLLECTIONS
from duper.constants import ImmutableCollectionType
from duper.constants import ImmutableType
from duper.fastast import AST
from duper.fastast import LOAD
from duper.fastast import STORE
from duper.fastast import A
from duper.fastast import Assign
from duper.fastast import Attribute
from duper.fastast import Call
from duper.fastast import Constant
from duper.fastast import Dict
from duper.fastast import FunctionDef
from duper.fastast import Import
from duper.fastast import ImportFrom
from duper.fastast import List
from duper.fastast import Module
from duper.fastast import Name
from duper.fastast import NamedExpr
from duper.fastast import Return
from duper.fastast import Set
from duper.fastast import Subscript
from duper.fastast import Tuple
from duper.fastast import alias
from duper.fastast import arg
from duper.fastast import arguments
from duper.fastast import expr
from duper.fastast import keyword
from duper.fastast import stmt

if TYPE_CHECKING:
    from collections.abc import Callable
    from collections.abc import Iterable

T = TypeVar("T")
E = TypeVar("E", bound=expr)

Undefined: Final = NamedExpr(Name("UNDEFINED"), Constant(1))
DEFAULT_NAMESPACE = {**builtins.__dict__, **_empty_module.__dict__}
# Pre-built nodes for objects that live forever, keyed by id().
IMMORTAL_CONSTANTS: Final = {id(obj): Constant(obj) for obj in IMMORTAL_OBJECTS}
BUILTIN_NAMES: Final = {
    id(obj): Name(name) for name, obj in builtins.__dict__.items() if not name.startswith("_")
}
CACHE: Final[dict[int, Name | Constant[ImmutableType | ImmutableCollectionType]]] = {
    **BUILTIN_NAMES,
    **IMMORTAL_CONSTANTS,
}
EMPTY_TUPLE_CONSTANT = Constant(())


def __loader__() -> None:  # noqa: N807
    """Special method to tell inspect that this file has special logic for loading code"""
    raise NotImplementedError


class ASTBuilder:
    """Builds an AST expression that reconstructs an arbitrary object graph."""

    def __init__(
        self,
        memo: dict[int, Any] | None,
        *,
        ignore_deepcopy_method: bool,
        interprocess: bool,
    ) -> None:
        self.objects: dict[int, ASTObjectInfo] = {}
        self.keep_by_reference: dict[int, Any] | None = memo
        self.ignore_deepcopy_method = ignore_deepcopy_method
        self.interprocess = interprocess

        # Objects that must exist in the generated function's namespace.
        self.names: dict[str, Any] = {}
        self.used_names: set[str] = {*builtins.__dict__, "None", "True", "False"}
        self.cell_helper_required: bool = False

    def get_or_create_object_info(self, value: Any) -> ASTObjectInfo:
        """Single lookup/creation point. Returns ObjectInfo for direct manipulation."""
        vid = id(value)
        obj_info = self.objects.get(vid)
        if obj_info is None:
            external_value = None
            if self.keep_by_reference is not None:
                external_value = self.keep_by_reference.get(vid)
            obj_info = ASTObjectInfo(vid, value, type(value), external_value)
            self.objects[vid] = obj_info
        return obj_info

    def check_object_state(self, obj_info: ASTObjectInfo) -> Name | None:
        """Check if object can be referenced directly or needs reconstruction.

        Returns a Name node when the object is already available, or None when
        the caller should build it (the object is then marked as in-progress).
        """
        if obj_info.exclusive_for:
            forbid_references(obj_info.value)

        # Already fully reconstructed - return reference
        if obj_info.expression is not None:
            if isinstance(obj_info.expression, Name):
                return obj_info.expression
            name = self.get_name(obj_info.value)
            self.store_expression(obj_info.expression, name)
            return Name(name)

        # External reference from memo
        if obj_info.external_value is not None:
            return self.store(obj_info.external_value)

        # Currently being reconstructed - this is a self-reference
        if obj_info.is_reconstructing:
            if obj_info.reflective_refs is None:
                obj_info.reflective_refs = []
            reference = Name(self.get_name(obj_info.value))
            obj_info.reflective_refs.append(reference)
            return reference

        # Mark as being reconstructed
        obj_info.is_reconstructing = True
        return None

    def finalize_object(self, obj_info: ASTObjectInfo, expression: A) -> A:
        """Mark object as reconstructed and store result."""
        obj_info.is_reconstructing = False
        obj_info.expression = expression
        return expression

    def store_expression(self, expression: AST, name: str) -> None:
        """Attach a result name to an already-built expression node."""
        if isinstance(expression, NamedExpr):
            return
        if expression.parent is None:
            expression.name = name
        else:
            expression.store_result(name)

    def store(self, x: T) -> Name:
        """Stores object as is to be available in namespace"""
        name = self.get_name(x)
        name, *_ = name.split("[")
        self.names[name] = x
        return Name(id=name)

    def get_name(self, value: Any) -> str:
        """Assigns names and resolves collisions"""
        obj_info = self.objects.get(id(value))
        if obj_info is not None and obj_info.name is not None:
            return obj_info.name

        if (name := getattr(value, "__qualname__", None)) is None and (
            name := getattr(value, "__name__", None)
        ) is None:
            name = type(value).__name__.lower()

        # FIXME: probably better to just store via locals().__setitem__(name, value)...
        # this is done just for sake of fast fix
        if not name.isidentifier():
            *_, name = name.split(".")

        if not name.isidentifier():
            name = name.strip("<>") + "_1"

        i = 1
        original_name = name
        while name in self.used_names and self.names.get(name) is not value:
            name = f"{original_name}_{i}"
            i += 1

        # Store name on object if it exists
        if obj_info is not None:
            obj_info.name = name
        self.used_names.add(name)
        return name

    def store_interprocess(self, value: Any) -> Name | Attribute | Subscript | Call:
        """
        Produce an interprocess-safe AST reference to `value`.
        Semantics mirror pickle's global resolution:
        - decide module with which_module-like logic,
        - import module, walk dotted path, verify identity,
        - emit either a global name (top-level) or parent.attribute chain,
        - for locals/lambdas use build_function_local,
        - for enums: Class.Member after verifying the class path.
        Raises pickle.PicklingError on 'not found' or 'not the same object' cases.
        """
        if isinstance(value, Enum):
            self._verify_same_object(value, (cls := type(value)).__name__ + f".{value._name_}")
            class_name = self.get_name(cls)
            self.names[class_name] = cls
            return Attribute(value=Name(class_name), attr=value._name_)

        if value is types.CodeType:
            self.names["types"] = types
            return Attribute(value=Name("types"), attr="CodeType")
        if value is types.MethodType:
            self.names["types"] = types
            return Attribute(value=Name("types"), attr="MethodType")

        if value is types.NoneType:
            self.names["types"] = types
            return Attribute(value=Name("types"), attr="NoneType")

        qualified_name = getattr(value, "__qualname__", None) or getattr(value, "__name__", None)
        if not qualified_name:
            # Fall back to direct module dict scan; if that fails, raise "not found"
            module_name = getattr(value, "__module__", None) or "__main__"
            raise pickle.PicklingError(
                f"Can't pickle {value!r}: it's not found as {module_name}.{type(value).__name__}"
            )

        # Locals / lambdas: your implementation supports reconstructing them
        # FIX: the two literals were stripped to "" in the patch, which made
        # this condition vacuously true for every function ("" is a substring
        # of any string); restored the "<locals>"/"<lambda>" qualname markers.
        if isinstance(value, types.FunctionType) and (
            "<locals>" in qualified_name or "<lambda>" in qualified_name
        ):
            return self.build_function_local(value)

        # Decide module name (fast path via __module__, fallback scan like whichmodule)
        module_name = self._which_module(value, qualified_name)

        # Import and resolve dotted path against that module
        try:
            resolved, parent = self._get_attribute_from_module(module_name, qualified_name)
        except (ImportError, KeyError, AttributeError):
            raise pickle.PicklingError(
                f"Can't pickle {value!r}: it's not found as {module_name}.{qualified_name}"
            ) from None

        # Must be exactly the same object
        if resolved is not value:
            raise pickle.PicklingError(
                f"Can't pickle {value!r}: it's not "
                f"the same object as {module_name}.{qualified_name}"
            )

        # Emit: top-level name -> import-based global; nested -> parent.attribute
        if parent is sys.modules.get(module_name):
            # Top-level symbol; build_imports will add ImportFrom(module_name, name)
            return self.store(value)
        # Nested attribute chain: reconstruct parent expression, then leaf attribute
        leaf = qualified_name.rpartition(".")[2]
        return self._emit_parent_attribute(parent, leaf, value)

    def _emit_parent_attribute(self, parent: Any, attribute: str, child: Any) -> Attribute | Call:
        """
        Ensure `parent` is globally identical and that getattr(parent, attribute) is `child`,
        then emit Attribute(parent_expr, attribute).
        """
        self._verify_same_object(parent)
        if not attribute or getattr(parent, attribute, _nil) is not child:
            parent_module = getattr(parent, "__module__", None) or ""
            parent_qualified = getattr(parent, "__qualname__", None) or getattr(
                parent, "__name__", ""
            )
            raise pickle.PicklingError(
                f"Can't pickle {child!r}: it's not "
                f"the same object as {parent_module}.{parent_qualified}.{attribute or ''}"
            )
        if attribute.isidentifier():
            return Attribute(value=self.build_expression(parent), attr=attribute)

        # Attribute name is not a valid identifier: emit getattr(parent, "name").
        return Call(
            Name("getattr"), args=[self.build_expression(parent), Constant(attribute)], keywords=[]
        )
+ """ + module = sys.modules.get(module_name) + if module is None: + module = importlib.import_module(module_name) + + top = module + obj: Any = module + parent: Any = module + + for subpath in qualified_name.split("."): + if subpath == "": + msg = f"Can't get local attribute {qualified_name!r} on {top!r}" + raise AttributeError(msg) + try: + parent = obj + obj = getattr(obj, subpath) + except AttributeError: + msg = f"Can't get attribute {qualified_name!r} on {top!r}" + raise AttributeError(msg) from None + return obj, parent + + def _which_module(self, value: Any, qualified_name: str) -> str: + """ + Pickle-like whichmodule: + - Use value.__module__ if present. + - Else scan a stable copy of sys.modules and return the first module where + resolving qualified_name yields `value`. + - If nothing matches, return '__main__' (pickle's sentinel). + """ + module_name: str | None = getattr(value, "__module__", None) + if module_name is not None: + return module_name + + for name, module in sys.modules.copy().items(): + if name in ("__main__", "__mp_main__") or module is None: + continue + try: + resolved, _ = self._get_attribute_from_module(name, qualified_name) + except Exception: # noqa: S112, BLE001 + continue + if resolved is value: + return name + return "__main__" + + def _verify_same_object(self, value: Any, qualified_name: str | None = None) -> tuple[str, str]: + """ + Verify that sys.modules[module].qualified_name is exactly `value`. + Returns (module, qualified_name) or raises pickle.PicklingError. 
+ """ + module_name = getattr(value, "__module__", None) + qualified_name = ( + qualified_name + or getattr(value, "__qualname__", None) + or getattr(value, "__name__", None) + ) + if not module_name or not qualified_name: + raise pickle.PicklingError( + f"Can't pickle {value!r}: attribute lookup " + f"{module_name or ''}.{qualified_name or ''} failed" + ) + + try: + resolved, _ = self._get_attribute_from_module(module_name, qualified_name) + except Exception: # noqa: BLE001 + raise pickle.PicklingError( + f"Can't pickle {value!r}: it's not found as {module_name}.{qualified_name}" + ) from None + + if resolved is not value: + raise pickle.PicklingError( + f"Can't pickle {value!r}: " + f"it's not the same object as {module_name}.{qualified_name}" + ) + return module_name, qualified_name + + def build_make_cell_def_ast(self) -> stmt: + """ + def make_cell(value): + def _capture(v): + def _inner(): + return v + return _inner + return _capture(value).__closure__[0] + """ + + return FunctionDef( + name="make_cell", + args=arguments( + posonlyargs=[], + args=[arg(arg="value")], + vararg=None, + kwonlyargs=[], + kw_defaults=[], + kwarg=None, + defaults=[], + ), + body=[ + FunctionDef( + name="_capture", + args=arguments( + posonlyargs=[], + args=[arg(arg="v")], + vararg=None, + kwonlyargs=[], + kw_defaults=[], + kwarg=None, + defaults=[], + ), + body=[ + FunctionDef( + name="_inner", + args=arguments( + posonlyargs=[], + args=[], + vararg=None, + kwonlyargs=[], + kw_defaults=[], + kwarg=None, + defaults=[], + ), + body=[Return(value=Name(id="v"))], + ), + Return(value=Name(id="_inner")), + ], + ), + Return( + value=Subscript( + value=Attribute( + value=Call( + func=Name(id="_capture"), + args=[Name(id="value")], + keywords=[], + ), + attr="__closure__", + ctx=LOAD, + ), + slice=Constant(0), + ctx=LOAD, + ) + ), + ], + ) + + def build_function_local(self, fn: types.FunctionType) -> Subscript: + # We need `types.FunctionType` + self.names["types"] = types # triggers `import 
types` in repr/unparse + + # Build args for FunctionType(code, globals, name, defaults, closure) + code_e = self.build_expression(fn.__code__) + globals_e = Attribute(self.store_interprocess(builtins), "__dict__") + name_e = Constant(fn.__name__) + defaults_e = ( + Constant(None) if fn.__defaults__ is None else self.build_expression(fn.__defaults__) + ) + + if fn.__closure__ is None: + closure_e: expr = Constant(None) + else: + self.cell_helper_required = True + closure_e = Tuple( + [ + Call( + func=Name("make_cell"), + args=[self.build_expression(c.cell_contents)], + keywords=[], + ) + for c in fn.__closure__ + ] + ) + + ctor = Attribute(value=Name("types"), attr="FunctionType", ctx=LOAD) + new_name = self.get_name(fn) + + steps: list[expr] = [ + assign_name( + new_name, + Call( + func=ctor, args=[code_e, globals_e, name_e, defaults_e, closure_e], keywords=[] + ), + ) + ] + + # Set important slots via setattr (not __dict__): __kwdefaults__, __annotations__, __doc__ + slot_state: dict[Any, Any] = {} + if fn.__kwdefaults__: + slot_state["__kwdefaults__"] = fn.__kwdefaults__ + if fn.__annotations__: + slot_state["__annotations__"] = fn.__annotations__ + if fn.__doc__ is not None: + slot_state["__doc__"] = fn.__doc__ + + if slot_state: + steps.append(self.build_slot_setattrs(Name(new_name, ctx=LOAD), slot_state)) + + return Subscript(Tuple(steps), Constant(0), ctx=LOAD) + + def build_setstate_call(self, new_instance: Name, state: Any) -> Call: + return Call( + func=Attribute(value=new_instance, attr="__setstate__"), + args=[self.build_expression(state)], + keywords=[], + ) + + def build_slot_setattrs(self, new_instance: Name, slot_state: dict[Any, Any]) -> Tuple: + return Tuple( + [ + Call( + func=Name("setattr"), + args=[ + new_instance, + Constant(k), + self.build_expression(v), + ], + keywords=[], + ) + for k, v in slot_state.items() + ] + ) + + def build_dict_update(self, new_instance: Name, dict_state: dict[str, Any]) -> Call: + return Call( + func=Attribute( + 
value=Attribute(value=new_instance, attr="__dict__"), + attr="update", + ctx=LOAD, + ), + args=[self.build_expression(dict_state)], + keywords=[], + ) + + def build_list_appends(self, new_instance: Name, listiter: Iterable[Any]) -> Tuple: + return Tuple( + [ + Call( + func=Attribute(value=new_instance, attr="append"), + args=[self.build_expression(item)], + keywords=[], + ) + for item in listiter + ] + ) + + def build_dict_setitems(self, new_instance: Name, dictiter: Iterable[tuple[Any, Any]]) -> Tuple: + return Tuple( + [ + Call( + func=Attribute(value=new_instance, attr="__setitem__"), + args=[self.build_expression(k), self.build_expression(v)], + keywords=[], + ) + for k, v in dictiter + ] + ) + + def build_from_reduce( + self, + x: Any, + obj_info: ASTObjectInfo, + func: Callable[..., Any], + args: Any, + kwargs: Any, + state: Any = None, + listiter: Iterable[Any] | None = None, + dictiter: Iterable[tuple[Any, Any]] | None = None, + ) -> Subscript | Call: + if state is None and listiter is None and dictiter is None: + return Call( + func=self.build_expression(cast("FunctionType", func)), + args=[self.build_expression(item) for item in args], + keywords=[ + keyword( + arg=name, + value=self.build_expression(item), + ) + for name, item in kwargs.items() + ], + ) + + name = self.get_name(x) + new_instance = Name(id=name) + new_instance_assign = assign_name( + name, + Call( + func=self.build_expression(cast("FunctionType", func)), + args=[self.build_expression(arg) for arg in args], + keywords=[ + keyword(arg=k, value=self.build_expression(v)) for k, v in kwargs.items() + ], + ), + ) + + steps: list[expr] = [new_instance_assign] + + if state is not None: + if hasattr(obj_info.value, "__setstate__"): + steps.append(self.build_setstate_call(new_instance, state)) + else: + if isinstance(state, tuple) and len(state) == 2: + dict_state, slot_state = state + if slot_state is not None: + steps.append(self.build_slot_setattrs(new_instance, slot_state)) + else: + dict_state 
    def build_from_reduce(
        self,
        x: Any,
        obj_info: ASTObjectInfo,
        func: Callable[..., Any],
        args: Any,
        kwargs: Any,
        state: Any = None,
        listiter: Iterable[Any] | None = None,
        dictiter: Iterable[tuple[Any, Any]] | None = None,
    ) -> Subscript | Call:
        """Build AST from a (debunked) __reduce__ result for `x`.

        With no state/items the object is a single constructor call; otherwise
        a `(assign, step, ...)[0]` tuple-subscript applies state, list items
        and dict items to the freshly constructed instance in reduce order.
        """
        if state is None and listiter is None and dictiter is None:
            # Simple case: just `func(*args, **kwargs)`.
            return Call(
                func=self.build_expression(cast("FunctionType", func)),
                args=[self.build_expression(item) for item in args],
                keywords=[
                    keyword(
                        arg=name,
                        value=self.build_expression(item),
                    )
                    for name, item in kwargs.items()
                ],
            )

        name = self.get_name(x)
        new_instance = Name(id=name)
        # `name := func(*args, **kwargs)` — later steps refer to it by name.
        new_instance_assign = assign_name(
            name,
            Call(
                func=self.build_expression(cast("FunctionType", func)),
                # NOTE(review): `arg` here shadows the imported fastast `arg`
                # node class for the duration of this comprehension.
                args=[self.build_expression(arg) for arg in args],
                keywords=[
                    keyword(arg=k, value=self.build_expression(v)) for k, v in kwargs.items()
                ],
            ),
        )

        steps: list[expr] = [new_instance_assign]

        if state is not None:
            if hasattr(obj_info.value, "__setstate__"):
                # Object handles its own state restoration.
                steps.append(self.build_setstate_call(new_instance, state))
            else:
                # Pickle protocol: state may be (dict_state, slot_state).
                if isinstance(state, tuple) and len(state) == 2:
                    dict_state, slot_state = state
                    if slot_state is not None:
                        steps.append(self.build_slot_setattrs(new_instance, slot_state))
                else:
                    dict_state = state

                if dict_state is not None:
                    steps.append(self.build_dict_update(new_instance, dict_state))

        if listiter is not None:
            steps.append(self.build_list_appends(new_instance, listiter))

        if dictiter is not None:
            steps.append(self.build_dict_setitems(new_instance, dictiter))

        # `(step0, step1, ...)[0]` evaluates all steps, yields the instance.
        return Subscript(Tuple(steps), Constant(0))

    def build_immutable(
        self, x: ImmutableType | ImmutableCollectionType, *, not_const: bool = False
    ) -> Name | Constant[ImmutableType | ImmutableCollectionType] | Attribute | Subscript | Call:
        """Return a node for an immutable object: cached, constant, or stored ref."""
        # can't look up by value since {True: True}[1] will return True
        # could store tuple[type[T], T], but considering that these objects should be immortal
        # this should work just fine
        if (cached := CACHE.get(id(x), _nil)) is not _nil:
            return cached  # type: ignore[return-value]

        if not_const or type(x) not in CONST_TYPES:
            # can't use Constant with types in ast (which makes sense, there's no literals for them)
            # it's possible to substitute LOAD_GLOBAL with LOAD_CONST later in the bytecode,
            # but it's quite slow (with libs from PyPi), and doesn't give a big performance uplift
            # later, so I'm ignoring this for now
            return self.store_interprocess(x) if self.interprocess else self.store(x)

        return Constant(value=x)

    def build_list(self, x: list[Any], obj_info: ASTObjectInfo) -> List | Subscript:
        """List literal, or `(name := []).extend(...)` form for self-referential lists."""
        elts = [self.build_expression(i) for i in x]

        if obj_info.reflective_refs is None:
            return List(elts)

        # Self-referential: bind an empty list first, then extend it so inner
        # references to `name` resolve.
        return Subscript(
            value=Tuple(
                [
                    assign_name(self.get_name(x), List([])),
                    Call(
                        func=Attribute(value=Name(id=self.get_name(x)), attr="extend"),
                        args=[Tuple(elts)],
                        keywords=[],
                    ),
                ],
            ),
            slice=Constant(value=0),
        )

    def build_set(self, x: set[Any], obj_info: ASTObjectInfo) -> Set | Subscript:
        """Set literal, or `(name := set()).update(...)` form for self-referential sets."""
        elts = [self.build_expression(i) for i in x]

        if obj_info.reflective_refs is None:
            return Set(elts)

        return Subscript(
            value=Tuple(
                elts=[
                    assign_name(self.get_name(x), Set([])),
                    Call(
                        func=Attribute(value=Name(id=self.get_name(x)), attr="update"),
                        args=[Tuple(elts)],
                        keywords=[],
                    ),
                ],
            ),
            slice=Constant(value=0),
        )

    def build_dict(self, x: dict[Any, Any], obj_info: ASTObjectInfo) -> Dict | Subscript:
        """Dict literal, or `(name := {}).update(zip(keys, values))` for self-references."""
        keys = [self.build_expression(i) for i in x]
        values = [self.build_expression(i) for i in x.values()]

        if obj_info.reflective_refs is None:
            return Dict(keys=keys, values=values)

        name = self.get_name(x)
        return Subscript(
            value=Tuple(
                elts=[
                    assign_name(name, Dict(keys=[], values=[])),
                    Call(
                        func=Attribute(value=Name(id=name), attr="update"),
                        args=[
                            Call(
                                func=Name(id="zip"),
                                args=[Tuple(keys), Tuple(values)],
                                keywords=[],
                            )
                        ],
                        keywords=[],
                    ),
                ],
            ),
            slice=Constant(value=0),
        )
    def build_reference_injection(self, name: str, reference: AST, value: AST) -> expr:
        """
        Emit AST that injects `value` into the `reference` location.
        Supports:
        - list/tuple elts
        - dict keys or values
        - attribute assignment (via setattr)
        """
        parent = reference.parent
        container = reference.container
        location = reference.location

        if isinstance(container, list) and isinstance(parent, List | Tuple):
            # parent is, e.g. List(elts=[...]); patch by index.
            return Call(
                func=Attribute(Name(name), "__setitem__"),
                args=[
                    Constant(location),  # index
                    value,
                ],
                keywords=[],
            )

        if isinstance(parent, Dict):
            # The reference may sit on the key side or the value side.
            if container is parent.keys:
                key = value
                val: AST = parent.values[cast("int", location)]
            else:
                key = parent.keys[cast("int", location)]
                val = value

            return Call(
                func=Attribute(Name(name), "__setitem__"),
                args=[
                    key,
                    val,
                ],
                keywords=[],
            )

        if isinstance(parent, Attribute):
            # setattr(target, "attr", value)
            return Call(
                func=self.build_immutable(cast("ImmutableType", setattr)),
                args=[
                    Name(name),
                    Constant(parent.attr),
                    value,
                ],
                keywords=[],
            )

        raise NotImplementedError(
            f"Injection for reference inside {type(parent)} is not implemented yet"
        )

    def build_tuple(self, x: tuple[Any, ...] | frozenset[Any], obj_info: ASTObjectInfo) -> expr:
        """Build a tuple/frozenset node; self-referential tuples get patched in later.

        While building elements, tracks (via walrus side effects) whether every
        element is constant/stored so the whole tuple can become a constant.
        """
        immutable = True
        not_const = False
        values = [
            expression
            for i in x
            if (  # this is a tuple of two expressions, always results in True
                # constructing list with list comprehension should be faster
                # than appending values, but we need to check each one individually as well,
                # so we're doing it with Lennon's (or Paul's?) operator.
                (expression := self.build_expression(i)),
                (
                    immutable := immutable
                    and (
                        isinstance(expression, Constant)
                        or (
                            not_const := isinstance(expression, Name)
                            and self.names.get(expression.id) is i
                        )  # True for types and functions
                    )
                ),
            )
        ]

        if immutable and not self.interprocess:
            return self.build_immutable(x, not_const=not_const)

        if obj_info.reflective_refs is None:
            return Tuple(values)

        # Tuples are immutable, so self-references cannot be fixed by mutation:
        # build the tuple, then inject it into every place that referenced it.
        tuple_expr = Tuple(values)
        injections = []
        tuple_name = self.get_name(x)
        for name_reference_to_this_tuple in obj_info.reflective_refs:
            expression_that_referenced_this_tuple = name_reference_to_this_tuple.parent
            if expression_that_referenced_this_tuple is None:
                raise RuntimeError(
                    "Tuple was referenced by one of its elements, "
                    "but parent for this element wasn't set."
                    "Parent is required to store reference to this element in order "
                    "to inject tuple after its construction."
                )

            # Find the object that contains this expression
            expression_obj_info = None
            for other_obj_info in self.objects.values():
                if other_obj_info.expression is expression_that_referenced_this_tuple:
                    expression_obj_info = other_obj_info
                    break

            if expression_obj_info is None:
                raise RuntimeError("Could not find object info for referencing expression")

            name = self.get_name(expression_obj_info.value)
            self.store_expression(expression_that_referenced_this_tuple, name)

            # replace premature name reference to the tuple with None
            name_reference_to_this_tuple.container[name_reference_to_this_tuple.location] = (  # type: ignore[index]
                Constant(None)
            )

            injections.append(
                self.build_reference_injection(name, name_reference_to_this_tuple, Name(tuple_name))
            )

        return Subscript(
            value=Tuple([NamedExpr(Name(tuple_name, ctx=STORE), tuple_expr), *injections]),
            slice=Constant(0),
            ctx=LOAD,
        )

    def build_method_deep(self, x: types.MethodType, obj_info: ASTObjectInfo) -> Call:
        """Rebuild a bound method as MethodType(func, deep-copied self)."""
        return Call(
            func=self.build_immutable(obj_info.cls),
            args=[
                self.build_immutable(cast("ImmutableType", x.__func__)),
                self.build_expression(x.__self__),
            ],
            keywords=[],
        )

    def build_expression(self, x: Any) -> expr:
        """Dispatch: build the AST expression that reconstructs `x`."""
        cls = type(x)
        if (
            (
                cls
                in (
                    IMMUTABLE_NON_COLLECTIONS
                    if not self.interprocess
                    else IMMUTABLE_CONSTANTS_OR_IMPORTABLE
                )
            )
            or issubclass(cls, type)
            or (self.interprocess and issubclass(cls, Enum))
        ):
            return self.build_immutable(x)

        if cls is tuple and not x:  # avoid memorizing it
            return EMPTY_TUPLE_CONSTANT

        obj_info = self.get_or_create_object_info(x)
        result: expr

        existing = self.check_object_state(obj_info)
        if existing is not None:
            return existing

        # Dispatch based on type - all functions now take ObjectInfo
        if cls is dict:
            result = self.build_dict(x, obj_info)
        elif cls is list:
            result = self.build_list(x, obj_info)
        elif cls is set:
            result = self.build_set(x, obj_info)
        elif cls is tuple or cls is frozenset:
            result = self.build_tuple(x, obj_info)
            if cls is frozenset and isinstance(result, Tuple):
                result = Call(Name("frozenset"), args=[result], keywords=[])
        elif cls is types.ModuleType:
            result = self.build_immutable(x)
        elif cls is types.MethodType:
            result = self.build_method_deep(x, obj_info)
        elif cls is property:
            result = self.build_property(x, obj_info)
        elif cls is types.CodeType:
            # Code objects: rebuild via CodeType(*attrs) like a reduce call.
            result = self.build_from_reduce(
                x, obj_info, cls, map(x.__getattribute__, CODE_ATTRIBUTES), {}
            )
        elif (
            not (self.ignore_deepcopy_method or self.interprocess)
            and (__deepcopy__ := getattr(x, "__deepcopy__", None)) is not None
        ):
            # get a deepcopied version of the object and bake it into the blueprint
            deepcopied = __deepcopy__(SentinelMemo(obj_info.vid, self.objects, ASTObjectInfo))
            if deepcopied is x:  # no need to call __deepcopy__ on each reconstruction
                result = self.build_immutable(x)
            else:
                result = Call(self.store(deepcopied.__deepcopy__), [Dict([], [])], [])
        else:
            rv = get_reduce(x, cls)
            if isinstance(rv, str):  # global name
                result = self.build_immutable(x)
            else:
                func, args, kwargs, state, listiter, dictiter = debunk_reduce(
                    rv, cls=cls, try_simplify_call=True
                )
                result = self.build_from_reduce(
                    x, obj_info, func, args, kwargs, state, listiter, dictiter
                )

        return self.finalize_object(obj_info, result)

    def build_property(self, x: property, obj_info: ASTObjectInfo) -> Call | Attribute | Subscript:
        """Reference a class-level property if possible, else rebuild property(...)."""
        getter = cast("FunctionType", x.fget)
        if "." in getter.__qualname__:
            # Qualname like "Owner.prop": if Owner.prop is exactly this object,
            # emit Owner.prop instead of constructing a new property.
            parent = getter.__globals__[getter.__qualname__.split(".")[0]]
            if getattr(parent, getter.__name__) is x:
                return Attribute(self.build_immutable(parent), getter.__name__)

        return self.build_from_reduce(
            x, obj_info, obj_info.cls, (x.fget, x.fset, x.fdel, x.__doc__), {}
        )

    def build_imports(self) -> list[Import | ImportFrom]:
        """Emit Import/ImportFrom nodes for every stored name not in the default namespace."""
        imports: list[Import | ImportFrom] = []
        for namespace_name, item in self.names.items():
            if item is types.NoneType:
                continue
            if isinstance(item, str):
                # FIXME: temp
                continue
            names = [
                alias(
                    item.__name__
                    if getattr(item, "__name__", None) and item.__name__.isidentifier()
                    else namespace_name,
                    asname=namespace_name
                    if namespace_name != getattr(item, "__name__", None)
                    and getattr(item, "__name__", "<>").isidentifier()
                    else None,
                )
            ]
            if namespace_name not in DEFAULT_NAMESPACE:
                if isinstance(item, types.ModuleType):
                    imports.append(Import(names))
                else:
                    imports.append(ImportFrom(item.__module__, names))
        return imports


# Sentinel for "attribute missing" lookups (getattr default / cache miss).
_nil: Final = object()


def assign_name(name: str, value: E) -> NamedExpr[E]:
    """Wrap `value` in a walrus assignment `name := value`."""
    return NamedExpr(
        target=Name(
            id=name,
            ctx=STORE,
        ),
        value=value,
    )
+ ) + + name = type(x).__name__.lower() + return_value_ast = ast_builder.build_expression(x) + + return create_function( + f"reconstruct_{name}", + [Return(value=return_value_ast)], + ast_builder, + ) + + +def create_function(name: str, body: list[stmt], builder: ASTBuilder) -> FunctionType: + module: Final = Module(body=[function_def := FunctionDef(name=name, body=body)]) + function_def.name = name + filename = f"" + code = compile(module, filename, "exec") # type: ignore[call-overload] + + full_ns = {**globals(), **builder.names} + exec(code, full_ns) # noqa: S102 (this is trusted code that we've just compiled from AST + function: FunctionType = full_ns[name] + function.__module__ = __name__ + function.__ast__ = function_def # type: ignore[attr-defined] + linecache.cache[filename] = (0, None, LazySourceLines(function, filename), filename) # type: ignore[assignment] + return function + + +def reprx(x: T, name: str | None = None) -> str: + """ + Generate a Python source code that would reconstruct object equivalent to `x`. 
+ """ + builder = ASTBuilder({}, ignore_deepcopy_method=True, interprocess=True) + if name is None: + name = f"reconstruct_{type(x).__name__.lower()}" + return_value_ast = builder.build_expression(x) + return ast.unparse( + Module( + body=[ + *builder.build_imports(), + *((builder.build_make_cell_def_ast(),) if builder.cell_helper_required else ()), + FunctionDef( # type: ignore[arg-type] + [ + Return(value=return_value_ast), + ], + name, + ), + Assign([Name("__duper_reconstructor__")], Name(name)), + ] + ) + ) diff --git a/duper/builders/bytecode.py b/duper/builders/bytecode.py new file mode 100644 index 0000000..c062e2a --- /dev/null +++ b/duper/builders/bytecode.py @@ -0,0 +1,917 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +""" +Direct bytecode builder +""" + +from __future__ import annotations + +import array +import builtins +import dis +import opcode +import sys +import types +from functools import partial +from typing import TYPE_CHECKING +from typing import Any +from typing import Final +from typing import Literal +from typing import TypeVar +from typing import cast + +from duper.builders._helpers import SentinelMemo +from duper.builders._helpers import forbid_references +from duper.builders._tracking import BytecodeObjectInfo +from duper.builders.runtime import debunk_reduce +from duper.builders.runtime import get_reduce +from duper.constants import IMMUTABLE_NON_COLLECTIONS +from duper.constants import ImmutableCollectionType +from duper.constants import ImmutableType +from duper.debug import run_in_xpython + +if TYPE_CHECKING: + from collections.abc import Callable + from collections.abc import Iterable + from collections.abc import Sequence +import warnings + +warnings.warn( + "This module is experimental and may lead to python crashes. 
Don't use it on production.", + stacklevel=2, +) + +RAISE_ON_REFLEXIVE: Final = True + +LINETABLE: Final = b"\x00\x01" if sys.version_info >= (3, 10) else b"" + +T = TypeVar("T") + +# Python 3.12+ opcodes +RESUME: Final = dis.opmap["RESUME"] +POP_TOP: Final = dis.opmap["POP_TOP"] +PUSH_NULL: Final = dis.opmap["PUSH_NULL"] +NOP: Final = dis.opmap["NOP"] +COPY: Final = dis.opmap["COPY"] +SWAP: Final = dis.opmap["SWAP"] +RETURN_VALUE: Final = dis.opmap["RETURN_VALUE"] +STORE_NAME: Final = dis.opmap["STORE_NAME"] +DELETE_NAME: Final = dis.opmap["DELETE_NAME"] +UNPACK_SEQUENCE: Final = dis.opmap["UNPACK_SEQUENCE"] +FOR_ITER: Final = dis.opmap["FOR_ITER"] +STORE_ATTR: Final = dis.opmap["STORE_ATTR"] +DELETE_ATTR: Final = dis.opmap["DELETE_ATTR"] +STORE_GLOBAL: Final = dis.opmap["STORE_GLOBAL"] +DELETE_GLOBAL: Final = dis.opmap["DELETE_GLOBAL"] +LOAD_CONST: Final = dis.opmap["LOAD_CONST"] +LOAD_NAME: Final = dis.opmap["LOAD_NAME"] +BUILD_TUPLE: Final = dis.opmap["BUILD_TUPLE"] +BUILD_LIST: Final = dis.opmap["BUILD_LIST"] +BUILD_SET: Final = dis.opmap["BUILD_SET"] +BUILD_MAP: Final = dis.opmap["BUILD_MAP"] +LOAD_ATTR: Final = dis.opmap["LOAD_ATTR"] +COMPARE_OP: Final = dis.opmap["COMPARE_OP"] +IMPORT_NAME: Final = dis.opmap["IMPORT_NAME"] +IMPORT_FROM: Final = dis.opmap["IMPORT_FROM"] +JUMP_FORWARD: Final = dis.opmap["JUMP_FORWARD"] +LOAD_GLOBAL: Final = dis.opmap["LOAD_GLOBAL"] +LOAD_FAST: Final = dis.opmap["LOAD_FAST"] +STORE_FAST: Final = dis.opmap["STORE_FAST"] +DELETE_FAST: Final = dis.opmap["DELETE_FAST"] +LOAD_DEREF: Final = dis.opmap["LOAD_DEREF"] +STORE_DEREF: Final = dis.opmap["STORE_DEREF"] +DELETE_DEREF: Final = dis.opmap["DELETE_DEREF"] +JUMP_BACKWARD: Final = dis.opmap["JUMP_BACKWARD"] +CALL: Final = dis.opmap["CALL"] +KW_NAMES: Final = dis.opmap["KW_NAMES"] +LIST_EXTEND: Final = dis.opmap["LIST_EXTEND"] +SET_UPDATE: Final = dis.opmap["SET_UPDATE"] +DICT_UPDATE: Final = dis.opmap["DICT_UPDATE"] +DICT_MERGE: Final = dis.opmap["DICT_MERGE"] +MAP_ADD: Final = 
dis.opmap["MAP_ADD"] +BINARY_SUBSCR: Final = dis.opmap["BINARY_SUBSCR"] +STORE_SUBSCR: Final = dis.opmap["STORE_SUBSCR"] +LIST_APPEND: Final = dis.opmap["LIST_APPEND"] + +# Special opcodes +CACHE: Final = dis.opmap["CACHE"] +EXTENDED_ARG: Final = dis.opmap["EXTENDED_ARG"] + +_inline_cache_entries: dict[int, int] | Sequence[int] = getattr( + opcode, "_inline_cache_entries", (0,) * 256 +) +if isinstance(_inline_cache_entries, dict): + _inline_cache_entries = [_inline_cache_entries.get(op, 0) for op in range(256)] + +# we could've used fixed-length tuple here, but it's actually slower: +# https://github.com/python/mypy/issues/19537 +INLINE_CACHE_ENTRIES: Final[tuple[tuple[int, ...] | None, ...]] = tuple( + [(0,) * cache_ops for cache_ops in _inline_cache_entries] +) + + +Const = int +Name = int +Varname = int + + +class Bytecode: + def __init__(self, name: str = "") -> None: + self.name = name + self.units: array.array[int] = array.array("H") + self.consts: list[Any] = [] + self.names: list[str] = [] + self.varnames: list[str] = [] + self.const_map: dict[int, Const] = {} + self.const_value_map: dict[tuple[type, Any], Const] = {} + self.name_map: dict[str, Name] = {} + self.var_map: dict[str, Varname] = {} + self.stack_size = 0 + self.max_stack_size = 0 + self._emit(RESUME, 0) + + def _emit_codeunit(self, op: int, arg: int) -> None: + """Emit a single 16-bit code unit.""" + self.units.append((arg & 0xFF) << 8 | (op & 0xFF)) + + def _emit(self, op: int, arg: int | None = None) -> None: + """Emit an opcode with cache slots.""" + if arg is None: + arg = 0 + + hi = arg >> 8 + lo = arg & 0xFF + if hi: + self._emit_codeunit(EXTENDED_ARG, hi) + self._emit_codeunit(op, lo) + + if caches := INLINE_CACHE_ENTRIES[op]: + self.units.extend(caches) + + def add_const(self, value: Any) -> int: + """Add constant to constants table.""" + try: + key: tuple[type[Any], Any] = (type(value), value) + if (idx := self.const_value_map.get(key)) is not None: + return idx + + idx = 
    def add_name(self, name: str) -> int:
        """Add name to names table (deduplicated); return its index."""
        if (idx := self.name_map.get(name)) is not None:
            return idx

        idx = len(self.names)
        self.names.append(name)
        self.name_map[name] = idx
        return idx

    def add_var(self, name: str) -> int:
        """Add variable name to varnames (deduplicated); return its index."""
        if (idx := self.var_map.get(name)) is not None:
            return idx
        idx = len(self.varnames)
        self.varnames.append(name)
        self.var_map[name] = idx
        return idx

    def update_stack(self, delta: int) -> None:
        """Update stack size tracking (current depth and high-water mark)."""
        self.stack_size += delta
        self.max_stack_size = max(self.max_stack_size, self.stack_size)

    def load_const(self, value: Any) -> None:
        """Emit LOAD_CONST for a value."""
        idx = self.add_const(value)
        self._emit(LOAD_CONST, idx)
        self.update_stack(1)

    def load_global(self, name: str) -> None:
        """Emit LOAD_GLOBAL for a global name with NULL."""
        idx = self.add_name(name)
        # LOAD_GLOBAL with low bit set pushes NULL + value for calling
        arg = (idx << 1) | 1
        self._emit(LOAD_GLOBAL, arg)
        self.update_stack(2)  # Pushes NULL + value

    def load_attr(self, name: str, *, is_method: bool = False) -> None:
        """Emit LOAD_ATTR for attribute access."""
        idx = self.add_name(name)
        # In Python 3.12, LOAD_ATTR uses the low bit to indicate method call
        # If low bit is set, it's a method call and will push NULL/self
        # If low bit is clear, it's a regular attribute access
        arg = (idx << 1) | (1 if is_method else 0)
        self._emit(LOAD_ATTR, arg)
        if is_method:
            self.update_stack(1)  # Method calls push an extra NULL/self
        # Stack: obj -> result (+ NULL/self for methods)

    def store_fast(self, name: str) -> None:
        """Store TOS in a local variable."""
        idx = self.add_var(name)
        self._emit(STORE_FAST, idx)
        self.update_stack(-1)

    def load_fast(self, name: str) -> None:
        """Load a local variable."""
        idx = self.add_var(name)
        self._emit(LOAD_FAST, idx)
        self.update_stack(1)

    def build_tuple(self, count: int) -> None:
        """Build tuple from TOS items."""
        self._emit(BUILD_TUPLE, count)
        self.update_stack(-count + 1)

    def build_list(self, count: int) -> None:
        """Build list from TOS items."""
        self._emit(BUILD_LIST, count)
        self.update_stack(-count + 1)

    def build_map(self, count: int) -> None:
        """Build dict from TOS key-value pairs."""
        self._emit(BUILD_MAP, count)
        self.update_stack(-2 * count + 1)

    def build_set(self, count: int) -> None:
        """Build set from TOS items."""
        self._emit(BUILD_SET, count)
        self.update_stack(-count + 1)

    def push_null(self) -> None:
        # NULL placeholder required below the callable for CALL in 3.11+.
        self._emit(PUSH_NULL)
        self.update_stack(1)

    def call(self, argc: int) -> None:
        # Ensure to PUSH_NULL before args
        # TODO: generalize and abstract this away
        self._emit(CALL, argc)
        self.update_stack(-argc - 1)

    def call_method(self, argc: int) -> None:
        # Method calls use CALL, stack has NULL + method + args from LOAD_ATTR is_method=True
        self._emit(CALL, argc)
        self.update_stack(-argc - 2)  # Remove NULL + method + args

    def kw_names(self, names: tuple[str, ...]) -> None:
        """Set keyword argument names for the next CALL."""
        idx = self.add_const(names)
        self._emit(KW_NAMES, idx)
        # KW_NAMES doesn't affect the stack

    def binary_subscr(self) -> None:
        """TOS = TOS1[TOS]."""
        self._emit(BINARY_SUBSCR)
        self.update_stack(-1)

    def store_subscr(self) -> None:
        """TOS1[TOS] = TOS2."""
        self._emit(STORE_SUBSCR)
        self.update_stack(-3)

    def pop_top(self) -> None:
        """Remove TOS."""
        self._emit(POP_TOP)
        self.update_stack(-1)

    def list_extend(self, count: int = 1) -> None:
        """Extend list on TOS1 with iterable TOS."""
        self._emit(LIST_EXTEND, count)
        self.update_stack(-1)

    def set_update(self, count: int = 1) -> None:
        """Update set on TOS1 with iterable TOS."""
        self._emit(SET_UPDATE, count)
        self.update_stack(-1)

    def dict_update(self, count: int = 1) -> None:
        """Update dict on TOS1 with dict/iterable TOS."""
        self._emit(DICT_UPDATE, count)
        self.update_stack(-1)

    def return_value(self) -> None:
        """Return TOS."""
        self._emit(RETURN_VALUE)
        self.update_stack(-1)

    def copy(self, n: int) -> None:
        """Copy the n-th item to TOS."""
        self._emit(COPY, n)
        self.update_stack(1)

    def swap(self, n: int = 2) -> None:
        """Swap TOS with the n-th item."""
        self._emit(SWAP, n)
        # No stack change

    def list_append(self, i: int = 1) -> None:
        """Append TOS to list at TOS-(i+1)."""
        self._emit(LIST_APPEND, i)
        self.update_stack(-1)

    def insert_after_position(self, pos: int, var_name: str) -> None:
        """Insert COPY 1; STORE_FAST after the given position.

        NOTE(review): this splices raw code units, shifting everything after
        `pos` by two units. That appears safe here only because the emitted
        stream contains no jump instructions and positions recorded *before*
        the insertion point stay valid - positions recorded after it would go
        stale. Confirm callers only patch earlier positions.
        """
        # Create new instructions
        copy_unit = (1 << 8) | (COPY & 0xFF)
        idx = self.add_var(var_name)
        store_unit = (idx << 8) | (STORE_FAST & 0xFF)
        self.units.insert(pos + 1, copy_unit)
        self.units.insert(pos + 2, store_unit)

    def to_function(self) -> types.FunctionType:
        """Convert bytecode to a callable function (appends the final RETURN_VALUE)."""
        self.return_value()

        # Create code object
        code = types.CodeType(  # type: ignore[call-arg, unused-ignore]
            0,  # argcount
            0,  # posonlyargcount
            0,  # kwonlyargcount
            len(self.varnames),  # nlocals
            self.max_stack_size + 10,  # stacksize (with buffer)
            3,  # flags (OPTIMIZED | NEWLOCALS)
            self.units.tobytes(),  # codestring
            tuple(self.consts),  # constants
            tuple(self.names),  # names
            tuple(self.varnames),  # varnames
            "",  # filename
            self.name,  # name
            "",  # type: ignore[arg-type, unused-ignore]
            1,  # type: ignore[arg-type, unused-ignore]
            LINETABLE,  # type: ignore[arg-type, unused-ignore]
            b"",  # type: ignore[arg-type, unused-ignore]
            (),  # freevars
            (),  # cellvars
        )

        # Create function
        return types.FunctionType(code, globals(), self.name)
class BytecodeCompiler:
    """Tracks object reconstruction state using single lookup."""

    def __init__(
        self,
        bytecode: Bytecode,
        memo: dict[int, Any] | None,
        *,
        ignore_deepcopy_method: bool,
    ) -> None:
        self.bytecode = bytecode
        # id(obj) -> per-object reconstruction bookkeeping.
        self.objects: dict[int, BytecodeObjectInfo] = {}
        # Optional caller-provided memo: ids listed here are loaded by reference
        # instead of being reconstructed.
        self.keep_by_reference = memo
        # Seed with builtin names so generated locals never shadow builtins.
        self.used_names: set[str] = set(builtins.__dict__)
        self.names: dict[str, Any] = {}  # External objects to inject
        self.ignore_deepcopy_method = ignore_deepcopy_method

    def get_or_create_object_info(self, value: Any) -> BytecodeObjectInfo:
        """Single lookup/creation point. Returns ObjectInfo for direct manipulation."""
        vid = id(value)
        obj_info = self.objects.get(vid)
        if obj_info is None:
            external_value = None
            if self.keep_by_reference is not None:
                external_value = self.keep_by_reference.get(vid)
            obj_info = BytecodeObjectInfo(vid, value, type(value), external_value)
            self.objects[vid] = obj_info
        return obj_info

    def get_name(self, obj_info: BytecodeObjectInfo) -> str:
        """Assigns names and resolves collisions using ObjectInfo directly."""
        if obj_info.name is not None:
            return obj_info.name

        name: str | None
        if (name := getattr(obj_info.value, "__qualname__", None)) is None:
            name = obj_info.cls.__name__.lower()

        # Resolve collisions by suffixing _1, _2, ...
        i = 1
        base_name = name
        while name in self.used_names:
            name = f"{base_name}_{i}"
            i += 1

        obj_info.name = name
        self.used_names.add(name)
        return name

    def check_object_state(self, obj_info: BytecodeObjectInfo) -> str | None:
        """Check if object can be referenced directly or needs reconstruction.

        Returns the variable name that now holds the object (caller should not
        reconstruct), or None on first encounter (caller must reconstruct).
        """
        if obj_info.exclusive_for:
            forbid_references(obj_info.value)

        # Object is stored in a variable - just load it
        if obj_info.var_name is not None:
            self.bytecode.load_fast(obj_info.var_name)
            return obj_info.var_name

        # External reference from memo
        if obj_info.external_value is not None:
            self.bytecode.load_const(obj_info.external_value)
            name = self.get_name(obj_info)
            self.bytecode.store_fast(name)
            obj_info.var_name = name
            return name

        # Currently being reconstructed - this is a self-reference
        if obj_info.is_reconstructing:
            if RAISE_ON_REFLEXIVE and isinstance(obj_info.value, list | set | dict | tuple):
                raise NotImplementedError
            name = self.get_name(obj_info)
            if obj_info.reflective_refs is None:
                obj_info.reflective_refs = []
            obj_info.reflective_refs.append(name)
            self.bytecode.load_fast(name)  # Load the reference (will be filled later)
            return name

        # Second encounter - patch first occurrence to store it
        if obj_info.first_construction_position is not None:
            name = self.get_name(obj_info)
            self.bytecode.insert_after_position(obj_info.first_construction_position, name)
            obj_info.var_name = name
            self.bytecode.load_fast(name)
            return name

        # First encounter - mark it
        obj_info.is_reconstructing = True
        return None

    def finalize_object(self, obj_info: BytecodeObjectInfo, name: str | None = None) -> None:
        """Mark object as fully reconstructed."""
        obj_info.is_reconstructing = False
        if name:
            obj_info.var_name = name
        else:
            # Remember where the constructed value sits so a later second
            # encounter can splice in COPY/STORE_FAST at this position.
            obj_info.first_construction_position = len(self.bytecode.units) - 1

    def emit_immutable(self, x: ImmutableType | ImmutableCollectionType) -> None:
        """Emit bytecode to load an immutable object."""
        # Bytecode allows LOAD_CONST for any object, not just primitives
        self.bytecode.load_const(x)

    def emit_list(self, x: list[Any], obj_info: BytecodeObjectInfo) -> None:
        """Emit bytecode to reconstruct a list."""

        if obj_info.reflective_refs is None:
            # Simple case - build list directly
            for item in x:
                self.emit_expression(item)
            self.bytecode.build_list(len(x))
        else:
            if RAISE_ON_REFLEXIVE:
                raise NotImplementedError
            # Self-referential list - create empty first
            name = self.get_name(obj_info)
            self.bytecode.build_list(0)
            self.bytecode.store_fast(name)
            obj_info.var_name = name

            # Build items tuple
            for item in x:
                self.emit_expression(item)
            self.bytecode.build_tuple(len(x))

            # Call extend
            self.bytecode.load_fast(name)
            self.bytecode.load_attr("extend", is_method=True)
            self.bytecode.swap(2)
            self.bytecode.call_method(1)
            self.bytecode.pop_top()

            # Load result
            self.bytecode.load_fast(name)
    def emit_dict(self, x: dict[Any, Any], obj_info: BytecodeObjectInfo) -> None:
        """Emit bytecode to reconstruct a dict."""

        if obj_info.reflective_refs is None:
            # Simple case
            for key, value in x.items():
                self.emit_expression(key)
                self.emit_expression(value)
            self.bytecode.build_map(len(x))
        else:
            if RAISE_ON_REFLEXIVE:
                raise NotImplementedError
            name = self.get_name(obj_info)
            self.bytecode.build_map(0)
            self.bytecode.store_fast(name)
            obj_info.var_name = name

            # Build keys and values tuples
            for key in x:
                self.emit_expression(key)
            self.bytecode.build_tuple(len(x))

            for value in x.values():
                self.emit_expression(value)
            self.bytecode.build_tuple(len(x))

            # Call update with zip
            self.bytecode.load_fast(name)
            self.bytecode.load_attr("update", is_method=True)
            self.bytecode.load_const(zip)
            self.bytecode.swap(3)
            self.bytecode.swap(2)
            self.bytecode.call(2)
            self.bytecode.call(1)
            self.bytecode.pop_top()

            # Load result
            self.bytecode.load_fast(name)

    def emit_set(self, x: set[Any], obj_info: BytecodeObjectInfo) -> None:
        """Emit bytecode to reconstruct a set."""

        if obj_info.reflective_refs is None:
            # Simple case
            for item in x:
                self.emit_expression(item)
            self.bytecode.build_set(len(x))
        else:
            if RAISE_ON_REFLEXIVE:
                raise NotImplementedError
            name = self.get_name(obj_info)
            self.bytecode.build_set(0)
            self.bytecode.store_fast(name)
            obj_info.var_name = name

            # Build items tuple
            for item in x:
                self.emit_expression(item)
            self.bytecode.build_tuple(len(x))

            # Call update
            self.bytecode.load_fast(name)
            self.bytecode.load_attr("update", is_method=True)
            self.bytecode.swap(2)
            self.bytecode.call_method(1)
            self.bytecode.pop_top()

            # Load result
            self.bytecode.load_fast(name)

    def emit_tuple(self, x: tuple[Any, ...] | frozenset[Any], obj_info: BytecodeObjectInfo) -> None:
        """Emit bytecode to reconstruct a tuple or frozenset."""

        # For tuples, check if it's immutable and can be loaded as const
        if isinstance(x, tuple) and is_deeply_immutable(x):  # I'd prefer only one pass,
            self.emit_immutable(x)
            return

        # Check for self-references
        if obj_info.reflective_refs is None:
            # Simple case
            if isinstance(x, frozenset):
                self.bytecode.push_null()
                self.bytecode.load_const(frozenset)
                for item in x:
                    self.emit_expression(item)
                self.bytecode.build_tuple(len(x))
                self.bytecode.call(1)
            else:
                for item in x:
                    self.emit_expression(item)
                self.bytecode.build_tuple(len(x))
        else:
            if RAISE_ON_REFLEXIVE:
                raise NotImplementedError
            # NOTE(review): this branch STORE_SUBSCRs into a tuple, which would
            # raise TypeError at runtime; unreachable while RAISE_ON_REFLEXIVE
            # is True, but confirm before enabling reflexive support.
            tuple_name = self.get_name(obj_info)

            # First pass: build tuple with None placeholders for self-refs
            ref_positions = []
            for i, item in enumerate(x):
                if obj_info.reflective_refs and any(
                    ref == tuple_name for ref in obj_info.reflective_refs
                ):
                    self.bytecode.load_const(None)
                    ref_positions.append(i)
                else:
                    self.emit_expression(item)

            self.bytecode.build_tuple(len(x))
            self.bytecode.store_fast(tuple_name)
            obj_info.var_name = tuple_name

            # Now inject the actual tuple into the positions
            for pos in ref_positions:
                # Load the object that referenced the tuple
                self.bytecode.load_fast(tuple_name)
                self.bytecode.load_const(pos)
                self.bytecode.load_fast(tuple_name)
                self.bytecode.store_subscr()

            # Load final result
            self.bytecode.load_fast(tuple_name)

    def emit_setstate_call(self, var: str, state: Any) -> None:
        """Emit bytecode for obj.__setstate__(state)."""
        self.bytecode.load_fast(var)
        self.bytecode.load_attr("__setstate__", is_method=True)
        self.emit_expression(state)
        self.bytecode.call_method(1)
        self.bytecode.pop_top()

    def emit_slot_setattrs(self, var: str, slot_state: dict[Any, Any]) -> None:
        """Emit bytecode for setattr calls on slots."""
        for k, v in slot_state.items():
            self.bytecode.push_null()
            self.bytecode.load_const(setattr)
            self.bytecode.load_fast(var)
            self.bytecode.load_const(k)
            self.emit_expression(v)
            self.bytecode.call(3)
            self.bytecode.pop_top()

    def emit_dict_update(self, var: str, dict_state: dict[str, Any]) -> None:
        """Emit bytecode for obj.__dict__.update(state)."""
        self.bytecode.load_fast(var)
        self.bytecode.load_attr("__dict__", is_method=False)  # Regular attribute access
        self.emit_expression(dict_state)
        self.bytecode.dict_update()
        self.bytecode.pop_top()

    def emit_list_appends(self, var: str, listiter: Iterable[Any]) -> None:
        """Emit bytecode for list comprehension that appends items."""
        for item in listiter:
            self.bytecode.load_fast(var)
            self.bytecode.load_attr("append", is_method=True)
            self.emit_expression(item)
            self.bytecode.call_method(1)
            self.bytecode.pop_top()

    def emit_dict_setitems(self, var: str, dictiter: Iterable[tuple[Any, Any]]) -> None:
        """Emit bytecode for dict comprehension that sets items."""

        # Convert dictiter to list first
        items = list(dictiter)

        # Build the list of tuples
        for key, value in items:
            self.emit_expression(value)
            self.emit_expression(key)
            self.bytecode.load_fast(var)
            self.bytecode.load_attr("__setitem__", is_method=True)
            self.bytecode.swap(3)  # -> ["value", "__setitem__", "NULL", "key"]
            self.bytecode.swap(2)  # -> ["value", "__setitem__", "key", "NULL"]
            self.bytecode.swap(4)  # -> ["NULL", "__setitem__", "key", "value"]
            self.bytecode.call_method(2)
            self.bytecode.pop_top()
    def emit_from_reduce(
        self,
        x: Any,
        obj_info: BytecodeObjectInfo,
        func: Callable[..., Any],
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
        state: Any = None,
        listiter: Iterable[Any] | None = None,
        dictiter: Iterable[tuple[Any, Any]] | None = None,
    ) -> None:
        """Emit bytecode to reconstruct object using __reduce__ protocol.

        NOTE(review): in the `.__new__` fast paths below, `args[0]` (the class)
        is loaded as the LOAD_ATTR method receiver AND re-emitted inside the
        `for arg in args` loop - verify the class isn't passed twice to
        `__new__`. Also, the simple-case `__new__` branch uses `call()` while
        the complex case uses `call_method()`; both emit CALL, so only the
        stack-size accounting differs (absorbed by the +10 stacksize buffer).
        """

        if state is None and listiter is None and dictiter is None:
            # Simple case - just call the function
            if hasattr(func, "__qualname__") and func.__qualname__.endswith(".__new__"):
                # Special handling for __new__ methods - load via LOAD_ATTR
                if args and hasattr(args[0], "__name__"):
                    self.bytecode.load_const(args[0])  # Load the class
                    self.bytecode.load_attr("__new__", is_method=True)  # Load __new__ as method
                    # Load all arguments (including args[0] as first argument to method)
                    for arg in args:
                        self.emit_expression(arg)
                    # Handle keyword arguments
                    if kwargs:
                        for value in kwargs.values():
                            self.emit_expression(value)
                        self.bytecode.kw_names(tuple(kwargs.keys()))
                        self.bytecode.call(len(args) + len(kwargs))
                    else:
                        self.bytecode.call(len(args))
                else:
                    self.bytecode.push_null()
                    self.bytecode.load_const(func)
                    for arg in args:
                        self.emit_expression(arg)
                    self.bytecode.call(len(args))
            elif isinstance(func, type):
                # Class constructor - use regular function call
                self.bytecode.push_null()
                self.bytecode.load_const(func)
                for arg in args:
                    self.emit_expression(arg)
                # Handle keyword arguments for classes
                if kwargs:
                    for value in kwargs.values():
                        self.emit_expression(value)
                    self.bytecode.kw_names(tuple(kwargs.keys()))
                    self.bytecode.call(len(args) + len(kwargs))
                else:
                    self.bytecode.call(len(args))
            else:
                self.bytecode.push_null()
                self.bytecode.load_const(func)
                for arg in args:
                    self.emit_expression(arg)
                self.bytecode.call(len(args))

            return

        # Complex case - need to store intermediate result
        name = self.get_name(obj_info)

        # Call constructor
        if hasattr(func, "__qualname__") and func.__qualname__.endswith(".__new__"):
            # Special handling for __new__ methods - load via LOAD_ATTR
            if args and hasattr(args[0], "__name__"):
                self.bytecode.load_const(args[0])  # Load the class
                self.bytecode.load_attr("__new__", is_method=True)  # Load __new__ as method
                # Load all arguments (including args[0] as first argument to method)
                for arg in args:
                    self.emit_expression(arg)
                # Handle keyword arguments
                if kwargs:
                    for value in kwargs.values():
                        self.emit_expression(value)
                    self.bytecode.kw_names(tuple(kwargs))
                    self.bytecode.call_method(len(args) + len(kwargs))
                else:
                    self.bytecode.call_method(len(args))
            else:
                # Fallback to regular approach
                self.bytecode.push_null()
                self.bytecode.load_const(func)
                for arg in args:
                    self.emit_expression(arg)
                self.bytecode.call(len(args))
        elif isinstance(func, type):
            # Class constructor - use regular function call
            self.bytecode.push_null()
            self.bytecode.load_const(func)
            for arg in args:
                self.emit_expression(arg)
            # Handle keyword arguments for classes
            if kwargs:
                for value in kwargs.values():
                    self.emit_expression(value)
                self.bytecode.kw_names(tuple(kwargs.keys()))
                self.bytecode.call(len(args) + len(kwargs))
            else:
                self.bytecode.call(len(args))
        else:
            self.bytecode.push_null()
            self.bytecode.load_const(func)
            for arg in args:
                self.emit_expression(arg)
            self.bytecode.call(len(args))
        self.bytecode.store_fast(name)

        # Apply state
        if state is not None:
            if hasattr(x, "__setstate__"):
                self.emit_setstate_call(name, state)
            else:
                # No __setstate__: state is either a (dict, slots) pair or a plain dict.
                if isinstance(state, tuple) and len(state) == 2:
                    dict_state, slot_state = cast(
                        "tuple[dict[str, Any] | None, dict[str, Any] | None]", state
                    )
                    if slot_state is not None:
                        self.emit_slot_setattrs(name, slot_state)
                else:
                    dict_state = state

                if dict_state is not None:
                    self.emit_dict_update(name, dict_state)

        # Apply list items
        if listiter is not None:
            self.emit_list_appends(name, listiter)

        # Apply dict items
        if dictiter is not None:
            self.emit_dict_setitems(name, dictiter)

        # Load final result
        self.bytecode.load_fast(name)

    def emit_method_deep(self, x: types.MethodType) -> None:
        """Emit bytecode to reconstruct a bound method."""
        # Use getattr to create bound method from instance
        self.emit_expression(x.__self__)
        self.bytecode.load_attr(x.__func__.__name__, is_method=False)

    def emit_expression(self, x: Any) -> None:
        """Main dispatcher for object reconstruction."""
        cls = type(x)

        # Immutable types
        if cls in IMMUTABLE_NON_COLLECTIONS or issubclass(cls, type):
            self.emit_immutable(x)
            return

        # Empty object()
        if cls is object:
            self.bytecode.push_null()
            self.bytecode.load_const(object)
            self.bytecode.call(0)
            return

        # Single lookup/creation point
        obj_info = self.get_or_create_object_info(x)

        # Check if we can reference directly
        existing = self.check_object_state(obj_info)
        if existing is not None:
            return

        # Dispatch based on type - all functions now take both x and obj_info
        if cls is dict:
            self.emit_dict(x, obj_info)
        elif cls is list:
            self.emit_list(x, obj_info)
        elif cls is set:
            self.emit_set(x, obj_info)
        elif cls is tuple or cls is frozenset:
            self.emit_tuple(x, obj_info)
        elif cls is types.ModuleType:
            self.emit_immutable(x)
        elif cls is types.MethodType:
            self.emit_method_deep(x)
        elif (
            not self.ignore_deepcopy_method
            and (__deepcopy__ := getattr(x, "__deepcopy__", None)) is not None
        ):
            # Respect a custom __deepcopy__: deep-copy once at build time, then
            # emit a call to the copy's own __deepcopy__ at reconstruction time.
            deepcopied = __deepcopy__(SentinelMemo(obj_info.vid, self.objects, BytecodeObjectInfo))
            self.bytecode.load_const(deepcopied)
            self.bytecode.load_attr("__deepcopy__", is_method=True)
            self.bytecode.build_map(0)  # Empty memo dict
            self.bytecode.call_method(1)
        else:
            # Standard reduce protocol
            rv = get_reduce(x, cls)
            if isinstance(rv, str):
                # Global reference
                self.emit_immutable(x)
            else:
                func, args, kwargs, state, listiter, dictiter = debunk_reduce(
                    rv, cls=cls, try_simplify_call=True
                )
                self.emit_from_reduce(x, obj_info, func, args, kwargs, state, listiter, dictiter)

        self.finalize_object(obj_info)


def is_deeply_immutable(obj: Any) -> bool:
    """Check if an object is deeply immutable (including nested tuples)."""
    if obj is None or type(obj) in IMMUTABLE_NON_COLLECTIONS:
        return True
    if isinstance(obj, tuple | frozenset):
        return all(is_deeply_immutable(item) for item in obj)
    return False
+ return False + + +def build_reconstructor( + x: T, + keep_by_reference: dict[int, Any] | None = None, + *, + ignore_deepcopy_method: bool = False, + runtime: Literal["cpython", "xpython"] = "cpython", +) -> Callable[[], T]: + """ + Compiles bytecode that reconstructs `x` from atomic types. + """ + bytecode = Bytecode(f"reconstruct_{type(x).__name__}") + compiler = BytecodeCompiler( + bytecode, memo=keep_by_reference, ignore_deepcopy_method=ignore_deepcopy_method + ) + compiler.emit_expression(x) + if runtime == "cpython": + return bytecode.to_function() + return partial(run_in_xpython, bytecode.to_function()) diff --git a/duper/builders/runtime.py b/duper/builders/runtime.py new file mode 100644 index 0000000..a1890bc --- /dev/null +++ b/duper/builders/runtime.py @@ -0,0 +1,331 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +"""Runtime instructions to reproduce an object""" + +from __future__ import annotations + +import copyreg +import types +from collections.abc import Callable +from collections.abc import Iterable +from collections.abc import Iterator +from collections.abc import MutableMapping +from collections.abc import MutableSequence +from copy import Error +from copyreg import __newobj__ # type: ignore[attr-defined] +from copyreg import __newobj_ex__ # type: ignore[attr-defined] +from typing import Any +from typing import Final +from typing import TypeVar +from typing import cast + +from duper.constants import BUILTIN_COLLECTIONS +from duper.constants import IMMUTABLE_NON_COLLECTIONS + +T = TypeVar("T", bound=Any) + + +def returns(x: T) -> T: + return x + + +_nil: Final = object + +State = Any | tuple[dict[str, Any], dict[str, Any]] | None + + +def reconstruct_state( + new_obj: T, + state: State = None, + listiter: Iterable[Any] | None = None, + dictiter: Iterable[tuple[Any, Any]] | None = None, +) -> T: + if state is not None: + if (setstate := getattr(new_obj, "__setstate__", None)) is not 
def reconstruct_state(
    new_obj: T,
    state: State = None,
    listiter: Iterable[Any] | None = None,
    dictiter: Iterable[tuple[Any, Any]] | None = None,
) -> T:
    """Apply the trailing parts of a __reduce__ tuple to a freshly built object."""
    if state is not None:
        if (setstate := getattr(new_obj, "__setstate__", None)) is not None:
            setstate(state)
        else:
            # No __setstate__: state is either a (dict, slots) pair or a plain dict.
            if isinstance(state, tuple) and len(state) == 2:
                dict_state, slot_state = state
                if slot_state is not None:
                    for key, value in slot_state.items():
                        setattr(new_obj, key, value)
            else:
                dict_state = cast("dict[str, Any]", state)

            if dict_state is not None:
                new_obj.__dict__.update(dict_state)

    if listiter is not None:
        supports_append = cast("MutableSequence[Any]", new_obj)
        for item in listiter:
            supports_append.append(item)
    if dictiter is not None:
        supports_setitem = cast("MutableMapping[Any, Any]", new_obj)
        for key, value in dictiter:
            supports_setitem[key] = value

    return new_obj


def reconstruct_copy(
    func: Callable[..., T],
    args: Any,
    kwargs: Any,
    state: Any = None,
    listiter: Iterable[Any] | None = None,
    dictiter: Iterable[tuple[Any, Any]] | None = None,
) -> T:
    """Build an object from debunked __reduce__ parts: call, then apply state."""
    reconstruct_state(new_obj := func(*args, **kwargs), state, listiter, dictiter)
    return new_obj


# The 2-to-5 element tuple shapes __reduce__/__reduce_ex__ may return.
ReduceTuple = (
    tuple[Callable[..., Any], tuple[Any, ...]]
    | tuple[Callable[..., Any], tuple[Any, ...], Any | None]
    | tuple[Callable[..., Any], tuple[Any, ...], Any | None, Iterator[Any] | None]
    | tuple[
        Callable[..., Any],
        tuple[Any, ...],
        Any | None,
        Iterator[Any] | None,
        Iterator[tuple[Any, Any]] | None,
    ]
)


def get_reduce(x: Any, cls: type[Any]) -> str | ReduceTuple:
    """Fetch reduce info: copyreg dispatch table first, then protocol-4 reduce."""
    if custom_reduce := copyreg.dispatch_table.get(cls):
        return custom_reduce(x)
    if (__reduce_ex__ := getattr(x, "__reduce_ex__", None)) is not None:
        return cast("tuple[Any, ...] | str", __reduce_ex__(4))
    if __reduce__ := getattr(x, "__reduce__", None):
        return cast("tuple[Any, ...] | str", __reduce__())
    raise Error(f"un(deep)copyable object of type {cls}")


def debunk_reduce(
    rv: ReduceTuple, cls: type[Any], *, try_simplify_call: bool = False
) -> tuple[
    Callable[..., Any],
    tuple[Any, ...],
    dict[str, Any],
    Any,
    Iterator[Any] | None,
    Iterator[tuple[Any, Any]] | None,
]:
    """Normalize a 2-5 element reduce tuple into (func, args, kwargs, state, listiter, dictiter)."""
    func, args = rv[0], rv[1]
    # mypy cannot understand length = len(rv) narrowing :(
    if len(rv) == 2:
        state = listiter = dictiter = None
    elif len(rv) == 3:
        state = rv[2]
        listiter = dictiter = None
    elif len(rv) == 4:
        state = rv[2]
        listiter = rv[3]
        dictiter = None
    else:
        state = rv[2]
        listiter = rv[3]
        dictiter = rv[4]

    # __newobj__ and __newobj_ex__ are special wrapper functions
    # getting rid of them saves us from one extra call on stack
    kwargs: dict[str, Any]
    if func is __newobj_ex__:
        cls, args, kwargs = args
        args = (cls, *args)
        func = cls.__new__
    elif func is __newobj__:
        func = (cls := args[0]).__new__
        kwargs = {}
    else:
        kwargs = {}

    if cls is object or (
        try_simplify_call
        # convert A.__new__(A) to just A()
        # doesn't make sense to do it when reconstructing object on the fly
        # but might be worth for repeated reconstruction
        and (
            func is cls.__new__
            # A() triggers: type(A).__call__() -> A.__init__(A.__new__(A))
            # so we want to make sure there's no custom logic in here
            # by verifying that these were not overwritten
            and (cls.__init__ is object.__init__ and type(cls).__call__ is type.__call__)
        )
    ):
        # Drop the leading class argument: caller will invoke `cls(*args[1:])`.
        return cls, args[1:], kwargs, state, listiter, dictiter
    return func, args, kwargs, state, listiter, dictiter


Memo = dict[int, Any]


def _deepcopy(obj: T, memo: Memo) -> T:
    # Atomic immutables (and classes) are returned as-is, never memoized.
    if (cls := type(obj)) in IMMUTABLE_NON_COLLECTIONS or issubclass(cls, type):
        return obj

    vid = id(obj)
    if (existing_copy := memo.get(vid, _nil)) is not _nil:
        return cast("T", existing_copy)

    # Fast path: empty builtin collections can be rebuilt by calling the type.
    if cls in BUILTIN_COLLECTIONS and not obj:  # type: ignore[comparison-overlap]
        empty_instance = cls()
memo[vid] = empty_instance + memo[id(memo)].append(obj) + return cast("T", empty_instance) + + if cls is dict: + result_dict: dict[Any, Any] = {} + memo[vid] = result_dict # memoize before recursion + for original_key, original_value in obj.items(): + copied_key = _deepcopy(original_key, memo) + copied_value = _deepcopy(original_value, memo) + result_dict[copied_key] = copied_value + + memo[id(memo)].append(obj) + return result_dict # type: ignore[return-value] + + if cls is list: + result_list: list[Any] = [] + memo[vid] = result_list # memoize before recursion + for original_item in obj: + result_list.append(_deepcopy(original_item, memo)) # noqa: PERF401 + memo[id(memo)].append(obj) + return result_list # type: ignore[return-value] + + if cls is set: + result_set: set[Any] = set() + memo[vid] = result_set # memoize before recursion + for original_item in obj: + result_set.add(_deepcopy(original_item, memo)) + memo[id(memo)].append(obj) + return result_set # type: ignore[return-value] + + if cls is tuple: + changed = False + items: list[Any] = [] + for original_item in obj: + copied_item = _deepcopy(original_item, memo) + if copied_item is not original_item: + changed = True + items.append(copied_item) + + if not changed: + return obj + + if (already_copied := memo.get(vid, _nil)) is not _nil: + return cast("T", already_copied) + + result_tuple = tuple(items) + memo[vid] = result_tuple + memo[id(memo)].append(obj) + return result_tuple # type: ignore[return-value] + + if cls is frozenset: + changed = False + items_list: list[Any] = [] + for original_item in obj: + copied_item = _deepcopy(original_item, memo) + if copied_item is not original_item: + changed = True + items_list.append(copied_item) + + if not changed: + return obj + + if (already_copied := memo.get(vid, _nil)) is not _nil: + return cast("T", already_copied) + + copied = frozenset(items_list) + memo[vid] = copied + memo[id(memo)].append(obj) + return copied # type: ignore[return-value] + + if cls is 
types.ModuleType: + return obj + + if cls is types.MethodType: + bound_self_copied = _deepcopy(obj.__self__, memo) + result_method = types.MethodType( + cast("Callable[..., object]", obj.__func__), + bound_self_copied, + ) + memo[vid] = result_method + memo[id(memo)].append(obj) + return result_method # type: ignore[return-value] + + if cls is object: # doesn't hurt at this point + memo[vid] = new_object = object() + memo[id(memo)].append(obj) + return new_object # type: ignore[return-value] + + deep_copy_hook = getattr(obj, "__deepcopy__", None) + if deep_copy_hook is not None: + result: T = deep_copy_hook(memo) + memo[vid] = result + memo[id(memo)].append(obj) + return result + + rv = get_reduce(obj, cls) + if isinstance(rv, str): + return obj + + args: tuple[Any, ...] + func, args = rv[0], rv[1] + # mypy cannot understand length = len(rv) narrowing :( + if len(rv) == 2: + state = listiter = dictiter = None + elif len(rv) == 3: + state = rv[2] + listiter = dictiter = None + elif len(rv) == 4: + state = rv[2] + listiter = rv[3] + dictiter = None + else: + state = rv[2] + listiter = rv[3] + dictiter = rv[4] + + if args == (): + new_object = cast("T", func()) + else: + new_object = cast("T", func(*[_deepcopy(arg, memo) for arg in args])) + + memo[vid] = new_object # memoize before state reconstruction + + if state is not None: + setstate = getattr(new_object, "__setstate__", None) + if setstate is not None: + setstate(_deepcopy(state, memo)) + else: + if isinstance(state, tuple) and len(state) == 2: + dict_state, slot_state = state + if slot_state is not None: + for attribute_name, attribute_value in slot_state.items(): + setattr( + new_object, + attribute_name, + _deepcopy(attribute_value, memo), + ) + else: + dict_state = cast("dict[str, Any]", state) + + if dict_state is not None: + new_object.__dict__.update(_deepcopy(dict_state, memo)) + + if listiter is not None: + sequence_interface = cast("MutableSequence[Any]", new_object) + for sequence_item in listiter: + 
sequence_interface.append(_deepcopy(sequence_item, memo)) + + if dictiter is not None: + mapping_interface = cast("MutableMapping[Any, Any]", new_object) + for mapping_key, mapping_value in dictiter: + mapping_interface[mapping_key] = _deepcopy(mapping_value, memo) + + return new_object diff --git a/duper/constants.py b/duper/constants.py index 8dda051..d2a332b 100644 --- a/duper/constants.py +++ b/duper/constants.py @@ -1,15 +1,24 @@ -# SPDX-FileCopyrightText: 2023 Bobronium +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) # # SPDX-License-Identifier: MPL-2.0 +import ast +import fractions +import numbers +import re import types import weakref +from decimal import Decimal +from typing import TYPE_CHECKING from typing import Any from typing import Final -from typing import Union +from typing import cast +if TYPE_CHECKING: + from _ast import FunctionDef -IMMUTABLE_NON_COLLECTIONS: Final = frozenset( +# NON_COLLECTIONS should really be called "atomic" +IMMUTABLE_NON_COLLECTIONS_CONST_TYPES: Final = frozenset( { type(None), type(Ellipsis), @@ -20,36 +29,83 @@ complex, bytes, str, - types.CodeType, + } +) +IMMUTABLE_NON_COLLECTIONS: Final = frozenset( + { + *IMMUTABLE_NON_COLLECTIONS_CONST_TYPES, + types.BuiltinFunctionType, + types.FunctionType, type, range, + property, + weakref.ref, + types.CodeType, + re.Pattern, + Decimal, + fractions.Fraction, + numbers.Complex, + numbers.Number, + numbers.Rational, + numbers.Real, + numbers.Integral, + } +) +IMMUTABLE_CONSTANTS_OR_IMPORTABLE: Final = frozenset( + { + *IMMUTABLE_NON_COLLECTIONS_CONST_TYPES, types.BuiltinFunctionType, types.FunctionType, - weakref.ref, - property, + type, } ) -IMMUTABLE_TYPES: Final = frozenset({*IMMUTABLE_NON_COLLECTIONS, tuple, frozenset, slice}) -BUILTIN_COLLECTIONS: Final = frozenset({dict, list, set, tuple, frozenset}) +CONST_COLLECTIONS: Final = frozenset({tuple, frozenset, slice}) +IMMUTABLE_TYPES: Final = frozenset({*IMMUTABLE_NON_COLLECTIONS, *CONST_COLLECTIONS}) 
+CONST_TYPES = frozenset({*IMMUTABLE_NON_COLLECTIONS_CONST_TYPES, *CONST_COLLECTIONS}) +BUILTIN_COLLECTIONS: Final = frozenset({dict, list, set, tuple, frozenset, bytearray}) BUILTIN_MUTABLE: Final = frozenset({bytearray, dict, list, set}) -BuiltinMutableType = Union[bytearray, dict[Any, Any], list[Any], set[Any]] -BuiltinCollectionType = Union[dict[Any, Any], list[Any], set[Any], tuple[Any, ...], frozenset[Any]] -ImmutableCollectionType = Union[tuple[Any, ...], frozenset, slice] -ImmutableType = Union[ - type[None], - type[Any], - int, - float, - bool, - complex, - bytes, - str, - types.CodeType, - type, - range, - types.BuiltinFunctionType, - types.FunctionType, - weakref.ref, - property, -] +BuiltinMutableType = bytearray | dict[Any, Any] | list[Any] | set[Any] +BuiltinCollectionType = ( + dict[Any, Any] | list[Any] | set[Any] | tuple[Any, ...] | frozenset[Any] | bytearray +) +ImmutableCollectionType = tuple[Any, ...] | frozenset[Any] | slice +ImmutableType = ( + type[None] + | type[Any] + | int + | float + | bool + | complex + | bytes + | str + | types.CodeType + | type + | range + | types.BuiltinFunctionType + | types.FunctionType + | weakref.ref[Any] + | property +) +IMMORTAL_OBJECTS = { + None, + True, + False, + (), + frozenset(), + *range(-5, 257), # Small integers that CPython interns + *[chr(i) for i in range(256)], # ASCII characters + "", + " ", + "\n", + "\t", + "\r", # Common strings +} +_code_definition: Final = cast( + "FunctionDef", + ast.parse(f"def code{types.CodeType.__text_signature__}: pass").body[0], +) +CODE_ATTRIBUTES: Final = tuple( + {"codestring": "co_code", "constants": "co_consts"}.get(arg.arg, f"co_{arg.arg}") + for arg in _code_definition.args.posonlyargs +) diff --git a/duper/copy.py b/duper/copy.py new file mode 100644 index 0000000..3d80071 --- /dev/null +++ b/duper/copy.py @@ -0,0 +1,235 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import 
annotations + +import copy as builtin_copy +from collections.abc import Callable +from functools import partial +from typing import Any +from typing import NoReturn +from typing import Protocol +from typing import TypeVar +from typing import cast + +from duper import _msg +from duper import _replicate +from duper.builders.ast import build_reconstructor +from duper.builders.runtime import _deepcopy +from duper.builders.runtime import debunk_reduce +from duper.builders.runtime import get_reduce +from duper.builders.runtime import reconstruct_copy +from duper.builders.runtime import returns +from duper.constants import BUILTIN_COLLECTIONS +from duper.constants import BUILTIN_MUTABLE +from duper.constants import IMMUTABLE_NON_COLLECTIONS +from duper.constants import IMMUTABLE_TYPES +from duper.constants import BuiltinMutableType + +T = TypeVar("T") + +Constructor = Callable[[], T] +Memo = dict[int, Any] + + +class Builder(Protocol[T]): + def __call__( + self, + x: T, + *, + keep_by_reference: dict[int, Any] | None = None, + ignore_deepcopy_method: bool = False, + ) -> Constructor[T]: ... + + +class Error(builtin_copy.Error): + """ + Builtin copy module can rise either copy.Error or other exceptions that happens during copying. + + Duper will do its best to always rise duper.Error subclasses. 
+ """ + + +def warn( + obj: T, memo: Any, factory: Callable[[T], Callable[[], T]], error: Exception +) -> Callable[[], T]: + import warnings + + warnings.warn( + f"\nCan't use {factory.__name__[60:]} to copy this {type(obj).__name__}: {error}" + f"\nFalling back to standard deepcopy()" + f"\nNote: such fallbacks may be slow, if they happen too often, " + f"consider using duper.deepcopy() directly", + RuntimeWarning, + stacklevel=3, + ) + return partial(_deepcopy, obj, memo or {}) + + +def fail(obj: T, _: Any, factory: Builder[T], error: Exception) -> NoReturn: + __tracebackhide__ = True + + raise Error( + f"Can't use `{_msg.repr(reconstructs)}(..., factory={_msg.repr(factory)})` " + f"to copy this {_msg.repr(obj)[300:]}:" + "\n" + " " * (len(_msg.repr(Error)) + 3) + f"{error!r}" + f"\n\nTip: `{_msg.repr(reconstructs)}(..., fallback={_msg.repr(warn)})` " + f"will fallback to standard deepcopy on errors" + ) from error + + +def reraise(_: T, __: Any, ___: Builder[T], ____: Exception) -> NoReturn: + raise # noqa: PLE0704 + + +def reconstructs( + obj: T, + /, + *, + builder: Builder[T] = build_reconstructor, + ignore_deepcopy_method: bool = False, + fallback: Callable[..., Callable[[], T]] = fail, + check: bool = True, + _keep_by_reference: Memo | None = None, +) -> Callable[[], T]: + """ + Finds the fastest way of deep-copying an object. + + If object is immutable, it will be returned as is. + If it's an empty builtin collection, it will return its class (list, dict, etc.) + + If obj is non-empty builtin collection, it will check if all values + + Then it will check for __deepcopy__ method and will use it, if it's defined. + + Constructs a factory that knows how to reconstruct an object _fast_. 
+ + :param obj: object to reconstruct + :param builder: an internal factory that will do the work if we + :param fallback: + :param check: + :param _keep_by_reference: (experimental) can be used to avoid reconstruction + of some values and instead reference them + :param ignore_deepcopy_method: if True, custom __deepcopy__ will be ignored. + Instead, objects will be reconstructed via __reduce__ + """ + + if ( + _keep_by_reference is not None and id(obj) in _keep_by_reference + ): # quite an exotic case, but should be supported, I guess + return partial(returns, _keep_by_reference[id(obj)]) + + if (cls := cast("type[Any]", type(obj))) in IMMUTABLE_NON_COLLECTIONS or issubclass(cls, type): + return partial(returns, obj) + + # special case for empty collections. should also work for empty tuples since they are constant + if (cls in BUILTIN_COLLECTIONS) and not obj: + return cls + + if not ignore_deepcopy_method and (cp := getattr(obj, "__deepcopy__", None)) is not None: + # seems like we can't speed things up here + # being consistent with builtin deepcopy is better + # than being just faster + return partial(cp({}).__deepcopy__, {}) + + try: + compiled = builder( + obj, + keep_by_reference=_keep_by_reference, + ignore_deepcopy_method=ignore_deepcopy_method, + ) + if not check: + return compiled + try: + compiled() + except Exception as e: + raise Error( + f"Cannot reconstruct this object (see details above): {e}, {repr(compiled)[:300]=}" + ) from e + else: + return compiled + except Exception as e: # noqa: BLE001 + return fallback(obj, None, builder, e) + + +def copies(obj: T) -> Callable[[], T]: + """ + Finds the fastest way to repeatedly copy an `obj` and returns copy factory + """ + # handle two special cases when we don't need to build any fancy reconstructor + if (cls := cast("type[Any]", type(obj))) in IMMUTABLE_TYPES or issubclass(cls, type): + return partial(returns, obj) # can just always return the same object + + if cls in BUILTIN_COLLECTIONS and not obj: 
+ return cls # special case for empty collections + + if cls in BUILTIN_MUTABLE: + return cast("Callable[[], T]", cast("BuiltinMutableType", obj).copy().copy) + if cp := getattr(obj, "__copy__", None): + return cast("Callable[[], T]", cp().__copy__) + + rv = get_reduce(obj, cls) + if isinstance(rv, str): + return partial(returns, obj) + + func, args, kwargs, state, listiter, dictiter = debunk_reduce(rv, cls=cls) + if state is not None or listiter is not None or dictiter is not None: + return partial(reconstruct_copy, func, args, kwargs, state, listiter, dictiter) + + return partial(func, *args, **kwargs) + + +def copy(obj: T) -> T: + """ + Faster drop-in replacement for copy.copy(). + """ + if (cls := cast("type[Any]", type(obj))) in IMMUTABLE_TYPES or issubclass(cls, type): + return obj + + if cls in BUILTIN_COLLECTIONS and not obj: + return cast("T", cls()) # special case for empty collections + + if cls in BUILTIN_MUTABLE: + return cast("T", cast("BuiltinMutableType", obj).copy()) + + if cp := getattr(obj, "__copy__", None): + return cast("T", cast("Callable[[], T]", cp())) + + rv = get_reduce(obj, cls) + if isinstance(rv, str): + return obj + + func, args, kwargs, state, listiter, dictiter = debunk_reduce(rv, cls=cls) + + if state is not None or listiter is not None or dictiter is not None: + return cast("T", reconstruct_copy(func, args, kwargs, state, listiter, dictiter)) + + return cast("T", func(*args, **kwargs)) + + +def deepcopy(x: T, memo: Memo | None = None) -> T: + """ + Faster drop-in replacement for copy.deepcopy. + """ + if memo is None: + memo = {} + + memo.setdefault(id(memo), []) + + return _deepcopy(x, memo) + + +def replicate(obj: T, /, n: int, compile_after: int = 20) -> list[T]: + """ + Returns list of `n` deep copies of the `obj`. + + if `n` > `compile_after`, it compiles a minimal set of instructions to reproduce the object. + Compilation takes time, but pays off by much faster copying. 
+ """ + if n <= compile_after: # likely still going to be faster than building a factory + return [deepcopy(obj) for _ in range(n)] + factory = build_reconstructor(obj) + # calling non-compiled function in a loop here turns out to be slower than + # calling it in python + return _replicate(factory, n) diff --git a/duper/debug.py b/duper/debug.py new file mode 100644 index 0000000..8800dd3 --- /dev/null +++ b/duper/debug.py @@ -0,0 +1,249 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +import copy +from collections.abc import Callable +from math import ceil +from typing import Any +from typing import NoReturn +from typing import TypeVar + +T = TypeVar("T") + + +def reraise() -> NoReturn: + raise # noqa: PLE0704 + + +def run_in_xpython(func: Callable[[], T]) -> T: + try: + from xpython.byteop.byteop24 import ( # type: ignore[import-untyped, import-not-found, unused-ignore] + ByteOp24, + ) + from xpython.vm import PyVM # type: ignore[import-untyped, import-not-found, unused-ignore] + except ImportError: + raise ImportError(f"To use {run_in_xpython.__name__}, install 'duper[debug]'") + + ByteOp24.traceback_from_frame = staticmethod(reraise) + return PyVM().run_code(func.__code__, func.__globals__.copy(), {}, toplevel=False) # type: ignore[no-any-return] + + +def find_minimal_example( + data: T, + check: Callable[[T], Any], + raises: type[BaseException] | tuple[type[BaseException]], +) -> T: + """ + Reduce `data` to a minimal example that still triggers `check` to raise. + - Delta-debug chunk removals for speed. + - Smartly detect container-length-only bugs via exponential + linear search, + for both lists (number of items) and dicts (number of keys). + - Print both successful (βœ”) and unsuccessful (✘) removal attempts. 
+ """ + current = copy.deepcopy(data) + + def try_check(d: T) -> bool: + try: + check(d) + except raises: + return True + else: + return False + + if not try_check(current): + raise RuntimeError("Original data must raise.") + + def format_path(path: list[Any]) -> str: + return "root" if not path else "".join(f"[{p!r}]" for p in path) + + def get_at(path: list[Any]) -> Any: + ref = current + for p in path: + ref = ref[p] # type: ignore[index] + return ref + + def set_at(path: list[Any], subtree: Any) -> None: + nonlocal current + if not path: + current = subtree + else: + ref = current + for p in path[:-1]: + ref = ref[p] # type: ignore[index] + ref[path[-1]] = subtree # type: ignore[index] + + def try_replace(path: list[Any], subtree: Any) -> bool: + trial = copy.deepcopy(current) + if not path: + trial = subtree + else: + ref = trial + for p in path[:-1]: + ref = ref[p] # type: ignore[index] + ref[path[-1]] = subtree # type: ignore[index] + return try_check(trial) + + def ddmin_list(path: list[Any]) -> None: + lst = get_at(path) + pth = format_path(path) + length = len(lst) + + # 1) Try removing everything + if length: + if try_replace(path, []): + print(f"βœ” Removed ALL items at {pth}") + set_at(path, []) + return + print(f"✘ Cannot remove ALL items at {pth} (error goes away)") + + # 2) Detect length-only bug via exponential search + if length: + placeholder = [None] * length + if try_replace(path, placeholder): + print(f"β„Ή Detected length-only condition at {pth}, original length={length}") + # exponential find upper bound + step = 1 + while step < length and not try_replace(path, [None] * step): + step *= 2 + step = min(step, length) + lo = step // 2 + hi = step + threshold = None + for n in range(lo + 1, hi + 1): + if try_replace(path, [None] * n): + threshold = n + break + if threshold is not None: + print(f"βœ” Minimal failing length at {pth} is {threshold}") + set_at(path, [None] * threshold) + return + else: + print(f"✘ Content matters at {pth}, not just 
length={length}") + + # 3) Bulk chunk removals + n = 2 + length = len(get_at(path)) + while n <= length: + size = ceil(length / n) + removed_any = False + for start in range(0, length, size): + end = min(start + size, length) + chunk = list(range(start, end)) + trial = [v for i, v in enumerate(lst) if i < start or i >= end] + if try_replace(path, trial): + print(f"βœ” Removed chunk indices {chunk} at {pth}") + set_at(path, trial) + lst = get_at(path) + length = len(lst) + n = 2 + removed_any = True + break + print(f"✘ Cannot remove chunk {chunk} at {pth}") + if not removed_any: + n *= 2 + + # 4) Single-item removals + lst = get_at(path) + i = 0 + while i < len(lst): + trial = lst[:i] + lst[i + 1 :] + if try_replace(path, trial): + print(f"βœ” Removed index {i} at {pth}, element={repr(lst[i])[:50]}") + set_at(path, trial) + lst = get_at(path) + else: + print(f"✘ Keeping index {i} at {pth}, element={repr(lst[i])[:50]}") + i += 1 + + # 5) Recurse into elements + for j in range(len(get_at(path))): + recurse([*path, j]) + + def ddmin_dict(path: list[Any]) -> None: + d = get_at(path) + pth = format_path(path) + + # 1) Remove all keys + if d: + if try_replace(path, {}): + print(f"βœ” Removed ALL keys at {pth}") + set_at(path, {}) + return + print(f"✘ Cannot remove ALL keys at {pth} (error goes away)") + + # 2) Detect length-only bug for dict keys + length = len(d) + if length: + placeholder = {f"__dummy{i}": None for i in range(length)} + if try_replace(path, placeholder): + print(f"β„Ή Detected key-count–only condition at {pth}, original keys={length}") + # exponential find upper bound + step = 1 + while step < length and not try_replace( + path, {f"__dummy{i}": None for i in range(step)} + ): + step *= 2 + step = min(step, length) + lo = step // 2 + hi = step + threshold = None + for n in range(lo + 1, hi + 1): + trial = {f"__dummy{i}": None for i in range(n)} + if try_replace(path, trial): + threshold = n + break + if threshold is not None: + print(f"βœ” Minimal 
failing key-count at {pth} is {threshold}") + set_at(path, {f"__dummy{i}": None for i in range(threshold)}) + return + else: + print(f"✘ Content matters at {pth}, not just key-count={length}") + + # 3) Bulk key-chunk removals + keys = list(d.keys()) + n = 2 + while n <= len(keys): + size = ceil(len(keys) / n) + removed_any = False + for start in range(0, len(keys), size): + chunk = keys[start : start + size] + trial = {k: v for k, v in d.items() if k not in chunk} + if try_replace(path, trial): + print(f"βœ” Removed chunk keys {chunk} at {pth}") + set_at(path, trial) + d = get_at(path) + keys = list(d.keys()) + n = 2 + removed_any = True + break + print(f"✘ Cannot remove chunk keys {chunk} at {pth}") + if not removed_any: + n *= 2 + + # 4) Single-key removals + for key in list(get_at(path).keys()): + trial = copy.deepcopy(get_at(path)) + removed: Any = trial.pop(key) + if try_replace(path, trial): + print(f"βœ” Removed key {key!r} at {pth}, value={repr(removed)[:50]}") + set_at(path, trial) + else: + print(f"✘ Keeping key {key!r} at {pth}") + + # 5) Recurse into values + for key in list(get_at(path).keys()): + recurse([*path, key]) + + def recurse(path: list[Any] | None = None) -> None: + if path is None: + path = [] + subtree = get_at(path) + if isinstance(subtree, dict): + ddmin_dict(path) + elif isinstance(subtree, list): + ddmin_list(path) + # primitives: nothing to do + + recurse([]) + return current diff --git a/duper/factories/__init__.py b/duper/factories/__init__.py deleted file mode 100644 index de9db87..0000000 --- a/duper/factories/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Bobronium -# -# SPDX-License-Identifier: MPL-2.0 diff --git a/duper/factories/ast.py b/duper/factories/ast.py deleted file mode 100644 index a257f5a..0000000 --- a/duper/factories/ast.py +++ /dev/null @@ -1,372 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Bobronium -# -# SPDX-License-Identifier: MPL-2.0 - -""" -Construct AST that creates a deep copy of a 
given object -""" -from __future__ import annotations - -import ast -import linecache -import types -from collections.abc import Callable -from collections.abc import Iterable -from threading import Lock -from types import FunctionType -from typing import Any -from typing import Final -from typing import TypeVar -from typing import cast - -import duper -from duper.constants import IMMUTABLE_NON_COLLECTIONS -from duper.constants import IMMUTABLE_TYPES -from duper.constants import ImmutableType -from duper.factories.runtime import debunk_reduce -from duper.factories.runtime import get_reduce -from duper.factories.runtime import reconstruct_state -from duper.fastast import Call -from duper.fastast import Constant -from duper.fastast import Dict -from duper.fastast import FunctionDef -from duper.fastast import List -from duper.fastast import Load -from duper.fastast import Module -from duper.fastast import Name -from duper.fastast import NamedExpr -from duper.fastast import Return -from duper.fastast import Set -from duper.fastast import Store -from duper.fastast import Tuple -from duper.fastast import expr -from duper.fastast import keyword -from duper.fastast import stmt - - -T = TypeVar("T") -E = TypeVar("E", bound=expr) - -# default locations that ast sets in ast.fix_missing_locations() -# we can save a lot of time by setting them ourselves - -LOAD: Final = Load() -STORE: Final = Store() -LOC: Final = dict(lineno=1, col_offset=0, end_lineno=1, end_col_offset=0) -Undefined: Final = NamedExpr(Name("UNDEFINED"), Constant(1)) -CONSTANT_AST_TYPES: Final = frozenset({Name, Constant}) - - -def __loader__() -> None: - """Special method to tell inspect that this file has special logic for loading the code""" - - -class Namespace: - def __init__(self) -> None: - self.forbid_references: dict[int, Any] = {} - self.names: dict[str, Any] = {} - self.used_names: set[str] = set() - self.vid_to_name: dict[int, str] = {} - self.reconstructed: dict[int, expr] = {} - - def 
check_references(self, value: Any) -> Name | None: - if (vid := id(value)) in self.reconstructed: - # This is the hackiest hack, and it shouldn't be done like this - # but this allows to make things simpler in other places - # which is a good trade for now. - # In later versions this will be resolved in a more general way. - expression = self.reconstructed[vid] - name = self.get_name(value) - if isinstance(expression, NamedExpr): - return Name(name) - new_expression = NamedExpr(target=Name(name, ctx=STORE), value=duper.dupe(expression)) - expression.__dict__.clear() - expression.__dict__.update(new_expression.__class__.__dict__) - expression.__dict__.update(new_expression.__dict__) - expression.__class__ = ast.NamedExpr - return Name(name) - - if (vid := id(value)) in self.forbid_references and vid not in self.vid_to_name: - # If we end up here, it must mean type has been referenced again before we - # finished reconstructing an AST statement for it. - # this can be resolved, but I don't want to overcomplicate the logic to include - # all such interactions before releasing the PoC. 
- # There are some special cases that duper handles already, like reconstruction from - # reduce, which may require reconstructed instance value to be present - # to reconstruct its state, but a more general approach is needed to support them all - raise NotImplementedError( - f"Already seen {type(value)=}, {id(value)=} self-reflexive types are not supported yet" - ) - self.forbid_references[vid] = value - return None - - def unlock_references(self, value: Any, expression: T) -> T: - self.forbid_references.pop(vid := id(value), None) - self.reconstructed[vid] = cast(expr, expression) - return expression - - def store(self, x: T) -> Name: - """ - Stores object as is to be available in namespace - """ - name = self.get_name(x) - self.names[name] = x - return Name(id=name) - - def get_name(self, value: Any) -> str: - """ - Assigns names and resolves collisions - Once name is set, it's remembered in vid_to_name map - - Previously, it resolved collisions in advance by f'{type(value).__name__}{id(value)}' - But this was quite verbose and not pleasant to read/debug - """ - - # already assigned a name previously - if (vid := id(value)) in self.vid_to_name: - return self.vid_to_name[vid] - if (name := getattr(value, "__qualname__", None)) is None: - name = type(value).__name__.lower() - - i = 1 - while name in self.used_names: - name = f"{name}{i}" - i += 1 - - # remember assigned names for future lookup - self.vid_to_name[vid] = name - self.used_names.add(name) - return name - - -def reconstruct_from_reduce( - x: T, - namespace: Namespace, - func: Callable[..., T], - args: Any, - kwargs: Any, - state: Any = None, - listiter: Iterable[Any] | None = None, - dictiter: Iterable[tuple[Any, Any]] | None = None, -) -> Call: - if state is None and listiter is None and dictiter is None: - return Call( - func=namespace.store(func), - args=[reconstruct_expression(item, namespace) for item in args], - keywords=[ - keyword( - arg=name, - value=reconstruct_expression(item, namespace), - 
) - for name, item in kwargs.items() - ], - ) - return Call( - func=namespace.store(reconstruct_state), - args=[ - # newly created instance will be referenced during reconstruction - namespace.unlock_references( - x, - NamedExpr( - target=Name(id=namespace.get_name(x), ctx=STORE), - value=Call( - func=namespace.store(func), - args=[reconstruct_expression(item, namespace) for item in args], - keywords=[ - keyword( - arg=name, - value=reconstruct_expression(item, namespace), - ) - for name, item in kwargs.items() - ], - ), - ), - ), - reconstruct_expression(state, namespace), - Call( - func=reconstruct_const(iter, namespace), - args=[reconstruct_list(list(listiter) if listiter else [], namespace)], - keywords=[], - ), - Call( - func=reconstruct_const(dict.items, namespace), - args=[reconstruct_dict(dict(dictiter) if dictiter else {}, namespace)], - keywords=[], - ), - ], - keywords=[], - ) - - -def reconstruct_const(x: T, namespace: Namespace) -> Name | Constant[Any]: - return ( - # can't use Constant with types in ast (which makes sense, there's no literals for them) - # it's possible to substitute LOAD_GLOBAL with LOAD_CONST later in the bytecode, - # but it's quite slow (with libs from PyPi), and doesn't give a big performance uplift - # later, so I'm ignoring this for now - namespace.store(x) - if type(x) not in IMMUTABLE_TYPES - or isinstance( - x, - ( - type, - types.MethodType, - types.BuiltinMethodType, - types.FunctionType, - types.MethodDescriptorType, - ), - ) - else Constant(value=x) - ) - - -def reconstruct_list(x: list[Any], namespace: Namespace) -> List: - return List([reconstruct_expression(i, namespace) for i in x]) - - -def reconstruct_set(x: set[Any], namespace: Namespace) -> Set: - return Set([reconstruct_expression(i, namespace) for i in x]) - - -def reconstruct_dict(x: dict[Any, Any], namespace: Namespace) -> Dict | Call | Name: - return Dict( - keys=[reconstruct_expression(i, namespace) for i in x.keys()], - values=[reconstruct_expression(i, 
namespace) for i in x.values()], - ) - - -def reconstruct_tuple( - x: tuple[Any, ...] | frozenset[Any], namespace: Namespace -) -> Tuple | NamedExpr[Any] | Name | Constant[tuple[ImmutableType, ...]]: - immutable = True - values = [ - expression - for i in x - if ( - # this is a tuple of two expressions, always results in True - # constructing list with list comprehension should be faster - # than appending values, but we need to check each element as well, - # so we're doing it with Lennon's (or Paul's?) operator. - (expression := reconstruct_expression(i, namespace)), - ( - immutable := immutable - and ( - isinstance(expression, Constant) - or isinstance(expression, Name) - # object is in the namespace, should be immutable - # types, functions, etc. are ending up in namespace.names - and namespace.names.get(expression.id) is i - ) - ), - ) - ] - if immutable: - return reconstruct_const(x, namespace) - return Tuple(elts=values) - - -def reconstruct_method(x: types.MethodType, namespace: Namespace) -> Call: - return Call( - func=reconstruct_const(type(x), namespace), - args=[ - reconstruct_const(x.__func__, namespace), - reconstruct_expression(x.__self__, namespace), - ], - keywords=[], - ) - - -def reconstruct_expression(x: Any, namespace: Namespace) -> expr: - """ - Based on copy._reconstruct - """ - - cls = type(x) - if cls in IMMUTABLE_NON_COLLECTIONS: - return namespace.unlock_references(x, reconstruct_const(x, namespace)) - - existing = namespace.check_references(x) - if existing is not None: - return existing - - constructor: Callable[[Any, Namespace], expr] | None = optimized_constructors.get(cls) - - if constructor is not None: - return namespace.unlock_references(x, constructor(x, namespace)) - - if (custom_copier := getattr(x, "__deepcopy__", None)) is not None: - return namespace.unlock_references( - x, reconstruct_from_reduce(x, namespace, custom_copier, ({},), {}, None, None, None) - ) - - rv = get_reduce(x, cls) - if isinstance(rv, str): # global name 
- return namespace.unlock_references(x, reconstruct_const(x, namespace)) - rv = debunk_reduce(*rv) - - return namespace.unlock_references(x, reconstruct_from_reduce(x, namespace, *rv)) - - -def ast_factory(x: T) -> Callable[[], T]: - return_value_ast = reconstruct_expression(x, namespace := Namespace()) - return compile_function( - f"produce_{type(x).__name__}", - [Return(value=return_value_ast)], - namespace, - ) - - -optimized_constructors: dict[type[Any], Callable[[Any, Namespace], expr]] = { - dict: reconstruct_dict, - list: reconstruct_list, - set: reconstruct_set, - tuple: reconstruct_tuple, - frozenset: reconstruct_tuple, - types.ModuleType: reconstruct_const, - types.MethodType: reconstruct_method, - **{t: reconstruct_const for t in IMMUTABLE_NON_COLLECTIONS}, -} -FUNCTION: Final = FunctionDef( - name="FN", - body=[], -) -MODULE: Final = Module( - body=[FUNCTION], -) - -with_source: bool = False - - -def compile_function(name: str, body: list[stmt], namespace: Namespace) -> FunctionType: - global MODULE, FUNCTION - with Lock(): - # changing variables on predefined AST is much faster - # than constructing AST from scratch - # locking just in case this is used in different threads - FUNCTION.name = name - FUNCTION.body = body - if with_source: - # this is most useful for debugging - # it visualizes the AST it generated back into python syntax - # it's also slow, so should be disabled, unless utilized - # - # TODO: generate this on demand (when source lines are retrieved) - assert len(body) == 1, "Initial implementation always contained 1 line of code" - assert isinstance(body[0], Return) - assert body[0].value is not None - - return_value = ast.unparse(body[0].value) - source = [name := f"lambda: {return_value}"] - FUNCTION.name = name - file = f"" - linecache.cache[file] = (0, None, source, "") - else: - file = "" - - code = compile(MODULE, file, "exec") # type: ignore[arg-type] - - full_ns = {**globals(), **namespace.names} - exec(code, full_ns) - function: 
FunctionType = full_ns[name] - function.__module__ = __name__ - return function diff --git a/duper/factories/runtime.py b/duper/factories/runtime.py deleted file mode 100644 index 207667c..0000000 --- a/duper/factories/runtime.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Bobronium -# -# SPDX-License-Identifier: MPL-2.0 - -"""Runtime instructions to reproduce an object""" -from __future__ import annotations - -import copyreg -from collections.abc import Callable -from collections.abc import Iterable -from collections.abc import MutableMapping -from collections.abc import MutableSequence -from copy import Error -from copyreg import __newobj__ # type: ignore[attr-defined] -from copyreg import __newobj_ex__ # type: ignore[attr-defined] -from typing import Any -from typing import TypeVar -from typing import Union -from typing import cast - - -T = TypeVar("T") - - -def returns(x: T) -> T: - return x - - -State = Union[Any, tuple[dict[str, Any], dict[str, Any]], None] - - -def reconstruct_state( - new_obj: T, - state: State = None, - listiter: Iterable[Any] | None = None, - dictiter: Iterable[tuple[Any, Any]] | None = None, -) -> T: - if state is not None: - if (setstate := getattr(new_obj, "__setstate__", None)) is not None: - setstate(state) - else: - if isinstance(state, tuple) and len(state) == 2: - dict_state, slot_state = state - if slot_state is not None: - for key, value in slot_state.items(): - setattr(new_obj, key, value) - else: - dict_state = cast(dict[str, Any], state) - - if dict_state is not None: - new_obj.__dict__.update(dict_state) - - if listiter is not None: - supports_append = cast(MutableSequence[Any], new_obj) - for item in listiter: - supports_append.append(item) - if dictiter is not None: - supports_setitem = cast(MutableMapping[Any, Any], new_obj) - for key, value in dictiter: - supports_setitem[key] = value - - return new_obj - - -def reconstruct_copy( - func: Callable[..., T], - args: Any, - kwargs: Any, - state: Any = 
None, - listiter: Iterable[Any] | None = None, - dictiter: Iterable[tuple[Any, Any]] | None = None, - *unsupported: Any, -) -> T: - if unsupported: - raise NotImplementedError(f"Unsupported reduce value length {5 + len(unsupported)}") - reconstruct_state(new_obj := func(*args, **kwargs), state, listiter, dictiter) - return new_obj - - -def get_reduce( - x: Any, cls: type[Any] -) -> ( - str - | tuple[Callable[..., Any], tuple[Any, ...]] - | tuple[Callable[..., Any], tuple[Any, ...], Any] - | tuple[Callable[..., Any], tuple[Any, ...], Any, Any, Any] - | tuple[Callable[..., Any], tuple[Any, ...], Any, Any, Any, Any] - | tuple[Any, ...] # FIXME: enforce proper types and make sure we never fail -): - if custom_reduce := copyreg.dispatch_table.get(cls): - return custom_reduce(x) - elif (__reduce_ex__ := getattr(x, "__reduce_ex__", None)) is not None: - return cast("tuple[Any, ...] | str", __reduce_ex__(4)) - elif __reduce__ := getattr(x, "__reduce__", None): - return cast("tuple[Any, ...] | str", __reduce__()) - else: - raise Error(f"un(deep)copyable object of type {cls}") - - -def debunk_reduce( - func: Callable[..., Any], - args: tuple[Any, ...], - state: Any = None, - listiter: Iterable[Any] | None = None, - dictiter: Iterable[tuple[Any, Any]] | None = None, - *unsupported: Any, -) -> tuple[Any, ...]: - if unsupported: - raise NotImplementedError(f"Unexpected values in reduce value: {unsupported}") - # __newobj__ and __newobj_ex__ are special wrapper functions - # getting rid of them saves us from one extra call on stack - if func is __newobj_ex__: - cls, args, kwargs = args - args = (cls, *args) - func = cls.__new__ - elif func is __newobj__: - func = args[0].__new__ - kwargs = {} - else: - kwargs = {} - return func, args, kwargs, state, listiter, dictiter diff --git a/duper/fastast.py b/duper/fastast.py index 5810110..2d923af 100644 --- a/duper/fastast.py +++ b/duper/fastast.py @@ -1,28 +1,41 @@ -# SPDX-FileCopyrightText: 2023 Bobronium +# SPDX-FileCopyrightText: 
2023-present Arseny Boykov (Bobronium) # # SPDX-License-Identifier: MPL-2.0 """ -These are duck-typed definitions of builtin AST classes. - -I'm experimenting with speed of such alternative, when compiled to native classes. +Using these native classes is faster than using builtin ast.AST subclasses `__class__: type[ast.expr] = ast.expr` and equivalents are used to flavour new classes enough, so they would smell enough like ast.AST for compile() + +Additionally, nodes are aware of their parents and their location, +so we could handle self-reflecting data. + +This is not intended to be a full replacement to AST, instead this strives to its faster subset. + +Unused, but required field are set as final class variables +to reduce amount of work required to do when initializing class. """ + from __future__ import annotations import ast +from typing import TYPE_CHECKING from typing import Any from typing import Final from typing import Generic from typing import TypeVar +from typing import cast -from mypy_extensions import trait +from duper.constants import ImmutableCollectionType +from duper.constants import ImmutableType +if TYPE_CHECKING: + from collections.abc import Sequence T = TypeVar("T") -E = TypeVar("E", bound="expr") +C = TypeVar("C", bound=ImmutableType | ImmutableCollectionType) +A = TypeVar("A", bound="AST") S = TypeVar("S", bound="stmt") @@ -35,17 +48,51 @@ class AST: end_col_offset = 0 end_lineno = 1 - -class expr(AST): - __class__: type[ast.expr] = ast.expr + def __init__(self) -> None: + self.parent: AST | None = None + self.container: Sequence[AST] | AST | None = None + self.location: int | str | None = None + self.name: str | None = None def __setitem__(self, key: str, value: AST) -> None: setattr(self, key, value) + def set_parent( + self, + parent: AST, + container: AST | Sequence[AST], + location: int | str, + *, + store_result: bool = True, + ) -> AST: + self.parent = parent + self.container = container + self.location = location + if store_result 
and self.name is not None: + self.store_result(self.name) + return self + + def store_result(self, name: str) -> NamedExpr[expr]: + if self.parent is None or self.container is None or self.location is None: + raise RuntimeError( + f"All of the following attributes must be set" + f" {self.parent=}, {self.container=}, {self.location=}" + ) + original_parent = self.parent + original_container = self.container + original_location = self.location + new_expr = NamedExpr(Name(id=name, ctx=STORE), value=cast("expr", self)) + original_container[original_location] = new_expr # type: ignore[index] + new_expr.set_parent(original_parent, original_container, original_location) + return new_expr + + +class expr(AST): + __class__: type[ast.expr] = ast.expr + class stmt(AST): __class__: type[ast.stmt] = ast.stmt - pass class mod(AST): @@ -62,135 +109,292 @@ def __init__(self, arg: str) -> None: type_comment = None -class Load(expr): - __class__: type[ast.Load] = ast.Load # type: ignore[assignment] +class comprehension(AST): + __class__: type[ast.comprehension] = ast.comprehension + _fields: Final = ("target", "iter", "ifs", "is_async") + + def __init__(self, target: expr, iter: expr, ifs: list[expr], is_async: int) -> None: + super().__init__() + self.target = target + self.iter = iter + self.ifs = ifs + self.is_async = is_async + target.set_parent(self, self, "target") + iter.set_parent(self, self, "iter") + for i, cond in enumerate(ifs): + cond.set_parent(self, ifs, i) -LOAD: Final = Load() +class expr_context(AST): + __class__: type[ast.expr_context] = ast.expr_context -class Store(expr): - __class__: type[ast.Store] = ast.Store # type: ignore[assignment] +class Load(expr_context): + __class__: type[ast.Load] = ast.Load -class Return(stmt, Generic[E]): + +class Store(expr_context): + __class__: type[ast.Store] = ast.Store + + +class Return(stmt, Generic[A]): __class__: type[ast.Return] = ast.Return - def __init__(self, value: E) -> None: + def __init__(self, value: A) -> None: 
self.value = value -class arguments(expr): - __class__: type[ast.arguments] = ast.arguments # type: ignore[assignment] - """Not used in any way other tnen as required arg ot FunctionDef""" +class _Comprehension(expr): + _fields: Final = ("elt", "generators") + + def __init__(self, elt: expr, generators: list[comprehension]) -> None: + super().__init__() + self.elt = elt + self.generators = generators + + elt.set_parent(self, self, "elt") + for i, gen in enumerate(generators): + gen.set_parent(self, gen, i) + + +class ListComp(_Comprehension, expr): + __class__: type[ast.ListComp] = ast.ListComp - posonlyargs: Final[list[arg]] = [] - args: Final[list[arg]] = [] - vararg: Final = None # real type is arg | None - kwonlyargs: Final[list[arg]] = [] - kw_defaults: Final[list[arg]] = [] - kwarg: Final = None # real type is arg | None - defaults: Final[list[arg]] = [] +class GeneratorExp(_Comprehension, expr): + __class__: type[ast.GeneratorExp] = ast.GeneratorExp -class keyword(stmt, Generic[E]): - __class__: type[ast.keyword] = ast.keyword # type: ignore[assignment] - def __init__(self, arg: str, value: E) -> None: +class arguments(AST): + __class__: type[ast.arguments] = ast.arguments + """Not used in any way other than as required arg to FunctionDef""" + + def __init__( + self, + posonlyargs: list[arg], + args: list[arg], + vararg: arg | None, + kwonlyargs: list[arg], + kw_defaults: list[arg], + kwarg: arg | None, + defaults: list[arg], + ) -> None: + self.posonlyargs = posonlyargs + self.args = args + self.vararg = vararg + self.kwonlyargs = kwonlyargs + self.kw_defaults = kw_defaults + self.kwarg = kwarg + self.defaults = defaults + + +class keyword(AST, Generic[A]): + __class__: type[ast.keyword] = ast.keyword + + def __init__(self, arg: str, value: A) -> None: self.arg = arg self.value = value + value.set_parent(self, self, "value") + + +LOAD: Final = Load() +STORE: Final = Store() + + +class _EltsChildren(expr): + _fields: Final = ("elts",) + + def __init__(self, 
elts: list[expr], ctx: Load | Store = LOAD) -> None: + super().__init__() + self.elts = elts + self.ctx = ctx + for i, el in enumerate(self.elts): + el.set_parent(cast("AST", self), elts, i) + class Name(expr): __class__: type[ast.Name] = ast.Name def __init__(self, id: str, ctx: Load | Store = LOAD) -> None: + self.name = None self.id = id self.ctx = ctx +class Attribute(expr): + __class__: type[ast.Attribute] = ast.Attribute + _fields = ("value", "attr", "ctx") + + def __init__(self, value: expr, attr: str, ctx: Load | Store = LOAD) -> None: + super().__init__() + self.value = value + self.attr = attr + self.ctx = ctx + + class Constant(expr, Generic[T]): __class__: type[ast.Constant] = ast.Constant kind: Final = None def __init__(self, value: T) -> None: + self.name = None self.value = value -class NamedExpr(expr, Generic[E]): +class NamedExpr(expr, Generic[A]): __class__: type[ast.NamedExpr] = ast.NamedExpr + _fields: Final = ("target", "value") - def __init__(self, target: Name, value: E) -> None: + def __init__(self, target: Name, value: A) -> None: + super().__init__() self.target = target self.value = value + target.set_parent(self, self, "target") + value.set_parent(self, self, "value", store_result=False) -class Expression(expr, Generic[E]): - __class__: type[ast.Expression] = ast.Expression # type: ignore[assignment] - def __init__(self, body: E) -> None: - self.body = body +class Expression(mod, Generic[A]): + __class__: type[ast.Expression] = ast.Expression + _fields: Final = ("body",) - -@trait -class _Elts: - _fields: Final = ("elts",) - - def __init__(self, elts: list[expr]) -> None: - self.elts = elts - self.ctx = LOAD + def __init__(self, body: A) -> None: + super().__init__() + self.body = body + body.set_parent(self, self, "body") -class List(expr, _Elts): - __class__: type[ast.List] = ast.List # type: ignore[assignment] +class List(_EltsChildren, expr): + __class__: type[ast.List] = ast.List -class Tuple(expr, _Elts): - __class__: 
type[ast.Tuple] = ast.Tuple # type: ignore[assignment] +class Tuple(_EltsChildren, expr): + __class__: type[ast.Tuple] = ast.Tuple -class Set(expr, _Elts): - __class__: type[ast.Set] = ast.Set # type: ignore[assignment] +class Set(_EltsChildren, expr): + __class__: type[ast.Set] = ast.Set class Dict(expr): __class__: type[ast.Dict] = ast.Dict + _fields = ("keys", "values") def __init__(self, keys: list[expr], values: list[expr]) -> None: super().__init__() self.keys = keys self.values = values + for i, el in enumerate(keys): + el.set_parent(self, keys, i) + + for i, el in enumerate(values): + el.set_parent(self, values, i) + class Call(expr): __class__: type[ast.Call] = ast.Call + _fields: Final = ("func", "args", "keywords") - def __init__( - self, func: Name | Constant[Any], args: list[expr], keywords: list[keyword[Any]] - ) -> None: + def __init__(self, func: expr, args: list[AST], keywords: list[keyword[Any]]) -> None: + super().__init__() self.func = func - self.args: list[expr] = args + self.args: list[AST] = args self.keywords = keywords + # func.set_parent(self, "func") do we need it? 
+ for i, el in enumerate(args): + el.set_parent(self, args, i) + + +class Subscript(expr): + """Subscript(expr value, expr slice, expr_context ctx)""" + + __class__: type[ast.Subscript] = ast.Subscript + _fields: Final = ("value", "slice", "ctx") + + def __init__(self, value: expr, slice: Constant[Any], ctx: Load | Store = LOAD) -> None: + super().__init__() + self.value = value + self.slice = slice + self.ctx = ctx + + +class alias(AST): + __class__: type[ast.alias] = ast.alias + + def __init__(self, name: str, asname: str | None = None) -> None: + self.name = name + self.asname = asname + + +class ImportFrom(stmt): + __class__: type[ast.ImportFrom] = ast.ImportFrom + + def __init__(self, module: str, names: list[alias], level: int | None = None) -> None: + self.name = None + self.module = module + self.names = names + self.level = level + + +class Import(stmt): + __class__: type[ast.Import] = ast.Import + + def __init__(self, names: list[alias]) -> None: + self.name = None + self.names = names + + +class Assign(stmt): + __class__: type[ast.Assign] = ast.Assign + _fields = ( + "targets", + "value", + "type_comment", + ) + type_comment = None + + def __init__(self, targets: list[Name], value: expr) -> None: + self.targets = targets + self.value = value + + +EMPTY_LIST: Final[list[Any]] = [] # not indented to be modified + class FunctionDef(stmt): __class__: type[ast.FunctionDef] = ast.FunctionDef """Just a blank value""" + _fields: Final = ("name", "args", "body", "decorator_list", "returns", "type_comment") - args: Final = arguments() - decorator_list: Final[list[expr]] = [] + decorator_list: Final[list[expr]] = EMPTY_LIST returns: Constant[str] = Constant("Any") type_comment: Final = None - def __init__(self, body: list[stmt], name: str) -> None: + def __init__(self, body: list[stmt], name: str, args: arguments | None = None) -> None: + super().__init__() self.body = body self.name = name + self.args = args or arguments( + posonlyargs=EMPTY_LIST, + 
args=EMPTY_LIST, + vararg=None, + kwonlyargs=EMPTY_LIST, + kw_defaults=EMPTY_LIST, + kwarg=None, + defaults=EMPTY_LIST, + ) class Module(mod): __class__: type[ast.Module] = ast.Module + _fields: Final = ("body",) - type_ignores: Final[list[int]] = [] + type_ignores: Final[list[int]] = EMPTY_LIST def __init__(self, body: list[S]) -> None: + super().__init__() self.body = body diff --git a/pyproject.toml b/pyproject.toml index 50b8591..a0c27e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,43 +6,23 @@ build-backend = "hatchling.build" name = "duper" description = '' readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.10" keywords = [] authors = [ - { name = "Bobronium", email = "appkiller16@gmail.com" }, + { name = "Bobronium", email = "appkiller16@gmail.com" }, ] +dynamic = ["version"] classifiers = [ - "Development Status :: 4 - Beta", - "Programming Language :: Python", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -dependencies = [ - "typing_extensions; python_version < '3.11'", - "mypy-extensions", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] -dynamic = ["version"] - - -[tool.hatch.dirs.env] -virtual = ".hatch" -#path = "./.venv_{env_name}" - - -[project.optional-dependencies] -dev = ["astpretty", "bytecode"] -benchmark = ["pyinstrument", "ipython", "dill", "orjson"] 
-typing = ["mypy"] -profiling = ["pyinstrument"] -debugging = ["ipython"] -style = ["ruff", "black", "isort", "pyupgrade"] -testing = ["pytest", "pytest-cov"] - [project.urls] Documentation = "https://github.com/Bobronium/duper#readme" Issues = "https://github.com/Bobronium/duper/issues" @@ -51,96 +31,166 @@ Source = "https://github.com/Bobronium/duper" [tool.hatch.version] path = "duper/__about__.py" - -[tool.hatch.envs.default] -path = ".venv" -features = ["benchmark", "lint", "test", "build"] - - -[tool.hatch.envs.311] -path = ".venv_311" -python = "3.11" -features = ["benchmark", "lint", "test", "build"] - -dependencies = [ - "pytest", - "pytest-cov", +dependencies = [] +[project.optional-dependencies] +dev = [ + "duper[style,types,test]", + "prek>=0.0.27", + "reuse>=5.0.2", + "setuptools>=80.9.0", ] - -[tool.hatch.envs.test] -features = ["testing"] - - -[tool.hatch.envs.test.scripts] -version = "python --version" -cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=duper --cov=tests {args}" -no-cov = "cov --no-cov {args}" - - -[tool.hatch.envs.style] -features = ["style"] - -[tool.hatch.envs.types] -features = ["typing"] - -[tool.hatch.envs.types.scripts] -check = [ - "mypy {env:PACKAGE}", +benchmark = [ + "pyinstrument", + "ipython", + "dill", + "orjson", + "cloudpickle>=3.1.1", ] - -[tool.hatch.envs.default.env-vars] -PACKAGE = "{root}/duper" -TESTS = "{root}/tests" -SOURCES = "{root}/tests {root}/duper" - - -[tool.hatch.envs.style.scripts] -# hatch run is pretty slow and it doesn't feel good to use it -# TODO: fix this or find a snapier alternative -check = [ - "ruff {env:SOURCES}", - "black --check {args} {env:SOURCES}", - "isort --check-only {args} {env:SOURCES}", +types = [ + "mypy>=1.17.1", +] +style = [ + "ruff>=0.12.9", ] -# Relatively safe operation that shouldn't break anything. 
-fmt = [ - "isort {env:SOURCES}", - "black {env:SOURCES}", +test = [ + "x-python @ git+https://github.com/rocky/x-python@b562021dfd102552d1ee404756ee8dd40903f83a ; python_version == '3.12'", + "pytest>=8.4.1", + "pytest-cov>=6.2.1", + "pytest-codspeed>=4.0.0", + "duper[data]" ] -# fmt + auto fixes. Potentially can lead to unwanted changes. -fix = [ - "pyupgrade --py310-plus --exit-zero-even-if-changed {env:PACKAGE}/**/*.py {env:PACKAGE}/*.py", - "ruff {env:SOURCES} --fix", - "fmt", +debug = [ + "x-python @ git+https://github.com/rocky/x-python@b562021dfd102552d1ee404756ee8dd40903f83a ; python_version == '3.12'", + "ipython", ] +data = [ + "attrs>=25.3.0", + "msgspec>=0.18.6", + "networkx>=3.1", + "numpy>=1.24.4", + "pandas>=2.0.3", + "pillow>=10.4.0", + "pydantic>=2.10.6", + "sympy>=1.13.3", + "torch>=2.4.1", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel.hooks.mypyc] +dependencies = ["hatch-mypyc", "setuptools"] +include = ["duper"] +exclude = ["__init__.py"] +enable-by-default = false # set HATCH_BUILD_HOOK_ENABLE_MYPYC=1 to compile with mypyc + +[tool.hatch.build.targets.wheel.hooks.mypyc.options] +opt_level = "1" +debug_level = "3" + +[tool.cibuildwheel.macos] +archs = ["x86_64", "universal2", "arm64"] + +[tool.cibuildwheel.linux] +archs = ["auto", "aarch64"] +before-test = """ +set -e + +# run block only on CPython 3.14 +if python - <<'PY' +import sys +raise SystemExit(0 if (sys.implementation.name=='cpython' and sys.version_info[:2]==(3,14)) else 1) +PY +then + echo "Preparing to build pytest-codspeed from source on cp314" + + # (libffi is optional; keep only if your dep needs headers) + ( command -v yum >/dev/null 2>&1 && yum install -y libffi-devel ) \ + || ( command -v microdnf >/dev/null 2>&1 && microdnf install -y libffi-devel ) \ + || ( command -v dnf >/dev/null 2>&1 && dnf install -y libffi-devel ) +fi +""" + +[tool.cibuildwheel] +archs = ["auto64", "auto32"] +build-frontend = "build[uv]" +environment = { 
HATCH_BUILD_HOOK_ENABLE_MYPYC = "1" } +test-command = "pytest -q" +test-extras = ["test"] +test-sources = ["tests", "pyproject.toml"] + [tool.coverage.run] branch = true parallel = true omit = [ - "duper/__about__.py", + "duper/__about__.py", ] [tool.coverage.report] exclude_lines = [ - "no cov", - "if __name__ == .__main__.:", - "if TYPE_CHECKING:", + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", ] -[tool.isort] -force_single_line = true -lines_after_imports = 2 -line_length = 100 -profile = "black" - -[tool.black] -line_length = 100 [tool.ruff] line-length = 100 -ignore = ["E501"] +target-version = "py310" + + +[tool.ruff.lint] + +select = ["ALL"] +extend-safe-fixes = ["ALL"] +unfixable = ["T201"] + +ignore = [ + "ANN401", # | allow Any + "RUF001", # | allow cyrillic letters in comments + "COM812", # | trailing comas are handled by ruff format + "ISC001", # | recommended to be disabled when using ruff format + "B904", # | raise from: modern pythons preserve previous exceptions + "EM101", # | forbids using literal strings in exceptions + "EM102", # | + "TRY003", # | + "A003", # | Class attribute `id` is shadowing a Python builtin β€” it's ok in class body + "FIX001", # | Forbids using TODOs + "FIX002", # | + "TD001", # | + "TD002", # | + "TD003", # | + "PLC0415" # | Top level imports +] +extend-ignore = [ + "D", # docstrings + "PLR", # allowing complexity (lazy to meaningfully refactor + reducing function calling overhead) + "C" +] + + +[tool.ruff.lint.per-file-ignores] +"*.pyi" = ["ALL"] +"tools/*" = ["TID252"] +"__init__.py" = ["F401", "I001"] +"duper/fastast.py" = ["N801", "A002"] +"duper/factories/bytecode.py" = ["ALL"] +"duper/debug.py" = ["T201"] +"tests/*" = ["S101", "ANN", "ERA", "SLOT", "SLF", "PLW", "PT0", "N", "FIX", "TD", "B024", "PLE", "PYI", "PLC", "BLE", "S", "PERF"] +"data/*" = ["ALL"] + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = ["mcs"] + +[tool.ruff.lint.isort] +force-single-line = true +sections = { 
"duper_compile_support" = ["duper._compile_support"] } +section-order = ["future", "duper_compile_support", "standard-library", "third-party", "first-party", "local-folder"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" [tool.mypy] # are these possible to comply with? @@ -183,7 +233,8 @@ local_partial_types = false implicit_reexport = false strict_equality = true strict = true -no_silence_site_packages = true +# enable once this is resolved: https://github.com/python/mypy/issues/14796 +# no_silence_site_packages = true # Configuring error messages show_error_context = false @@ -191,3 +242,20 @@ show_column_numbers = false show_error_codes = true color_output = true error_summary = true + +[tool.uv.sources] +x-python = { git = "https://github.com/rocky/x-python" } +duper = { workspace = true } + +[tool.codeflash] +# All paths are relative to this pyproject.toml's directory. +module-root = "duper" +tests-root = "tests" +test-framework = "pytest" +ignore-paths = [] +disable-telemetry = true +formatter-cmds = ["ruff check --exit-zero --fix $file", "ruff format $file"] + + +[tool.pytest.ini_options] +addopts = "-k 'ast or duper.deepcopy or test_copy or test_deepcopy or test_reprx or (bytecode and not reflexive)'" diff --git a/showcase.py b/showcase.py new file mode 100644 index 0000000..9bd8c09 --- /dev/null +++ b/showcase.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +import copy +from decimal import Decimal + +import duper +import duper.builders.ast +from timesup import timesup + + +@timesup(number=1000, profile=not duper.COMPILED, repeats=1) +def reconstruction() -> None: + x = {"a": 1, "b": [(1, 2, 3), (4, 5, 6)], "c": [object(), Decimal("3.14"), {()}]} # i + + copy.deepcopy(x) # t deepcopy + duper.deepcopy(x) # t duper_deepcopy deepcopy + + [copy.deepcopy(x) for _ in range(10)] # t deepcopy_10 + [copy.deepcopy(x) for _ in range(100)] # t deepcopy_100 + 
[copy.deepcopy(x) for _ in range(1000)] # t deepcopy_1000 + + duper.replicate(x, 10) # t replicate_10 deepcopy_10 + duper.replicate(x, 100) # t replicate_100 deepcopy_100 + duper.replicate(x, 1000) # t replicate_1000 deepcopy_1000 + + replicate_x = duper.reconstructs(x) # # t build_reconstructor deepcopy + replicate_x() # t duper_deepcopies deepcopy diff --git a/tests/__init__.py b/tests/__init__.py index faa6711..d2466dd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Bobronium +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) # -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MPL-2.0 diff --git a/tests/comparer.py b/tests/comparer.py new file mode 100644 index 0000000..a55da16 --- /dev/null +++ b/tests/comparer.py @@ -0,0 +1,579 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +import contextlib +import datetime as dt +import decimal +import enum +import types +import uuid +from dataclasses import dataclass +from typing import Any + +# =============================== Public API ================================== + + +def assert_copies_equivalent(original: Any, deepcopied: Any, reconstructed: Any) -> None: + """ + Compare (original, deepcopied) with (original, reconstructed) through a stable + metadata projection that checks: + - structural shape and types + - atomic equality vs the original + - identity preservation for atomics where it matters + - shared-reference (alias) topology with consistent leftβ†’right mapping + - object instance state (both __dict__ and __slots__) + + On mismatch, raises AssertionError showing ALL differences with clear paths. 
+ """ + expected = _build_metadata(original, deepcopied, alias_map=_AliasAssigner()) + actual = _build_metadata(original, reconstructed, alias_map=_AliasAssigner()) + differences = compare_metadata(expected, actual) + if differences: + raise AssertionError(_format_differences(differences)) + + +# ============================== Data Structures ============================== + + +class TriState(str): + """'T' | 'F' | 'EXC:' for atomic equality vs original.""" + + +@dataclass(frozen=True) +class AtomInfo: + type_name: str + same_identity_as_original: bool + equality_vs_original: TriState # T/F/EXC:... + alias_group: int # shared-reference group id (per right-hand graph) + + +@dataclass(frozen=True) +class Difference: + path: str + kind: str + details: str + + +class _AliasMap: + """ + Maintains a leftβ†’right mapping of alias group ids to ensure shared-reference + topology is preserved across the two compared graphs. + """ + + def __init__(self) -> None: + self._map: dict[int, int] = {} + + def unify(self, left: int, right: int) -> list[Difference]: + diffs: list[Difference] = [] + mapped = self._map.get(left) + if mapped is None: + self._map[left] = right + elif mapped != right: + diffs.append( + Difference( + path="", + kind="alias_inconsistent", + details=f"left alias {left} previously mapped to {mapped}, now {right}", + ) + ) + return diffs + + +# ============================== Safe Predicates ============================== + + +def _is_atomic(value: Any) -> bool: + """Values we treat as atomic (we may call == on them, but never on containers).""" + ATOMS = ( + type(None), + bool, + int, + float, + complex, + str, + bytes, + bytearray, + range, + types.CodeType, + types.BuiltinFunctionType, + types.FunctionType, + type, + property, + slice, + staticmethod, + classmethod, + types.ModuleType, + decimal.Decimal, + uuid.UUID, + dt.datetime, + dt.date, + dt.time, + dt.timedelta, + enum.Enum, + ) + if isinstance(value, ATOMS): + return True + # typing objects (including 
parametrized PEP 585 aliases in 3.9+) + mod = getattr(value, "__module__", "") + return mod in {"typing", "types"} + + +def _tri_state_equality(a: Any, b: Any) -> TriState: + try: + return TriState("T" if a == b else "F") + except Exception as exc: + return TriState(f"EXC:{type(exc).__name__}") + + +# ============================== Metadata Builder ============================= + + +class _AliasAssigner: + """Assigns deterministic small integers per *right-hand* object identity.""" + + def __init__(self) -> None: + self._seen: dict[int, int] = {} + self._next = 1 + + def group_id(self, obj: Any) -> int: + oid = id(obj) + if oid in self._seen: + return self._seen[oid] + gid = self._next + self._seen[oid] = gid + self._next += 1 + return gid + + +def _iter_all_slots(cls: type[Any]) -> list[str]: + names: list[str] = [] + for base in cls.__mro__: + slots = getattr(base, "__slots__", ()) + if isinstance(slots, str): + slots = (slots,) + for s in slots or (): + if isinstance(s, str): + names.append(s) + return names + + +def _public_instance_state(obj: Any) -> tuple[dict[str, Any], dict[str, Any]]: + """Return (__dict__ snapshot, __slots__ snapshot) without invoking custom logic.""" + out_dict: dict[str, Any] = {} + out_slots: dict[str, Any] = {} + if hasattr(obj, "__dict__") and isinstance(obj.__dict__, dict): + for k in sorted(obj.__dict__.keys(), key=str): + out_dict[k] = obj.__dict__[k] + for name in _iter_all_slots(type(obj)): + if hasattr(obj, name): + with contextlib.suppress(Exception): + out_slots[name] = getattr(obj, name) + return out_dict, out_slots + + +def _pair_dict_value(copy_dict: dict[Any, Any], original_key: Any) -> tuple[Any, Any] | None: + """Find a key in copy_dict that equals original_key, even if hashing differs.""" + try: + if original_key in copy_dict: + return original_key, copy_dict[original_key] + except Exception: + pass + for candidate_key in copy_dict: + try: + if candidate_key == original_key: + return candidate_key, 
copy_dict[candidate_key] + except Exception: + continue + return None + + +def _pair_set_element( + copy_set: set[Any] | frozenset[Any], original_element: Any, used_right_ids: set[int] +) -> Any | None: + """Greedy matching by equality, avoiding reusing the same right-hand element.""" + for candidate in copy_set: + cid = id(candidate) + if cid in used_right_ids: + continue + try: + if candidate == original_element: + used_right_ids.add(cid) + return candidate + except Exception: + continue + return None + + +def _build_metadata( + original: Any, + right: Any, + *, + alias_map: _AliasAssigner, + seen_pairs: set[tuple[int, int]] | None = None, +) -> Any: + """ + Build a pure-builtin metadata tree for (original, right) without calling __eq__ + on containers or instances. Only atomic leaves use equality. + Shapes/types/aliasing are preserved to support robust diffs. + """ + if seen_pairs is None: + seen_pairs = set() + + oid, rid = id(original), id(right) + if (oid, rid) in seen_pairs: + return ("BACKREF", alias_map.group_id(right)) + seen_pairs.add((oid, rid)) + + if _is_atomic(original): + return ( + "ATOM", + AtomInfo( + type_name=type(original).__name__, + same_identity_as_original=(right is original), + equality_vs_original=_tri_state_equality(right, original), + alias_group=alias_map.group_id(right), + ), + ) + + def hdr(type_name: str) -> AtomInfo: + return AtomInfo( + type_name=type_name, + same_identity_as_original=(right is original), + equality_vs_original=TriState("-"), # not computed for non-atomics + alias_group=alias_map.group_id(right), + ) + + # Sequence-like + if isinstance(original, list): + if not isinstance(right, list): + return ("TYPE_MISMATCH", "list", type(right).__name__) + return ( + "LIST", + hdr("list"), + [ + _build_metadata(ov, right[i], alias_map=alias_map, seen_pairs=seen_pairs) + for i, ov in enumerate(original) + ], + ) + + if isinstance(original, tuple): + if not isinstance(right, tuple): + return ("TYPE_MISMATCH", "tuple", 
type(right).__name__) + return ( + "TUPLE", + hdr("tuple"), + [ + _build_metadata(ov, right[i], alias_map=alias_map, seen_pairs=seen_pairs) + for i, ov in enumerate(original) + ], + ) + + # Mapping + if isinstance(original, dict): + if not isinstance(right, dict): + return ("TYPE_MISMATCH", "dict", type(right).__name__) + items = [] + for ok in original: + match = _pair_dict_value(right, ok) + if match is None: + items.append(("MISSING_KEY", repr(ok))) + else: + rk, rv = match + km = _build_metadata(ok, rk, alias_map=alias_map, seen_pairs=seen_pairs) + vm = _build_metadata(original[ok], rv, alias_map=alias_map, seen_pairs=seen_pairs) + items.append(("ITEM", km, vm)) + return ("DICT", hdr("dict"), tuple(items)) + + # Set-like + if isinstance(original, (set, frozenset)): + if not isinstance(right, type(original)): + return ("TYPE_MISMATCH", type(original).__name__, type(right).__name__) + used: set[int] = set() + elems = [] + for oe in original: + match = _pair_set_element(right, oe, used) + if match is None: + elems.append(("MISSING_ELEMENT", repr(oe))) + else: + elems.append(_build_metadata(oe, match, alias_map=alias_map, seen_pairs=seen_pairs)) + tag = type(original).__name__.upper() # SET or FROZENSET + return (tag, hdr(type(original).__name__), tuple(sorted(elems, key=str))) + + # Method boundness matters only as a tag; deeper comparison is not helpful here + if isinstance(original, types.MethodType): + return ("METHOD", hdr("method")) + + # Fallback: treat as a plain instance by introspecting __dict__ and __slots__ + o_dict, o_slots = _public_instance_state(original) + r_dict, r_slots = _public_instance_state(right) + dict_items = tuple( + (k, _build_metadata(o_dict[k], r_dict[k], alias_map=alias_map, seen_pairs=seen_pairs)) + if k in r_dict + else (k, ("MISSING_ATTR",)) + for k in sorted(o_dict.keys(), key=str) + ) + slot_items = tuple( + (s, _build_metadata(o_slots[s], r_slots[s], alias_map=alias_map, seen_pairs=seen_pairs)) + if s in r_slots + else (s, 
("MISSING_SLOT",)) + for s in sorted(o_slots.keys(), key=str) + ) + return ("INSTANCE", hdr(type(original).__name__), dict_items, slot_items) + + +# ============================== Metadata Compare ============================= + + +def compare_metadata(expected: Any, actual: Any) -> list[Difference]: + """ + Deep comparison of two metadata trees. + Returns ALL differences as a list of Difference objects. + """ + differences: list[Difference] = [] + _compare_impl(expected, actual, differences, alias_unifier=_AliasMap(), path=".") + return differences + + +def _compare_atoms( + left: AtomInfo, right: AtomInfo, alias_unifier: _AliasMap, path: str, out: list[Difference] +) -> None: + if left.type_name != right.type_name: + out.append( + Difference( + path, "type_mismatch", f"Expected type {left.type_name}, actual {right.type_name}" + ) + ) + # For ints/strs identity can differ without being a bug; everything else we check + if left.same_identity_as_original != right.same_identity_as_original and left.type_name not in { + "int", + "str", + }: + out.append( + Difference( + path, + "identity_mismatch", + f"Expected same-identity={left.same_identity_as_original}, " + f"actual {right.same_identity_as_original}", + ) + ) + if left.equality_vs_original != right.equality_vs_original: + out.append( + Difference( + path, + "equality_mismatch", + f"Expected equality flag {left.equality_vs_original}, " + f"actual {right.equality_vs_original}", + ) + ) + out.extend(alias_unifier.unify(left.alias_group, right.alias_group)) + + +def _compare_impl( + left: Any, right: Any, out: list[Difference], alias_unifier: _AliasMap, path: str +) -> None: + # BACKREF + if isinstance(left, tuple) and left and left[0] == "BACKREF": + if not (isinstance(right, tuple) and right and right[0] == "BACKREF"): + out.append(Difference(path, "tag_mismatch", f"Expected BACKREF, actual {right[0]}")) + return + la, rb = left[1], right[1] + out.extend(alias_unifier.unify(la, rb)) + return + + # ATOM + if 
isinstance(left, tuple) and left and left[0] == "ATOM": + if not (isinstance(right, tuple) and right and right[0] == "ATOM"): + out.append(Difference(path, "tag_mismatch", f"Expected ATOM, actual {right[0]}")) + return + _compare_atoms(left[1], right[1], alias_unifier, path, out) + return + + # TYPE_MISMATCH (leaf produced by builder) + if isinstance(left, tuple) and left and left[0] == "TYPE_MISMATCH": + if left != right: + out.append( + Difference( + path, + "type_mismatch", + f"Expected {left[1]}, actual {left[2]} (builder leaf differs)", + ) + ) + return + + # Container / instance tags + if isinstance(left, tuple) and left: + if not (isinstance(right, tuple) and right): + out.append( + Difference( + path, "tag_mismatch", f"Expected tuple tag, actual {type(right).__name__}" + ) + ) + return + + left_tag, right_tag = left[0], right[0] + if left_tag != right_tag: + out.append( + Difference(path, "tag_mismatch", f"Expected tag {left_tag}, actual {right_tag}") + ) + return + + # Header meta (AtomInfo) for alias/identity checks on container/instance nodes + if left_tag in {"LIST", "TUPLE", "DICT", "SET", "FROZENSET", "METHOD", "INSTANCE"}: + _compare_atoms(left[1], right[1], alias_unifier, path + f".{left_tag}", out) + + # Recurse into payloads + if left_tag in {"LIST", "TUPLE"}: + l_items, r_items = left[2], right[2] + if len(l_items) != len(r_items): + out.append( + Difference( + path, + "length_mismatch", + f"{left_tag} length expected {len(l_items)}, actual {len(r_items)}", + ) + ) + for idx in range(min(len(l_items), len(r_items))): + _compare_impl( + l_items[idx], r_items[idx], out, alias_unifier, f"{path}.{left_tag}[{idx}]" + ) + return + + if left_tag == "DICT": + l_pairs, r_pairs = left[2], right[2] + if len(l_pairs) != len(r_pairs): + out.append( + Difference( + path + ".DICT", + "length_mismatch", + f"pairs expected {len(l_pairs)}, actual {len(r_pairs)}", + ) + ) + for i in range(min(len(l_pairs), len(r_pairs))): + la, ra = l_pairs[i], r_pairs[i] + if 
la[0] != ra[0]: + out.append( + Difference( + f"{path}.DICT[{i}]", + "entry_tag_mismatch", + f"expected {la[0]}, actual {ra[0]}", + ) + ) + continue + if la[0] == "MISSING_KEY": + if la[1] != ra[1]: + out.append( + Difference( + f"{path}.DICT[{i}]", + "missing_key_mismatch", + f"expected {la[1]}, actual {ra[1]}", + ) + ) + else: + _compare_impl(la[1], ra[1], out, alias_unifier, f"{path}.DICT[{i}].key") + _compare_impl(la[2], ra[2], out, alias_unifier, f"{path}.DICT[{i}].value") + return + + if left_tag in {"SET", "FROZENSET"}: + l_elems, r_elems = left[2], right[2] + if len(l_elems) != len(r_elems): + out.append( + Difference( + path + f".{left_tag}", + "length_mismatch", + f"elements expected {len(l_elems)}, actual {len(r_elems)}", + ) + ) + for i in range(min(len(l_elems), len(r_elems))): + le, re = l_elems[i], r_elems[i] + if le[0] != re[0]: + out.append( + Difference( + f"{path}.{left_tag}[{i}]", + "entry_tag_mismatch", + f"expected {le[0]}, actual {re[0]}", + ) + ) + continue + if le[0] == "MISSING_ELEMENT": + if le[1] != re[1]: + out.append( + Difference( + f"{path}.{left_tag}[{i}]", + "missing_element_mismatch", + f"expected {le[1]}, actual {re[1]}", + ) + ) + else: + _compare_impl(le, re, out, alias_unifier, f"{path}.{left_tag}[{i}]") + return + + if left_tag == "METHOD": + return + + if left_tag == "INSTANCE": + l_dict, l_slots = left[2], left[3] + r_dict, r_slots = right[2], right[3] + if len(l_dict) != len(r_dict): + out.append( + Difference( + path + ".INSTANCE.__dict__", + "length_mismatch", + f"items expected {len(l_dict)}, actual {len(r_dict)}", + ) + ) + for i in range(min(len(l_dict), len(r_dict))): + (lk, lv), (rk, rv) = l_dict[i], r_dict[i] + if lk != rk: + out.append( + Difference( + f"{path}.INSTANCE.__dict__[{i}]", + "key_mismatch", + f"expected key {lk!r}, actual {rk!r}", + ) + ) + _compare_impl(lv, rv, out, alias_unifier, f"{path}.INSTANCE.__dict__[{lk}]") + if len(l_slots) != len(r_slots): + out.append( + Difference( + path + 
".INSTANCE.__slots__", + "length_mismatch", + f"slots expected {len(l_slots)}, actual {len(r_slots)}", + ) + ) + for i in range(min(len(l_slots), len(r_slots))): + (ls, lv), (rs, rv) = l_slots[i], r_slots[i] + if ls != rs: + out.append( + Difference( + f"{path}.INSTANCE.__slots__[{i}]", + "slot_name_mismatch", + f"expected slot {ls!r}, actual {rs!r}", + ) + ) + _compare_impl(lv, rv, out, alias_unifier, f"{path}.INSTANCE.__slots__[{ls}]") + return + + # Fallback raw inequality if builder produced plain values (should be rare) + if left != right: + out.append(Difference(path, "raw_value_mismatch", f"expected {left!r}, actual {right!r}")) + + +# ============================== Pretty Printer =============================== + + +def _format_differences(differences: list[Difference]) -> str: + # Group by path to make large outputs navigable + by_path: dict[str, list[Difference]] = {} + for d in differences: + by_path.setdefault(d.path, []).append(d) + + lines = ["Metadata mismatch(s):"] + for path in sorted(by_path.keys(), key=str): + lines.append(f"\nβ€’ At {path}:") + for diff in by_path[path]: + lines.append(f" - {diff.kind}: {diff.details}") + return "\n".join(lines) diff --git a/tests/test_copy.py b/tests/test_copy.py index 0e75646..6f43cc4 100644 --- a/tests/test_copy.py +++ b/tests/test_copy.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + """ Slightly modified builtin test.test_copy module """ @@ -13,28 +17,25 @@ from operator import le from operator import lt from operator import ne -from test import support +from typing import NoReturn import pytest import duper -from duper.factories import ast - -ast.with_source = True +def panic(*_, **__) -> NoReturn: + raise -class Copy: - error = Error = original_copy.Error - copy = staticmethod(duper.dupe) - deepcopy = staticmethod(duper.deepdupe) +def _deepcopy(obj, memo=None): + factory = duper.reconstructs(obj, check=False, fallback=panic, 
_keep_by_reference=memo) + return factory() -copy = Copy() -# keeps copy module intact, but allows to keep test cases relatively unchanged -copy.__dict__.update(original_copy.__dict__) -del copy.__dict__["copy"], copy.__dict__["deepcopy"] +def _copy(obj): + factory = duper.copies(obj) + return factory() order_comparisons = le, lt, ge, gt @@ -42,15 +43,38 @@ class Copy: comparisons = order_comparisons + equality_comparisons -@pytest.fixture() -def self(): - return unittest.TestCase() +class CopyModule: + error = Error = original_copy.Error + copy = staticmethod(duper.copy) + deepcopy = staticmethod(duper.deepcopy) + + +@pytest.fixture( + params=[ + pytest.param((duper.copy, duper.deepcopy), id="runtime"), + pytest.param((_copy, _deepcopy), id="factory"), + # pytest.param((_copy, orcopy.deepcopy), id="orcopy"), + ] +) +def copy(request) -> CopyModule: + copy_substitute, deepcopy_substitute = request.param + + class CopyModule: + error = Error = original_copy.Error + copy = staticmethod(copy_substitute) + deepcopy = staticmethod(deepcopy_substitute) + + copy_module = CopyModule() + # keeps copy module intact, but allows to keep test cases relatively unchanged + copy_module.__dict__.update(original_copy.__dict__) + del copy_module.__dict__["copy"], copy_module.__dict__["deepcopy"] + return copy_module # Attempt full line coverage of copy.py from top to bottom -def test_exceptions(self): +def test_exceptions(copy) -> None: assert copy.Error is copy.error assert issubclass(copy.Error, Exception) @@ -60,15 +84,15 @@ def test_exceptions(self): # The copy() method -def test_copy_basic(self): +def test_copy_basic(copy) -> None: x = 42 y = copy.copy(x) assert x == y -def test_copy_copy(self): - class C(object): - def __init__(self, foo): +def test_copy_copy(copy) -> None: + class C: + def __init__(self, foo) -> None: self.foo = foo def __copy__(self): @@ -80,8 +104,8 @@ def __copy__(self): assert y.foo == x.foo -def test_copy_registry(self): - class C(object): +def 
test_copy_registry(copy) -> None: + class C: def __new__(cls, foo): obj = object.__new__(cls) obj.foo = foo @@ -97,8 +121,8 @@ def pickle_C(obj): copy.copy(x) -def test_copy_reduce_ex(self): - class C(object): +def test_copy_reduce_ex(copy) -> None: + class C: def __reduce_ex__(self, proto): c.append(1) return "" @@ -113,8 +137,8 @@ def __reduce__(self): assert c == [1] -def test_copy_reduce(self): - class C(object): +def test_copy_reduce(copy) -> None: + class C: def __reduce__(self): c.append(1) return "" @@ -126,8 +150,8 @@ def __reduce__(self): assert c == [1] -def test_copy_cant(self): - class C(object): +def test_copy_cant(copy) -> None: + class C: def __getattribute__(self, name): if name.startswith("__reduce"): raise AttributeError(name) @@ -142,10 +166,10 @@ def get_copy_atomic(): class Classic: pass - class NewStyle(object): + class NewStyle: pass - def f(): + def f() -> None: pass class WithMetaclass(metaclass=abc.ABCMeta): @@ -177,11 +201,11 @@ class WithMetaclass(metaclass=abc.ABCMeta): @pytest.mark.parametrize("x", get_copy_atomic()) -def test_copy_atomic(x): +def test_copy_atomic(copy, x) -> None: assert copy.copy(x) is x -def test_copy_list(self): +def test_copy_list(copy) -> None: x = [1, 2, 3] y = copy.copy(x) assert y == x @@ -192,7 +216,7 @@ def test_copy_list(self): assert y is not x -def test_copy_tuple(self): +def test_copy_tuple(copy) -> None: x = (1, 2, 3) assert copy.copy(x) is x x = () @@ -201,7 +225,7 @@ def test_copy_tuple(self): assert copy.copy(x) is x -def test_copy_dict(self): +def test_copy_dict(copy) -> None: x = {"foo": 1, "bar": 2} y = copy.copy(x) assert y == x @@ -212,7 +236,7 @@ def test_copy_dict(self): assert y is not x -def test_copy_set(self): +def test_copy_set(copy) -> None: x = {1, 2, 3} y = copy.copy(x) assert y == x @@ -223,14 +247,14 @@ def test_copy_set(self): assert y is not x -def test_copy_frozenset(self): +def test_copy_frozenset(copy) -> None: x = frozenset({1, 2, 3}) assert copy.copy(x) is x x = frozenset() 
assert copy.copy(x) is x -def test_copy_bytearray(self): +def test_copy_bytearray(copy) -> None: x = bytearray(b"abc") y = copy.copy(x) assert y == x @@ -241,9 +265,9 @@ def test_copy_bytearray(self): assert y is not x -def test_copy_inst_vanilla(self): +def test_copy_inst_vanilla(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __eq__(self, other): @@ -253,9 +277,9 @@ def __eq__(self, other): assert copy.copy(x) == x -def test_copy_inst_copy(self): +def test_copy_inst_copy(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __copy__(self): @@ -268,9 +292,9 @@ def __eq__(self, other): assert copy.copy(x) == x -def test_copy_inst_getinitargs(self): +def test_copy_inst_getinitargs(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getinitargs__(self): @@ -283,7 +307,7 @@ def __eq__(self, other): assert copy.copy(x) == x -def test_copy_inst_getnewargs(self): +def test_copy_inst_getnewargs(copy) -> None: class C(int): def __new__(cls, foo): self = int.__new__(cls) @@ -304,7 +328,7 @@ def __eq__(self, other): assert y.foo == x.foo -def test_copy_inst_getnewargs_ex(self): +def test_copy_inst_getnewargs_ex(copy) -> None: class C(int): def __new__(cls, *, foo): self = int.__new__(cls) @@ -325,9 +349,9 @@ def __eq__(self, other): assert y.foo == x.foo -def test_copy_inst_getstate(self): +def test_copy_inst_getstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getstate__(self): @@ -340,9 +364,9 @@ def __eq__(self, other): assert copy.copy(x) == x -def test_copy_inst_setstate(self): +def test_copy_inst_setstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __setstate__(self, state): @@ -355,9 +379,9 @@ def __eq__(self, other): assert copy.copy(x) == x -def test_copy_inst_getstate_setstate(self): 
+def test_copy_inst_getstate_setstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getstate__(self): @@ -373,19 +397,20 @@ def __eq__(self, other): assert copy.copy(x) == x # State with boolean value is false (issue #25718) x = C(0.0) + assert copy.copy(x) == x # The deepcopy() method -def test_deepcopy_basic(self): +def test_deepcopy_basic(copy) -> None: x = 42 y = copy.deepcopy(x) assert y == x -def test_deepcopy_same_object(self): +def test_deepcopy_same_object(copy) -> None: # previously was called test_deepcopy_memo, but I find new name to be clearer # Tests of reflexive objects are under type-specific sections below. # This tests only repetitions of objects. @@ -398,7 +423,20 @@ def test_deepcopy_same_object(self): assert y[0] is y[1] -def test_deepcopy_issubclass(self): +def test_deepcopy_same_object_different_parents(copy) -> None: + # previously was called test_deepcopy_memo, but I find new name to be clearer + # Tests of reflexive objects are under type-specific sections below. + # This tests only repetitions of objects. 
+ x = [] + x = [[x], x] + y = copy.deepcopy(x) + assert y == x + assert y is not x + assert y[0] is not x[0] + assert y[0][0] is y[1] + + +def test_deepcopy_issubclass(copy) -> None: # XXX Note: there's no way to test the TypeError coming out of # issubclass() -- this can only happen when an extension # module defines a "type" that doesn't formally inherit from @@ -412,9 +450,9 @@ class C(metaclass=Meta): assert copy.deepcopy(C) == C -def test_deepcopy_deepcopy(self): - class C(object): - def __init__(self, foo): +def test_deepcopy_deepcopy(copy) -> None: + class C: + def __init__(self, foo) -> None: self.foo = foo def __deepcopy__(self, memo=None): @@ -426,8 +464,8 @@ def __deepcopy__(self, memo=None): assert y.foo == x.foo -def test_deepcopy_registry(self): - class C(object): +def test_deepcopy_registry(copy) -> None: + class C: def __new__(cls, foo): obj = object.__new__(cls) obj.foo = foo @@ -443,8 +481,8 @@ def pickle_C(obj): copy.deepcopy(x) -def test_deepcopy_reduce_ex(self): - class C(object): +def test_deepcopy_reduce_ex(copy) -> None: + class C: def __reduce_ex__(self, proto): c.append(1) return "" @@ -459,8 +497,8 @@ def __reduce__(self): assert c == [1] -def test_deepcopy_reduce(self): - class C(object): +def test_deepcopy_reduce(copy) -> None: + class C: def __reduce__(self): c.append(1) return "" @@ -472,8 +510,8 @@ def __reduce__(self): assert c == [1] -def test_deepcopy_cant(self): - class C(object): +def test_deepcopy_cant(copy) -> None: + class C: def __getattribute__(self, name): if name.startswith("__reduce"): raise AttributeError(name) @@ -491,10 +529,10 @@ def get_deepcopy_atomic(): class Classic: pass - class NewStyle(object): + class NewStyle: pass - def f(): + def f() -> None: pass return [ @@ -517,11 +555,11 @@ def f(): @pytest.mark.parametrize("x", get_deepcopy_atomic()) -def test_deepcopy_atomic(x): +def test_deepcopy_atomic(copy, x) -> None: assert copy.deepcopy(x) is x -def test_deepcopy_list(self): +def test_deepcopy_list(copy) -> 
None: x = [[1, 2], 3] y = copy.deepcopy(x) assert y == x @@ -529,9 +567,8 @@ def test_deepcopy_list(self): assert x[0] is not y[0] -@pytest.mark.xfail(strict=True, raises=duper.Error) @pytest.mark.parametrize("op", comparisons) -def test_deepcopy_reflexive_list(op): +def test_deepcopy_reflexive_list(copy, op) -> None: x = [] x.append(x) y = copy.deepcopy(x) @@ -542,13 +579,13 @@ def test_deepcopy_reflexive_list(op): assert len(y) == 1 -def test_deepcopy_empty_tuple(self): +def test_deepcopy_empty_tuple(copy) -> None: x = () y = copy.deepcopy(x) assert x is y -def test_deepcopy_tuple(self): +def test_deepcopy_tuple(copy) -> None: x = ([1, 2], 3) y = copy.deepcopy(x) assert y == x @@ -556,15 +593,14 @@ def test_deepcopy_tuple(self): assert x[0] is not y[0] -def test_deepcopy_tuple_of_immutables(self): +def test_deepcopy_tuple_of_immutables(copy) -> None: x = ((1, 2), 3) y = copy.deepcopy(x) assert x is y -@pytest.mark.xfail(strict=True, raises=duper.Error) @pytest.mark.parametrize("op", comparisons) -def test_deepcopy_reflexive_tuple(op): +def test_deepcopy_reflexive_tuple(copy, op) -> None: x = ([], 4, 3) x[0].append(x) y = copy.deepcopy(x) @@ -576,7 +612,7 @@ def test_deepcopy_reflexive_tuple(op): op(y, x) -def test_deepcopy_dict(self): +def test_deepcopy_dict(copy) -> None: x = {"foo": [1, 2], "bar": 3} y = copy.deepcopy(x) assert y == x @@ -584,9 +620,10 @@ def test_deepcopy_dict(self): assert x["foo"] is not y["foo"] -@pytest.mark.xfail(strict=True, raises=duper.Error) -@pytest.mark.parametrize("order_op,eq_op", zip(order_comparisons, equality_comparisons)) -def test_deepcopy_reflexive_dict_order(order_op, eq_op): +@pytest.mark.parametrize( + ("order_op", "eq_op"), zip(order_comparisons, equality_comparisons, strict=False) +) +def test_deepcopy_reflexive_dict_order(copy, order_op, eq_op) -> None: x = {} x["foo"] = x y = copy.deepcopy(x) @@ -599,16 +636,25 @@ def test_deepcopy_reflexive_dict_order(order_op, eq_op): assert len(y) == 1 
-@pytest.mark.xfail(strict=True, raises=duper.Error) -def test_deepcopy_keepalive(self): +def test_deepcopy_keepalive(copy) -> None: + if copy.deepcopy is _deepcopy: + pytest.xfail("Factories don't respect memo") memo = {} x = [] + # original_copy.deepcopy(x, memo) copy.deepcopy(x, memo) assert memo[id(memo)][0] is x -@pytest.mark.xfail(strict=True, raises=duper.Error) -def test_deepcopy_dont_memo_immutable(self): +def test_deepcopy_by_reference(copy) -> None: + x = [] + memo = {id(x): 123} + assert copy.deepcopy(x, memo) == 123 + + +def test_deepcopy_dont_memo_immutable(copy) -> None: + if copy.deepcopy is _deepcopy: + pytest.xfail("Factories don't respect memo") memo = {} x = [1, 2, 3, 4] y = copy.deepcopy(x, memo) @@ -624,9 +670,9 @@ def test_deepcopy_dont_memo_immutable(self): assert len(memo) == 2 -def test_deepcopy_inst_vanilla(self): +def test_deepcopy_inst_vanilla(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __eq__(self, other): @@ -638,9 +684,9 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_deepcopy(self): +def test_deepcopy_inst_deepcopy(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __deepcopy__(self, memo): @@ -656,9 +702,9 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_getinitargs(self): +def test_deepcopy_inst_getinitargs(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getinitargs__(self): @@ -674,7 +720,7 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_getnewargs(self): +def test_deepcopy_inst_getnewargs(copy) -> None: class C(int): def __new__(cls, foo): self = int.__new__(cls) @@ -696,7 +742,7 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_getnewargs_ex(self): +def test_deepcopy_inst_getnewargs_ex(copy) -> None: class C(int): def __new__(cls, *, foo): 
self = int.__new__(cls) @@ -718,9 +764,9 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_getstate(self): +def test_deepcopy_inst_getstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getstate__(self): @@ -736,9 +782,9 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_setstate(self): +def test_deepcopy_inst_setstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __setstate__(self, state): @@ -754,9 +800,9 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_inst_getstate_setstate(self): +def test_deepcopy_inst_getstate_setstate(copy) -> None: class C: - def __init__(self, foo): + def __init__(self, foo) -> None: self.foo = foo def __getstate__(self): @@ -781,7 +827,7 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_deepcopy_reflexive_inst(self): +def test_deepcopy_reflexive_inst(copy) -> None: class C: pass @@ -795,8 +841,8 @@ class C: # _reconstruct() -def test_reconstruct_string(self): - class C(object): +def test_reconstruct_string(copy) -> None: + class C: def __reduce__(self): return "" @@ -807,8 +853,8 @@ def __reduce__(self): assert y is x -def test_reconstruct_nostate(self): - class C(object): +def test_reconstruct_nostate(copy) -> None: + class C: def __reduce__(self): return (C, ()) @@ -820,8 +866,8 @@ def __reduce__(self): assert y.__class__ is x.__class__ -def test_reconstruct_state(self): - class C(object): +def test_reconstruct_state(copy) -> None: + class C: def __reduce__(self): return (C, (), self.__dict__) @@ -837,8 +883,8 @@ def __eq__(self, other): assert y.foo is not x.foo -def test_reconstruct_state_setstate(self): - class C(object): +def test_reconstruct_state_setstate(copy) -> None: + class C: def __reduce__(self): return (C, (), self.__dict__) @@ -857,8 +903,8 @@ def __eq__(self, other): assert y.foo is not x.foo -def 
test_reconstruct_reflexive(self): - class C(object): +def test_reconstruct_reflexive(copy) -> None: + class C: pass x = C() @@ -871,7 +917,7 @@ class C(object): # Additions for Python 2.3 and pickle protocol 2 -def test_reduce_4tuple(self): +def test_reduce_4tuple(copy) -> None: class C(list): def __reduce__(self): return (C, (), self.__dict__, iter(self)) @@ -890,7 +936,7 @@ def __eq__(self, other): assert x[0] is not y[0] -def test_reduce_5tuple(self): +def test_reduce_5tuple(copy) -> None: class C(dict): def __reduce__(self): return (C, (), self.__dict__, None, self.items()) @@ -909,8 +955,8 @@ def __eq__(self, other): assert x["foo"] is not y["foo"] -def test_copy_slots(self): - class C(object): +def test_copy_slots(copy) -> None: + class C: __slots__ = ["foo"] x = C() @@ -919,8 +965,8 @@ class C(object): assert x.foo is y.foo -def test_deepcopy_slots(self): - class C(object): +def test_deepcopy_slots(copy) -> None: + class C: __slots__ = ["foo"] x = C() @@ -930,15 +976,15 @@ class C(object): assert x.foo is not y.foo -def test_deepcopy_dict_subclass(self): +def test_deepcopy_dict_subclass(copy) -> None: class C(dict): - def __init__(self, d=None): + def __init__(self, d=None) -> None: if not d: d = {} self._keys = list(d.keys()) super().__init__(d) - def __setitem__(self, key, item): + def __setitem__(self, key, item) -> None: super().__setitem__(key, item) if key not in self._keys: self._keys.append(key) @@ -953,7 +999,7 @@ def __setitem__(self, key, item): assert x._keys != y._keys -def test_copy_list_subclass(self): +def test_copy_list_subclass(copy) -> None: class C(list): pass @@ -966,7 +1012,7 @@ class C(list): assert x.foo is y.foo -def test_deepcopy_list_subclass(self): +def test_deepcopy_list_subclass(copy) -> None: class C(list): pass @@ -979,7 +1025,7 @@ class C(list): assert x.foo is not y.foo -def test_copy_tuple_subclass(self): +def test_copy_tuple_subclass(copy) -> None: class C(tuple): pass @@ -989,7 +1035,7 @@ class C(tuple): assert tuple(y) 
== (1, 2, 3) -def test_deepcopy_tuple_subclass(self): +def test_deepcopy_tuple_subclass(copy) -> None: class C(tuple): pass @@ -1001,8 +1047,8 @@ class C(tuple): assert x[0] is not y[0] -def test_getstate_exc(self): - class EvilState(object): +def test_getstate_exc(copy) -> None: + class EvilState: def __getstate__(self): raise ValueError("ain't got no stickin' state") @@ -1010,7 +1056,7 @@ def __getstate__(self): copy.copy(EvilState()) -def test_copy_function(self): +def test_copy_function(copy) -> None: assert copy.copy(global_foo) == global_foo def foo(x, y): @@ -1018,13 +1064,13 @@ def foo(x, y): assert copy.copy(foo) == foo - def bar(): + def bar() -> None: return None assert copy.copy(bar) == bar -def test_deepcopy_function(self): +def test_deepcopy_function(copy) -> None: assert copy.deepcopy(global_foo) == global_foo def foo(x, y): @@ -1032,14 +1078,14 @@ def foo(x, y): assert copy.deepcopy(foo) == foo - def bar(): + def bar() -> None: return None assert copy.deepcopy(bar) == bar -def check_weakref(_copy): - class C(object): +def check_weakref(_copy) -> None: + class C: pass obj = C() @@ -1051,16 +1097,16 @@ class C(object): assert y is x -def test_copy_weakref(self): +def test_copy_weakref(copy) -> None: check_weakref(copy.copy) -def test_deepcopy_weakref(self): +def test_deepcopy_weakref(copy) -> None: check_weakref(copy.deepcopy) -def check_copy_weakdict(_dicttype): - class C(object): +def check_copy_weakdict(copy, _dicttype) -> None: + class C: pass a, b, c, d = [C() for i in range(4)] @@ -1074,7 +1120,7 @@ class C(object): assert v[c] == d assert len(v) == 2 del c, d - support.gc_collect() # For PyPy or other GCs. + # # support.gc_collect() # For PyPy or other GCs. 
assert len(v) == 1 x, y = C(), C() # The underlying containers are decoupled @@ -1082,17 +1128,17 @@ class C(object): assert x not in u -def test_copy_weakkeydict(self): - check_copy_weakdict(weakref.WeakKeyDictionary) +def test_copy_weakkeydict(copy) -> None: + check_copy_weakdict(copy, weakref.WeakKeyDictionary) -def test_copy_weakvaluedict(self): - check_copy_weakdict(weakref.WeakValueDictionary) +def test_copy_weakvaluedict(copy) -> None: + check_copy_weakdict(copy, weakref.WeakValueDictionary) -def test_deepcopy_weakkeydict(self): - class C(object): - def __init__(self, i): +def test_deepcopy_weakkeydict(copy) -> None: + class C: + def __init__(self, i) -> None: self.i = i a, b, c, d = [C(i) for i in range(4)] @@ -1108,13 +1154,13 @@ def __init__(self, i): assert v[a].i == b.i assert v[c].i == d.i del c - support.gc_collect() # For PyPy or other GCs. + # support.gc_collect() # For PyPy or other GCs. assert len(v) == 1 -def test_deepcopy_weakvaluedict(self): - class C(object): - def __init__(self, i): +def test_deepcopy_weakvaluedict(copy) -> None: + class C: + def __init__(self, i) -> None: self.i = i a, b, c, d = [C(i) for i in range(4)] @@ -1134,13 +1180,13 @@ def __init__(self, i): assert t is d del x, y, z, t del d - support.gc_collect() # For PyPy or other GCs. + # support.gc_collect() # For PyPy or other GCs. 
assert len(v) == 1 -def test_deepcopy_bound_method(self): - class Foo(object): - def m(self): +def test_deepcopy_bound_method(copy) -> None: + class Foo: + def m(self) -> None: pass f = Foo() diff --git a/tests/test_factories.py b/tests/test_factories.py new file mode 100644 index 0000000..a29e310 --- /dev/null +++ b/tests/test_factories.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +import collections +import copy +import re +import sys +import typing +from functools import partial +from typing import Any +from typing import NamedTuple + +import pytest + +from data import OBJECTS +from duper import deepcopy as duper_deepcopy +from duper.builders import build_reconstructor +from tests.comparer import assert_copies_equivalent + +if typing.TYPE_CHECKING: + from collections.abc import Callable + + from pytest_codspeed import BenchmarkFixture + +SUPPORT_BYTECODE_FACTORY = sys.version_info[:2] == (3, 12) + + +# ── How to select what to run (pytest-native, no ifs) ───────────────────────── +# Examples: +# pytest -k 'ast_factory' # only ast_factory params +# pytest -k 'bytecode_factory and not reflexive' +# pytest -k 'duper.deepcopy' +# pytest -k 'namedtuple or stdlib' # by case id fragments + + +# ── NamedTuple diff helper: show only differing fields ──────────────────────── +class NTFieldDiff(NamedTuple): + unequal: tuple[str, ...] + left_only: tuple[str, ...] + right_only: tuple[str, ...] 
def assert_namedtuple_equal(a: Any, b: Any) -> None:
    """Assert that two NamedTuple instances are equal, field by field.

    Falls back to a plain ``==`` assertion when either argument lacks
    ``_asdict``. On mismatch the assertion message lists only the differing
    fields (unequal shared fields plus fields present on one side only),
    packed into an ``NTFieldDiff`` for a readable failure report.
    """
    if not (hasattr(a, "_asdict") and hasattr(b, "_asdict")):
        assert a == b
        return
    da = a._asdict()
    db = b._asdict()
    # Generator feeds sorted() directly; no throwaway list needed.
    unequal = tuple(sorted((k for k in da.keys() & db.keys() if da[k] != db[k]), key=str))
    left_only = tuple(sorted(da.keys() - db.keys(), key=str))
    right_only = tuple(sorted(db.keys() - da.keys(), key=str))
    diff = NTFieldDiff(unequal=unequal, left_only=left_only, right_only=right_only)
    assert not (unequal or left_only or right_only), f"NamedTuple fields differ: {diff}"


# ── Param spaces (IDs are descriptive; use -k to select) ──────────────────────
FACTORY_PARAMS = [
    pytest.param(build_reconstructor, id="ast"),
]
if SUPPORT_BYTECODE_FACTORY:
    # NOTE(review): this import rebinds the module-level `build_reconstructor`
    # to the bytecode builder *after* FACTORY_PARAMS captured the AST one.
    # It works, but confirm the shadowing is intentional before relying on the
    # module-level name anywhere below.
    from duper.builders.bytecode import build_reconstructor

    FACTORY_PARAMS.append(
        pytest.param(
            partial(build_reconstructor, runtime="xpython"),
            # Expected to fail until the bytecode builder is complete.
            marks=[pytest.mark.xfail(reason="bytecode_factory is not finished yet")],
            id="bytecode",
        ),
    )

COPIER_PARAMS = [
    pytest.param(duper_deepcopy, id="duper.deepcopy"),
]

# One pytest param per corpus object; the case name becomes the test ID.
CASES = [pytest.param(case.obj, id=case.name) for case in OBJECTS]


# ── Core equivalence tests for factories (vs copy.deepcopy baseline) ──────────
@pytest.mark.parametrize("factory", FACTORY_PARAMS, ids=lambda f: f.__name__)
@pytest.mark.parametrize("original", CASES)
def test_factories_equivalence(
    factory: Callable[[Any], Callable[[], Any]], original: Any, benchmark: BenchmarkFixture
) -> None:
    """Check a reconstructor factory against the ``copy.deepcopy`` baseline.

    Both sides may legitimately raise: if the baseline deepcopy raised, a
    raising factory is acceptable and the two exceptions are compared as
    values by ``assert_copies_equivalent``; if only the factory raised, that
    is a genuine failure and its exception propagates.
    """
    deepcopy_failed = False
    try:
        deepcopied = copy.deepcopy(original)
    except Exception as e:
        deepcopied = e
        deepcopy_failed = True

    try:
        # pedantic() calls factory(original, ignore_deepcopy_method=False)
        # for 10 timed iterations and returns the factory's result.
        f = benchmark.pedantic(
            factory, (original,), {"ignore_deepcopy_method": False}, iterations=10
        )
        reconstructed = f()
    except Exception as e:
        reconstructed = e
        if not deepcopy_failed:
            raise
    assert_copies_equivalent(original, deepcopied, reconstructed)


# ── duper.deepcopy parity with copy.deepcopy across corpus ────────────────────
+@pytest.mark.parametrize( + "copier", + COPIER_PARAMS, + ids=lambda c: c.__name__ if hasattr(c, "__name__") else "duper.deepcopy", +) +@pytest.mark.parametrize("original", CASES) +def test_duper_deepcopy_parity( + copier: Callable[[Any], Any], original: Any, request, benchmark +) -> None: + deepcopy_failed = False + try: + baseline = copy.deepcopy(original) + except Exception as e: + baseline = e + deepcopy_failed = True + + try: + candidate = benchmark.pedantic(copier, (original,), iterations=10) + except Exception as e: + candidate = e + if not deepcopy_failed: + raise + if ( + original is candidate + and isinstance(original, frozenset) + and all(o is c for o, c in zip(sorted(original), sorted(candidate), strict=False)) + ): + request.node.add_marker( + pytest.mark.xfail( + reason="copy.deepcopy doesn't have special handling for frozenset", strict=True + ) + ) + + assert_copies_equivalent(original, baseline, candidate) + + +# ── NamedTuple equality helper sanity ───────────────────────────────────────── +def test_namedtuple_diff_helper() -> None: + Point = collections.namedtuple("Point", ["x", "y", "meta"]) + a = Point(1, 2, {"k": [3, 4]}) + b = Point(1, 2, {"k": [3, 4]}) + assert_namedtuple_equal(a, b) + b_bad = Point(1, 99, {"k": [3, 5]}) + with pytest.raises(AssertionError) as ei: + assert_namedtuple_equal(a, b_bad) + s = str(ei.value) + assert "y" in s + assert "meta" in s + + +# ── Not-supported-yet cases for factories (compile-time/construct-time error) ─ +@pytest.mark.parametrize("factory", FACTORY_PARAMS, ids=lambda f: f.__name__) +def test_custom__deepcopy__referenced_after(factory) -> None: + class C: + def __init__(self, _mutable=None) -> None: + self.deepcopied = False + if _mutable is None: + _mutable = [1, 2, 3] + self.mutable = _mutable + + def __deepcopy__(self, memodict=None): + c = C(self.mutable.copy()) + c.deepcopied = True + memodict[id(self)] = c + memodict[id(self.mutable)] = c.mutable + memodict.setdefault(id(memodict), []).extend([c, 
c.mutable]) + return c + + original = C() + a = [original, original.mutable] + copied, mutable = copy.deepcopy(a) + + assert copied is not original + assert original.mutable is not copied.mutable + assert copied.mutable is mutable + assert copied.deepcopied + + with pytest.raises( + NotImplementedError, + match=re.escape( + "[1, 2, 3] was reconstructed by custom __deepcopy__" + " and then referenced in other object. This is not supported yet.", + ), + ): + factory(a) + + +@pytest.mark.parametrize("factory", FACTORY_PARAMS, ids=lambda f: f.__name__) +def test_custom__deepcopy__referenced_before(factory) -> None: + class C: + def __init__(self, _mutable=None) -> None: + self.deepcopied = False + if _mutable is None: + _mutable = [1, 2, 3] + self.mutable = _mutable + + def __deepcopy__(self, memodict=None): + mutable = memodict.get(id(self.mutable), ["unexpected"]) + c = C(mutable) + c.deepcopied = True + memodict[id(self)] = c + memodict[id(self.mutable)] = c.mutable + memodict.setdefault(id(memodict), []).extend([c, c.mutable]) + return c + + original = C(mutable := {}) + a = [mutable, original, original.mutable] + first_mutable, copied, mutable = copy.deepcopy(a) + + assert copied is not original + assert first_mutable is mutable + assert original.mutable is not copied.mutable + assert copied.mutable is mutable + assert copied.deepcopied + + with pytest.raises( + NotImplementedError, + match="Types with custom __deepcopy__ referencing previously reconstructed" + " objects are not supported yet. 
See the comment above.", + ): + factory(a) + + +def test_memoryview_not_deepcopiable() -> None: + mv = memoryview(b"abc") + with pytest.raises(TypeError): + copy.deepcopy(mv) + with pytest.raises(TypeError): + duper_deepcopy(mv) + + +@pytest.mark.parametrize("original", CASES) +def test_reprx_pickle_parity(original: Any, benchmark: BenchmarkFixture) -> None: + import pickle + + import duper + + pickle_failed = False + try: + dumped = pickle.dumps(original) + loaded = pickle.loads(dumped) + except Exception as e: + loaded = e + pickle_failed = True + + reprx_failed = False + try: + source = benchmark.pedantic(duper.reprx, (original,), {"name": "get_obj"}, iterations=10) + exec(source, globalns := {}) + reconstructed = globalns["__duper_reconstructor__"]() + except Exception as e: + reconstructed = e + reprx_failed = True + if not pickle_failed: + raise + + if pickle_failed and not reprx_failed: + if callable(original): + assert original(1) == reconstructed(1) + else: + assert original == reconstructed + else: + assert_copies_equivalent(original, loaded, reconstructed) diff --git a/timesup.py b/timesup.py new file mode 100644 index 0000000..4877c1b --- /dev/null +++ b/timesup.py @@ -0,0 +1,293 @@ +# SPDX-FileCopyrightText: 2023-present Arseny Boykov (Bobronium) +# +# SPDX-License-Identifier: MPL-2.0 + +""" +Render console output + HTML where lines expand with profiler results + +Howto: +Parse function source into ast tree +extract expressions by lines +handle cases such as try: ...; except: pass as single expression +(check all linenos of expression and if any of them contain # t comment, include them) +(we probably don't even need iPython for this, and it might be simpler just to do all the work by ourselves) +though PoC was easier to write with iPython +""" + +import inspect +import math +import sys +import textwrap +import traceback +from functools import partial + +import pyinstrument +from IPython import InteractiveShell +from pyinstrument.frame import Frame 
+from pyinstrument.renderers import ConsoleRenderer +from pyinstrument.session import Session + + +def _timer_magic() -> None: ... + + +from typing import TYPE_CHECKING +from typing import Any + +from pygments import highlight +from pygments.formatters import Terminal256Formatter +from pygments.lexers import PythonLexer + + +class InlineRenderer(ConsoleRenderer): + def render_preamble(self, session: Session) -> str: + return "" + + def render_frame(self, frame: Frame, *args, **kwargs) -> str: + ".group.exit_frames[0].children[0]" + if self.root_frame is frame: + # locations = ["children", 0, "group", "exit_frames", 0, "children", 0] + # while frame and locations: + # location = locations.pop(0) + # new_frame = getattr(frame, location) if isinstance(location, str) else frame[location] + # if not new_frame: + # break + # frame = new_frame + self.root_frame = frame + if not frame: + return indent + "No frames were recorded" + return super().render_frame(frame, *args, **kwargs) + + +from pygments.styles.monokai import MonokaiStyle + +if TYPE_CHECKING: + from types import CodeType + + from IPython.core.magics.execution import TimeitResult + from pyinstrument.frame_ops import FrameRecordType + + +def colorize(*code, sep=""): + return highlight( + sep.join(code), PythonLexer(), Terminal256Formatter(style=MonokaiStyle) + ).removesuffix("\n") + + +def pprint_line(*obj: Any, start=" ", end="\n", sep="") -> None: + """Pretty-print in color.""" + print(start + colorize(*obj, sep=sep), end=end) + + +def _format_time(timespan, precision=3): + """Formats the timespan in a human readable form""" + + if timespan >= 60.0: + # we have more than a minute, format that in a human readable form + # Idea from http://snipplr.com/view/5713/ + parts = [("d", 60 * 60 * 24), ("h", 60 * 60), ("min", 60), ("s", 1)] + time = [] + leftover = timespan + for suffix, length in parts: + value = int(leftover / length) + if value > 0: + leftover = leftover % length + time.append(f"{value!s}{suffix}") + 
if leftover < 1: + break + return " ".join(time) + + # Unfortunately the unicode 'micro' symbol can cause problems in + # certain terminals. + # See bug: https://bugs.launchpad.net/ipython/+bug/348466 + # Try to prevent crashes by being more secure than it needs to + # E.g. eclipse is able to print a Β΅, but has no sys.stdout.encoding set. + units = ["s", "ms", "us", "ns"] # the save value + if hasattr(sys.stdout, "encoding") and sys.stdout.encoding: + try: + "\xb5".encode(sys.stdout.encoding) + units = ["s", "ms", "\xb5s", "ns"] + except: + pass + scaling = [1, 1e3, 1e6, 1e9] + + order = min(-int(math.floor(math.log10(timespan)) // 3), 3) if timespan > 0.0 else 3 + return "%.*g %s" % (precision, timespan * scaling[order], units[order]) + + +def _patch_shell_exc_info(shell, function, line_offset) -> None: + """ + Injects a _render_traceback method into exceptions caught in the shell + + This is a terrible hack, and there's certainly a better solution, but this is the first thing + I came up with + """ + orig = shell._get_exc_info + + def _get_line(n): + return inspect.getsourcelines(sys.modules[function.__module__])[0][n - 1] + + def _get_exc_info(exc_tuple=None): + def render_traceback(): + frames = traceback.extract_tb(tb.tb_next) # skip frame with exec() from IPython + for frame in frames: + if not frame.filename.startswith(" None: + if function.__code__.co_argcount: + raise RuntimeError(f"function {function.__qualname__} cannot have any arguments") + + c: CodeType = function.__code__ + + # executor = ExecutionMagics(shell) + decorators = [] + code_lines = [] + body_starts_at = None + longest_line_length = 0 + for i, line in enumerate(inspect.getsource(function).splitlines(keepends=True)): + if "# t" in line and not line.lstrip().startswith("#"): + longest_line_length = max(len(line[: line.find("# t")]), longest_line_length) + if body_starts_at: # skip decorators + code_lines.append(line) + elif line.strip().startswith("def "): + body_starts_at = i + continue + 
else: + decorators.append(line) + + code_lines = textwrap.dedent("".join(code_lines)) + lines_offset = c.co_firstlineno - body_starts_at + 3 + executable_lines = { + lineno - lines_offset for _, _, lineno in function.__code__.co_lines() if lineno is not None + } + + pprint_line("".join(decorators).strip(), start="") + pprint_line(f"def {function.__name__}():", start="") + + # init shell + shell = InteractiveShell() + # update shall with function __globals__ + # + shell.user_global_ns.update(function.__globals__) + _patch_shell_exc_info(shell, function, lines_offset) + + current_statement = [] + results = {} + for i, line in enumerate(code_lines.splitlines()): + current_statement.append(line) + if "# i" in line or not line or line.lstrip().startswith("#"): + end = line.find("# i") + if end == -1: + end = None + pprint_line(line[:end]) + if i not in executable_lines: + continue + if "# t" in line or (not longest_line_length and "# i" not in line): + try: + output_line, params = line.split("# t") + except ValueError: + output_line = line + params = "" + match params.split(): + case [case_name, compare_to]: + ... 
+ case [case_name]: + compare_to = case_name.removeprefix("?") if case_name.startswith("?") else None + case_name = None if compare_to else case_name + case []: + compare_to = case_name = None + case _: + raise NotImplementedError(f"Can't parse {line}") + + if case_name is not None and case_name in results: + raise NameError(f"case {case_name} was already defined above") + if compare_to is not None: + try: + compare_to_result = results[compare_to] + except KeyError: + raise NameError(f"Can't compare to {compare_to!r}, case is not defined") + else: + compare_to_result = None + + pprint_line(output_line, end=" " * (longest_line_length - len(output_line) - 4) + "# ") + # run timeit first + if profile: + profiler = pyinstrument.Profiler(interval=0.001) + profiler.start() + try: + n = "" + if number is not None: + n = f"-n {number}" + + result: TimeitResult = shell.run_line_magic( + "timeit", f"-q -o {n} -r {repeats} {line}" + ) + except Exception: + raise + # we'll deal with it below, when executing the line in shell context + else: + if profile: + profiler.stop() + results[case_name] = result + # so much for private code... 
+ # TODO: should probably just vendor these parts + out = f"~{result.best * 1000:.5F} ms" + if case_name: + out += f" ({case_name})" + if compare_to_result: + if compare_to_result.best > result.best: + out += f": {compare_to_result.best / result.best:.2F} times faster than {compare_to}" + else: + out += f": {result.best / compare_to_result.best:.2F} times slower than {compare_to}" + # result = "{mean}".format( + # mean=, + # std=_format_time(self.stdev, self._precision), + # ) + pprint_line(str(out), start="") + fr: FrameRecordType + if profile: + # root = profiler.last_session.root_frame(trim_stem=False) + print(InlineRenderer(color=True, show_all=True).render(profiler.last_session)) + + out = shell.run_cell("\n".join(current_statement), silent=True) + if not out.success: + return + + return + + +def timesup(*fn, profile=False, repeats=7, number=None): + if not fn: + return partial(timesup, profile=profile, repeats=repeats, number=number) + assert len(fn) == 1 + + # def proxy(): + run(*fn, profile=profile, repeats=repeats, number=number) + return None + + # return proxy + + +class C: + pass