diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81f56dd..7ddd490 100755
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,87 +1,23 @@
-name: CI
-on:
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
+name: Pytest
 
-permissions:
-  contents: write # needed for gh release upload
+on: [push]
 
 jobs:
   build:
-    name: Build and Test Package
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.14"]
-        os: [ubuntu-latest]
-
+        python-version: ["3.13", "3.14"]
     steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-
+      - uses: actions/checkout@v5
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
-
-      - name: Cache Python Dependencies
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ runner.os }}-pip-${{ matrix.python-version }}
-          restore-keys: |
-            ${{ runner.os }}-pip-
-
-      - name: Install Build Tools
+      - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip build
-          python -m pip install pybind11
-
-      - name: Lint Code
-        uses: wearerequired/lint-action@v2.3.0
-        with:
-          linters: |
-            pylint
-
-      - name: Build Wheel
-        run: |
-          python -m build --wheel
-          ls -al dist
-
-      - name: Upload Built Wheels (artifact)
-        uses: actions/upload-artifact@v4
-        with:
-          name: binaryparser-wheels
-          path: dist/*.whl
-          if-no-files-found: error
-
-      - name: Ensure release exists
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          gh release view v1.2.0 >/dev/null 2>&1 || gh release create v1.2.0 -t "v1.2.0" -n ""
-
-      - name: Upload Release Asset(s)
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          ls -al dist
-          gh release upload v1.2.0 dist/*cp314*.whl --clobber
-
-      - name: Install Built Package
-        run: pip install dist/*.whl
-
-      - name: Run Tests
-        # Remove PYTHONPATH so tests import the installed wheel, not files from the repo root
-        run: |
-          python -m pip install pytest
-          pytest tests/
-
-      - name: Show Python Environment (Debug)
-        if: failure()
+          pip install --upgrade pip
+          pip install .[dev]
+      - name: Test the code with pytest
        run: |
-          python --version
-          pip list
-          ls -R
+          pytest .
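Note: `pip install .[dev]` presumes a `dev` extra is declared in the packaging metadata, and `pytest .` picks up whatever tests live in the tree. A minimal smoke test of the kind this step would now run (file name hypothetical; the function names come from binary_parser/__init__.py further down in this diff):

# tests/test_api.py -- hypothetical smoke test; names taken from binary_parser/__init__.py
import binary_parser

def test_public_api_is_exposed():
    # The package promises these entry points via __all__.
    assert callable(binary_parser.read_chromatograms)
    assert callable(binary_parser.plot_chromatograms)
    assert callable(binary_parser.read_chemstation_file)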
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..f85a2e3
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,24 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.14"]
+    steps:
+      - uses: actions/checkout@v5
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[dev]
+          pip install pylint
+      - name: Analysing the code with pylint
+        run: |
+          pylint binary_parser
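The workflow gates on the `fail-under=5` threshold from the .pylintrc added below. The same check can be reproduced locally through pylint's programmatic entry point (a sketch; `Run(..., exit=False)` and `linter.stats.global_note` exist in recent pylint releases, verify against the pinned version):

# Local equivalent of the workflow's lint step.
from pylint.lint import Run

# exit=False keeps Run() from calling sys.exit(), so the score can be inspected.
results = Run(["binary_parser", "--fail-under=5"], exit=False)
print(results.linter.stats.global_note)  # the 0-10 score pylint reports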
diff --git a/.gitignore b/.gitignore
index e59227a..915a744 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+.idea/
 bin/TestDaten
 .development
 tests/__pycache__
+/BinaryParser.egg-info/
+/build/
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..ab1f416
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Ignored default folder with query files
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/BinaryParser.iml b/.idea/BinaryParser.iml
new file mode 100644
index 0000000..435b8ab
--- /dev/null
+++ b/.idea/BinaryParser.iml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..491ef7d
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,17 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..04db826
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..d1a7504
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..f9d79c4
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,638 @@
+[MAIN]
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Clear in-memory caches upon conclusion of linting. Useful if running pylint
+# in a server-like mode.
+clear-cache-post-run=no
+
+# Load and enable all available extensions. Use --list-extensions to see a list
+# all available extensions.
+#enable-all-extensions=
+
+# In error mode, messages with a category besides ERROR or FATAL are
+# suppressed, and no reports are done by default. Error mode is compatible with
+# disabling specific errors.
+#errors-only=
+
+# Always return a 0 (non-error) status code, even if lint errors are found.
+# This is primarily useful in continuous integration scripts.
+#exit-zero=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-allow-list=
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+extension-pkg-whitelist=
+
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+fail-on=
+
+# Specify a score threshold under which the program will exit with error.
+fail-under=5
+
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+#from-stdin=
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=CVS,
+       venv
+
+# Add files or directories matching the regular expressions patterns to the
+# ignore-list. The regex matches against paths and can be in Posix or Windows
+# format. Because '\\' represents the directory delimiter on Windows systems,
+# it can't be used as an escape character.
+ignore-paths=
+
+# Files or directories matching the regular expression patterns are skipped.
+# The regex matches against base names, not paths. The default value ignores
+# Emacs file locks
+ignore-patterns=^\.#
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.10
+
+# Discover python modules and packages in the file system subtree.
+recursive=no
+
+# Add paths to the list of the source roots. Supports globbing patterns. The
+# source root is an absolute path or a path relative to the current working
+# directory used to determine a package namespace for modules located under the
+# source root.
+source-roots=
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# In verbose mode, extra non-checker-related info will be displayed.
+#verbose=
+
+[MASTER]
+disable=
+    C0114,
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+          bar,
+          baz,
+          toto,
+          tutu,
+          tata
+
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+bad-names-rgxs=
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+#class-attribute-rgx=
+
+# Naming style matching correct class constant names.
+class-const-naming-style=UPPER_CASE
+
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+#class-const-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style. If left empty, class names will be checked with the set naming style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,
+           j,
+           k,
+           ex,
+           Run,
+           _
+
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+good-names-rgxs=
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Regular expression matching correct type alias names. If left empty, type
+# alias names will be checked with the set naming style.
+#typealias-rgx=
+
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+#typevar-rgx=
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
+#variable-rgx=
+
+
+[CLASSES]
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp,
+                      asyncSetUp,
+                      __post_init__
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+
+[DESIGN]
+
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+exclude-too-few-public-methods=
+
+# List of qualified class names to ignore when counting class parents (see
+# R0901)
+ignored-parents=
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=8
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=0
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=builtins.BaseException,builtins.Exception
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=150
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow explicit reexports by alias from a package __init__.
+allow-reexport-from-package=no
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=
+
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
+ext-import-graph=
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
+import-graph=
+
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=HIGH,
+           CONTROL_FLOW,
+           INFERENCE,
+           INFERENCE_FAILURE,
+           UNDEFINED
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        use-implicit-booleaness-not-comparison-to-string,
+        use-implicit-booleaness-not-comparison-to-zero
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=
+
+
+[METHOD_ARGS]
+
+# List of qualified names (i.e., library.method) which require a timeout
+# parameter e.g. 'requests.api.get,requests.api.post'
+timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+      XXX,
+      TODO
+
+# Regular expression of note tags to take in consideration.
+notes-rgx=
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+msg-template=
+
+# Set the output format. Available formats are: text, parseable, colorized,
+# json2 (improved json format), json (old json format) and msvs (visual
+# studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+#output-format=
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[SIMILARITIES]
+
+# Comments are removed from the similarity computation
+ignore-comments=yes
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings=yes
+
+# Imports are removed from the similarity computation
+ignore-imports=yes
+
+# Signatures are removed from the similarity computation
+ignore-signatures=yes
+
+# Minimum lines number of a similarity.
+min-similarity-lines=6
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. No available dictionaries : You need to install
+# both the python package and the system dependency for enchant to work.
+spelling-dict=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of symbolic message names to ignore for Mixin members.
+ignored-checks-for-mixins=no-member,
+                          not-async-context-manager,
+                          not-context-manager,
+                          attribute-defined-outside-init
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+# Regex pattern to define which classes are considered mixins.
+mixin-class-rgx=.*[Mm]ixin
+
+# List of decorators that change the signature of a decorated function.
+signature-mutators=
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of names allowed to shadow builtins
+allowed-redefined-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
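The `evaluation` option above is a plain Python expression over pylint's message counters. A worked check with invented counts shows how far below 10 a run can drift and still clear the `fail-under=5` gate:

# Worked example of the .pylintrc evaluation formula (hypothetical counts).
fatal, error, warning, refactor, convention, statement = 0, 2, 10, 3, 5, 100

score = max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
print(score)       # (5*2 + 10 + 3 + 5) / 100 * 10 = 2.8 penalty -> 7.2
print(score >= 5)  # True -> the Pylint workflow passes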
"##SOFTWARE_VERSION=11.43" -# [20] "##CONCENTRATION_SALT=1" -# [21] "##FIRSTX=0.013841687701642513" -# [22] "##LASTX=-0.0004615047946572304" -# Voltage -# [23] "##MINX=-0.0009872515220195055" --> Set in device -# [24] "##MAXX=2.699223756790161" --> Set in device -# Current -# [25] "##MINY=-0.006186341957729539" -# [26] "##MAXY=0.007231874746488927" -# [27] "##NPOINTS=27030" -# [28] "##FIRSTY=3.6861687898635866e-05" -# [31] "##XYPOINTS=(XY..XY)" -# [32] "0.013841687701642513, 3.6861687898635866e-05" -# [33] "0.015812207013368607, 0.0002835869200211467" -# [34] "0.018651776015758514, 0.0006201966994459773" -# [35] "0.021376779302954674, 0.0009041020984360451" -# [36] "0.02406417950987816, 0.001166420330971846" -# [37] "0.026055805385112762, 0.0013446788548112476" -# [38] "0.02804425358772278, 0.001510192135544381" -library(ggplot2) -library(reshape2) -Rcpp::sourceCpp("./bin/bin.cpp") - -# NOTE: print raw buffer pasted -pr <- function(v, sep) { - v <- as.character(v) - v <- paste(v, collapse = "") - cat(v, sep) -} - -vec_pad <- function(vec, size) { - if (length(vec) < size) { - stop("Vector is too small") - } - if (length(vec) / size == 0) { - return(vec) - } else { - times <- length(vec) %/% size - return(vec[1:(times * size)]) - } -} - -bin <- R6::R6Class( - "bin", - public = list( - path = NULL, - raw = NULL, - raw_subset = NULL, - endian = "big", # NOTE: most devices are constructed for windows - signed = FALSE, - initialize = function(path) { - self$path <- path - con <- file(path, "rb") - size <- file.info(path)$size - self$raw <- readBin(con, what = "raw", n = size) - close(con) - }, - to_char = function(elem) { - rawToChar(elem) - }, - to_int8 = function(elem) { - res <- NULL - if (self$signed) { - res <- rawToChar(elem) |> as.integer() - } else { - res <- rawToChar(elem) |> CastToUint8() - } - return(res) - }, - to_int16 = function(vec) { - op <- NULL - if (self$signed) { - op <- CastToInt16 - } else { - op <- CastToUint16 - } - vec <- as.character(vec) - vec <- vec_pad(vec, 2) - vec <- split(vec, rep(seq_len(length(vec) / 2), each = 2)) - res <- lapply(vec, function(x) { - op(x) - }) - names <- sapply(vec, function(i) { - paste(i, collapse = "") - }) - names(res) <- names - res - }, - to_int32 = function(vec) { - op <- NULL - if (self$signed) { - op <- CastToInt32 - } else { - op <- CastToUint32 - } - vec <- as.character(vec) - vec <- vec_pad(vec, 4) - vec <- split(vec, rep(seq_len(length(vec) / 4), each = 4)) - res <- lapply(vec, op) - names <- sapply(vec, function(i) { - paste(i, collapse = "") - }) - names(res) <- names - res - }, - print_char = function(idx) { - cat(" ") - for (i in idx:(idx + 7)) { - temp <- self$to_char(self$raw_subset[i]) - if (temp == "") temp <- "." - cat(temp, "\t") - } - cat("\n") - }, - print_uint8 = function(idx) { - cat(" ") - for (i in idx:(idx + 7)) { - temp <- self$to_int8(self$raw_subset[i]) - if (temp == "") temp <- "." 
- cat(temp, "\t") - } - cat("\n") - }, - print_uint16 = function(idx) { - cat(" ") - temp <- self$to_int16(self$raw_subset[idx:(idx + 7)]) - for (i in seq_along(temp)) { - pr(names(temp)[i], "\t") - cat(temp[[i]], "\t") - } - cat("\n") - }, - print_uint32 = function(idx) { - cat(" ") - temp <- self$to_int32(self$raw_subset[idx:(idx + 7)]) - for (i in seq_along(temp)) { - pr(names(temp)[i], "\t\t") - cat(temp[[i]], "\t") - } - cat("\n") - }, - print = function(range) { - self$raw_subset <- self$raw[range] - for (i in seq_along(self$raw_subset)) { - cat(self$raw_subset[i], "\t") - if (i %% 8 == 0) { - cat("\n") - self$print_char(i - 7) - self$print_uint8(i - 7) - self$print_uint16(i - 7) - self$print_uint32(i - 7) - cat("\n") - cat("Elems: ", i + 1, " - ", i + 8, "\n") - } - } - }, - plot = function(range, type, op = NULL) { - self$raw_subset <- self$raw[range] - x <- NULL - y <- NULL - if (type == "int8") { - y <- sapply(self$raw_subset, self$to_int8) - x <- as.character(self$raw_subset) - } else if (type == "int16") { - res <- self$to_int16(self$raw_subset) - x <- names(res) - y <- unlist(res) - attributes(y) <- NULL - } else if (type == "int32") { - res <- self$to_int32(self$raw_subset) - x <- names(res) - y <- unlist(res) - attributes(y) <- NULL - } else { - stop("found unknown type") - } - if (!is.null(op)) { - stopifnot("op has to be a function" = is.function(op)) - y <- op(y) - } - colors <- rep(c("black", "darkred"), length.out = length(y)) - bp <- barplot(y, names.arg = NULL, col = colors, border = "black") - text( - x = bp, - y = par("usr")[3] - 1, - labels = x, srt = 90, - cex = 0.75, - adj = 1, xpd = TRUE - ) - } - ) -) - -path <- "./bin/TestDaten/CV_new/EL_2Elec_4p79mg_Et4NBF4inACN_EDLC_Pl2_211024_05_CV_C11.mpr" -b <- bin$new(path) -b$print(1:100) - -b$signed <- FALSE -range <- 7200:8001 -range <- 7247:(7247 + 400 * 4 - 1) -b$plot(range, "int16") - -int_values <- b$to_int16(b$raw) |> unlist() -summary(int_values) -length(int_values) - -# Determine cycle length -acf(int_values, lag.max = 10000, main = "Autocorrelation of Data") -acf_values <- acf(int_values, lag.max = 10000, plot = FALSE)$acf[-1] -lag_indices <- which(acf_values > 0.5)[1] -print(lag_indices) - -# Plot raw data -indices <- seq(1, length(int_values), by = 80) # 100 -data_start <- 0x1c40 # From hexdump -values <- int_values[data_start:(data_start + 10000)] -plot(values, type = "l", ylim = c(0, 10000)) -abline(v = seq(1, length(values), by = 47), col = "red", lty = 2) -points(values, pch = 19) - - -# Print bits -bit_matrix <- uint16_to_bit_matrix(int_values) -df <- melt(bit_matrix) -colnames(bit_matrix) <- paste0("B_", 15:0) -colnames(df) <- c("Idx", "Bit_Pos", "Value") -df$Bit_Position <- as.numeric(gsub("B_", "", df$Bit_Pos)) -ggplot(df, aes(x = Bit_Pos, y = Idx, fill = Value)) + - geom_tile(width = 1, height = 5.5) + - scale_fill_gradient(low = "white", high = "black") + - scale_x_reverse(breaks = 15:0) + - labs( - x = "Bit Position", y = "Number Index", - title = "Bit Pattern of int Values" - ) + - theme_minimal() diff --git a/bin/bin.cpp b/bin/bin.cpp deleted file mode 100644 index 8dd16a9..0000000 --- a/bin/bin.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include -#include -#include -#include - -// [[Rcpp::export]] -std::uint8_t CastToUint8(std::string &buffer) { - const char *b = buffer.c_str(); - std::uint8_t res = *reinterpret_cast(b); - return res; -} - -// [[Rcpp::export]] -int CastToInt8(std::string &buffer) { - const char *b = buffer.c_str(); - std::int8_t res = *reinterpret_cast(b); - return 
static_cast(res); -} - -uint8_t hexToByte(const std::string &hex) { - return static_cast(std::stoul(hex, nullptr, 16)); -} - -// [[Rcpp::export]] -std::uint16_t CastToUint16(Rcpp::CharacterVector buffer) { - if (buffer.size() != 2) { - std::cout << "size = " << buffer.size() << std::endl; - Rcpp::stop("Expected exactly 2 hex strings representing bytes."); - } - uint8_t byte1 = hexToByte(Rcpp::as(buffer[0])); - uint8_t byte2 = hexToByte(Rcpp::as(buffer[1])); - uint16_t res = (byte1 << 8) | byte2; - return res; -} - -// [[Rcpp::export]] -int16_t CastToInt16(Rcpp::CharacterVector buffer) { - if (buffer.size() != 2) { - std::cout << "size = " << buffer.size() << std::endl; - Rcpp::stop("Expected exactly 2 hex strings representing bytes."); - } - uint8_t byte1 = hexToByte(Rcpp::as(buffer[0])); - uint8_t byte2 = hexToByte(Rcpp::as(buffer[1])); - int16_t res = (static_cast(byte1) << 8) | byte2; - return res; -} - -// [[Rcpp::export]] -std::uint32_t CastToUint32(Rcpp::CharacterVector buffer) { - if (buffer.size() != 4) { - Rcpp::stop("Expected exactly 4 hex strings representing bytes."); - } - uint8_t byte1 = hexToByte(Rcpp::as(buffer[0])); - uint8_t byte2 = hexToByte(Rcpp::as(buffer[1])); - uint8_t byte3 = hexToByte(Rcpp::as(buffer[2])); - uint8_t byte4 = hexToByte(Rcpp::as(buffer[3])); - uint32_t res = (byte1 << 24) | (byte2 << 16) | (byte3 << 8) | byte4; - return res; -} - -// [[Rcpp::export]] -std::int32_t CastToInt32(Rcpp::CharacterVector buffer) { - if (buffer.size() != 4) { - Rcpp::stop("Expected exactly 4 hex strings representing bytes."); - } - uint8_t byte1 = hexToByte(Rcpp::as(buffer[0])); // unsigned - uint8_t byte2 = hexToByte(Rcpp::as(buffer[1])); - uint8_t byte3 = hexToByte(Rcpp::as(buffer[2])); - uint8_t byte4 = hexToByte(Rcpp::as(buffer[3])); - uint32_t ures = (static_cast(byte1) << 24) | - (static_cast(byte2) << 16) | - (static_cast(byte3) << 8) | byte4; - return static_cast(ures); -} - -// [[Rcpp::export]] -Rcpp::IntegerMatrix uint16_to_bit_matrix(Rcpp::IntegerVector values) { - int n = values.size(); - Rcpp::IntegerMatrix bit_matrix(n, 16); - for (int i = 0; i < n; i++) { - std::bitset<16> bits(values[i]); - for (int j = 0; j < 16; j++) { - bit_matrix(i, j) = bits[15 - j]; - } - } - return bit_matrix; -} diff --git a/binary_parser/__init__.py b/binary_parser/__init__.py new file mode 100644 index 0000000..651b0f0 --- /dev/null +++ b/binary_parser/__init__.py @@ -0,0 +1,5 @@ +from binary_parser.hplc import read_chromatograms, plot_chromatograms +from binary_parser.chemstation import read_chemstation_file +from binary_parser.openlab import read_attr, read_lc, read_ms + +__all__ = ["read_chromatograms", "plot_chromatograms", "read_chemstation_file"] diff --git a/binary_parser/chemstation/__init__.py b/binary_parser/chemstation/__init__.py new file mode 100644 index 0000000..7928a24 --- /dev/null +++ b/binary_parser/chemstation/__init__.py @@ -0,0 +1,6 @@ + +from binary_parser.chemstation.read_ms_file import read_chemstation_file + +__all__ = [ + 'read_chemstation_file', +] diff --git a/build/lib.linux-x86_64-cpython-312/chemstation/read_ms_file.py b/binary_parser/chemstation/read_ms_file.py similarity index 88% rename from build/lib.linux-x86_64-cpython-312/chemstation/read_ms_file.py rename to binary_parser/chemstation/read_ms_file.py index f037b2e..84bda27 100644 --- a/build/lib.linux-x86_64-cpython-312/chemstation/read_ms_file.py +++ b/binary_parser/chemstation/read_ms_file.py @@ -1,7 +1,5 @@ -import parser_ms as pm +import binary_parser.helper.parser_ms as pm import pandas as 
pd -import plotly.graph_objs as go -import plotly.express as px from typeguard import typechecked from typing import List @@ -32,5 +30,4 @@ def merge_cycles_into_df(cycles: List[dict]) -> pd.DataFrame: @typechecked def read_chemstation_file(file_path: str) -> pd.DataFrame: cycles = pm.read_cycles(file_path) - cycle_dfs = convert_cycles_to_dfs(cycles) return merge_cycles_into_df(cycles) diff --git a/bin/bin.py b/binary_parser/helper/__init__.py similarity index 100% rename from bin/bin.py rename to binary_parser/helper/__init__.py diff --git a/binary_parser/helper/parser_hplc.py b/binary_parser/helper/parser_hplc.py new file mode 100644 index 0000000..4fbcf69 --- /dev/null +++ b/binary_parser/helper/parser_hplc.py @@ -0,0 +1,179 @@ +import struct +import numpy as np + + +def _u16_be_to_host(v): + return struct.unpack(">H", struct.pack(">H", v))[0] + + +def _u32_be_to_host(v): + return struct.unpack(">I", struct.pack(">I", v))[0] + + +def _i16_be(stream): + data = stream.read(2) + if not data: + raise EOFError + return struct.unpack(">h", data)[0] + + +def _i32_be(stream): + data = stream.read(4) + if not data: + raise EOFError + return struct.unpack(">i", data)[0] + + +def _u32_be(stream): + data = stream.read(4) + if not data: + raise EOFError + return struct.unpack(">I", data)[0] + + +def readInt(filepath, offset): + with open(filepath, "rb") as f: + f.seek(offset) + data = f.read(4) + return struct.unpack("d", data)[0] # inverted endian + return val + + +def readUint8(filepath, offset): + with open(filepath, "rb") as f: + f.seek(offset) + out = f.read(40) + return [chr(b) for b in out] + + +def readTime(filepath, offset): + out = np.zeros(2, dtype=float) + with open(filepath, "rb") as f: + f.seek(offset) + for i in range(2): + raw = _i32_be(f) + out[i] = raw / 60000.0 + return out + + +def DeltaCompression(filepath, offset, n=None): + with open(filepath, "rb") as f: + f.seek(offset) + res = [] + prev = 0 + + while True: + try: + header = _i16_be(f) + except EOFError: + break + + # C++: if (buffer1 << 12 == 0) + if (header << 12) == 0: + break + + count = header & 4095 + + for _ in range(count): + try: + delta = _i16_be(f) + except EOFError: + break + + if delta != -32768: + prev += delta + res.append(prev) + else: + prev = _i32_be(f) + res.append(prev) + + return np.array(res, dtype=np.int32) + + +class UVClass: + """ + Python-port of UVClass in pybind11 module + """ + + def __init__(self, filepath): + self.filepath = filepath + + # read nscans + nscans = _read_int32_be(filepath, 0x116) + + self.time = np.zeros(nscans, dtype=float) + self.wavelengths = [] + self.ndata = [] + + with open(filepath, "rb") as f: + offset = 0x1002 + prev_buffer7 = 0 + + for scan in range(nscans): + f.seek(offset) + + # buffer1 + size = struct.unpack("i", data)[0] diff --git a/binary_parser/helper/parser_ms.py b/binary_parser/helper/parser_ms.py new file mode 100644 index 0000000..02e3b5e --- /dev/null +++ b/binary_parser/helper/parser_ms.py @@ -0,0 +1,85 @@ +import struct +import numpy as np + + +def _read_file(path): + with open(path, "rb") as f: + return f.read() + + +def _u16_be(buf, offset): + return struct.unpack(">H", buf[offset:offset + 2])[0] + + +def _u32_be(buf, offset): + return struct.unpack(">I", buf[offset:offset + 4])[0] + + +def _find_number_of_cycles(buf): + return _u32_be(buf, 0x116) + + +def _find_data_start(buf): + offset_correction = _u16_be(buf, 0x10A) + return offset_correction * 2 - 2 + + +def _convert_mz_intensity(data_u16): + n = len(data_u16) + n -= n % 2 + mz = np.zeros(n // 2, 
diff --git a/binary_parser/helper/parser_ms.py b/binary_parser/helper/parser_ms.py
new file mode 100644
index 0000000..02e3b5e
--- /dev/null
+++ b/binary_parser/helper/parser_ms.py
@@ -0,0 +1,85 @@
+import struct
+import numpy as np
+
+
+def _read_file(path):
+    with open(path, "rb") as f:
+        return f.read()
+
+
+def _u16_be(buf, offset):
+    return struct.unpack(">H", buf[offset:offset + 2])[0]
+
+
+def _u32_be(buf, offset):
+    return struct.unpack(">I", buf[offset:offset + 4])[0]
+
+
+def _find_number_of_cycles(buf):
+    return _u32_be(buf, 0x116)
+
+
+def _find_data_start(buf):
+    offset_correction = _u16_be(buf, 0x10A)
+    return offset_correction * 2 - 2
+
+
+def _convert_mz_intensity(data_u16):
+    n = len(data_u16)
+    n -= n % 2
+    mz = np.zeros(n // 2, dtype=float)
+    intensity = np.zeros(n // 2, dtype=float)
+
+    for i in range(n):
+        if (i & 1) == 0:
+            # MZ
+            mz[i >> 1] = data_u16[i] / 20.0
+        else:
+            # Intensity encoding: head = bits 14-15, tail = bits 0-13
+            head = data_u16[i] >> 14
+            tail = data_u16[i] & 0x3FFF
+            intensity[i >> 1] = (8 ** head) * tail
+
+    return mz, intensity
+
+
+def _read_cycle(buf, start, cycle_size):
+    data_u16 = []
+    for i in range(cycle_size * 2):
+        data_u16.append(_u16_be(buf, start + i * 2))
+    return _convert_mz_intensity(data_u16)
+
+
+def read_cycles(path):
+    buf = _read_file(path)
+    data_start = _find_data_start(buf)
+    num_cycles = _find_number_of_cycles(buf)
+
+    cycles = []
+    counter = data_start
+
+    for _ in range(num_cycles):
+        if counter >= len(buf):
+            raise ValueError("Error extracting data")
+
+        counter += 2  # skip?
+        time = _u32_be(buf, counter)
+        counter += 10
+
+        cycle_size = _u16_be(buf, counter)
+        counter += 6
+
+        mz, intensity = _read_cycle(buf, counter, cycle_size)
+
+        rt = time / 60000.0
+
+        counter += cycle_size * 4
+        counter += 10
+
+        cycles.append({
+            "mz": mz,
+            "intensity": intensity,
+            "retention_time": rt,
+        })
+
+    return cycles
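The packing in `_convert_mz_intensity` stores one point as two big-endian uint16 words: the even word is m/z in 1/20 Da steps, the odd word holds a 2-bit base-8 exponent and a 14-bit mantissa. Decoding one invented word pair by hand:

# Worked example of the m/z / intensity word encoding (values made up).
mz_word, int_word = 0x0E42, 0x8A2C

print(mz_word / 20.0)                    # 3650 / 20 = 182.5 -> m/z in Da
head, tail = int_word >> 14, int_word & 0x3FFF
print((8 ** head) * tail)                # 8**2 * 2604 = 166656 counts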
+ """ + with open(filepath, "rb") as f: + buf = f.read() + + width = 8 + addr = 0 + + for i in range(0, len(buf), width): + chunk = buf[i:i+width] + + # address line (hex) + print(" ".join(f"{addr+j:03x}" for j in range(len(chunk)))) + addr += len(chunk) + + # character view + print(" ".join(f"'{chr(c)}'" for c in chunk)) diff --git a/binary_parser/helper/utils.py b/binary_parser/helper/utils.py new file mode 100644 index 0000000..bf352bc --- /dev/null +++ b/binary_parser/helper/utils.py @@ -0,0 +1,4 @@ +from typing import Union, List +import numpy as np + +NumList = Union[List[float], np.ndarray] \ No newline at end of file diff --git a/binary_parser/hplc/__init__.py b/binary_parser/hplc/__init__.py new file mode 100644 index 0000000..4fcd0b8 --- /dev/null +++ b/binary_parser/hplc/__init__.py @@ -0,0 +1,8 @@ +from binary_parser.hplc.read_files import read_chromatograms, plot_chromatograms, read_uv, plot_uv + +__all__ = [ + 'read_chromatograms', + 'plot_chromatograms', + 'read_uv', + 'plot_uv' +] \ No newline at end of file diff --git a/hplc/read_files.py b/binary_parser/hplc/read_files.py similarity index 89% rename from hplc/read_files.py rename to binary_parser/hplc/read_files.py index ce15916..a2079f1 100644 --- a/hplc/read_files.py +++ b/binary_parser/hplc/read_files.py @@ -1,13 +1,12 @@ -import parser_hplc as ph +import binary_parser.helper.parser_hplc as ph import pandas as pd -import numpy as np import plotly.graph_objs as go import plotly.express as px import re -import sys from os import listdir from os.path import isfile, join from typeguard import typechecked +from binary_parser.helper.utils import NumList from typing import List @@ -23,9 +22,9 @@ def check_identical_lists(lst: List[List[float]]) -> bool: @typechecked -def read_time(file_path: str, length: int) -> List[float]: +def read_time(file_path: str, length: int) -> NumList: offsetTime = int("0000011a", 16) - time: List[float] = ph.readTime(file_path, offsetTime) + time:NumList = ph.readTime(file_path, offsetTime) step_size: float = (time[1] - time[0]) / (length - 1) res: List[float] = [time[0] + i * step_size for i in range(length)] return res @@ -42,7 +41,7 @@ def read_file_info(file_path: str) -> int: @typechecked -def scale_data(file_path: str, l: List[int]) -> List[float]: +def scale_data(file_path: str, l: NumList) -> NumList: intercept: float = ph.readDouble(file_path, 4724) slope: float = ph.readDouble(file_path, 4732) res: List[float] = [float(i) * slope + intercept for i in l] @@ -58,8 +57,8 @@ def read_chromatograms(path: str) -> pd.DataFrame: ] wavelengths: List[str] = ["Wavelength_" + str(read_file_info(i)) for i in files] offset: int = int("00001800", 16) - result: List[List[int]] = [ph.DeltaCompresion(i, offset, 12) for i in files] - result_scaled: List[List[float]] = [ + result: List[NumList] = [ph.DeltaCompression(i, offset, 12) for i in files] + result_scaled: List[NumList] = [ scale_data(files[i], result[i]) for i in range(0, len(result)) ] times: List[List[float]] = [read_time(i, len(result[0])) for i in files] diff --git a/binary_parser/openlab/__init__.py b/binary_parser/openlab/__init__.py new file mode 100644 index 0000000..85652de --- /dev/null +++ b/binary_parser/openlab/__init__.py @@ -0,0 +1,3 @@ +from binary_parser.openlab.openlab import read_attr, read_lc, read_ms + +__all__ = ["read_attr", "read_lc", "read_ms"] diff --git a/openlab/openlab.py b/binary_parser/openlab/openlab.py similarity index 70% rename from openlab/openlab.py rename to binary_parser/openlab/openlab.py index 
diff --git a/binary_parser/openlab/__init__.py b/binary_parser/openlab/__init__.py
new file mode 100644
index 0000000..85652de
--- /dev/null
+++ b/binary_parser/openlab/__init__.py
@@ -0,0 +1,3 @@
+from binary_parser.openlab.openlab import read_attr, read_lc, read_ms
+
+__all__ = ["read_attr", "read_lc", "read_ms"]
diff --git a/openlab/openlab.py b/binary_parser/openlab/openlab.py
similarity index 70%
rename from openlab/openlab.py
rename to binary_parser/openlab/openlab.py
index b4d27bb..e4a6da4 100755
--- a/openlab/openlab.py
+++ b/binary_parser/openlab/openlab.py
@@ -1,13 +1,18 @@
 import os
+import re
+from typing import List
+
 import netCDF4 as nc
-import pandas as pd
 import numpy as np
-import re
+import pandas as pd
 from typeguard import typechecked
-from typing import List
+
 
 @typechecked
 def get_files(path: str) -> List[str]:
+    """
+    Return list of .cdf files in the given directory, sorted naturally.
+    """
     fs = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".cdf")]
     assert fs, "No files found"
+
     def natkey(p: str):
 
 # Attributes
 @typechecked
-def get_attr(path: str):
+def _get_attr(path: str):
+    """Read global NetCDF attributes from a file."""
     with nc.Dataset(path, "r") as dataset:
         attr = {key: dataset.getncattr(key) for key in dataset.ncattrs()}
     return attr
@@ -31,14 +37,23 @@
 @typechecked
 def read_attr(path: str) -> pd.DataFrame:
+    """
+    Read all NetCDF global attributes across all .cdf files in the directory.
+    Returns a normalized DataFrame.
+    """
     fs = get_files(path)
-    attrs_lc = [pd.DataFrame([get_attr(fs[x])]) for x in range(len(fs))]
+    attrs_lc = [pd.DataFrame([_get_attr(fs[x])]) for x in range(len(fs))]
     attrs_lc = pd.concat(attrs_lc, ignore_index=True)
     return attrs_lc
 
+
+# ---------------------------------------------------------------------------
 # LC Data
+# ---------------------------------------------------------------------------
+
 @typechecked
 def get_lc_data(path: str) -> pd.DataFrame:
+    """Read LC detector signals from NetCDF."""
     with nc.Dataset(path, "r") as dataset:
         detector_signals = dataset.variables["ordinate_values"][:]
         global_atts = {key: dataset.getncattr(key) for key in dataset.ncattrs()}
@@ -57,6 +72,7 @@ def get_lc_data(path: str) -> pd.DataFrame:
 
 @typechecked
 def process_detector_info(df_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
+    """Extract wavelength from LC detector metadata."""
     for df in df_list:
         detector_name = df.attrs.get("detector", "")
         wl_match = (
@@ -71,6 +87,7 @@ def process_detector_info(df_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
 
 @typechecked
 def read_lc(path: str) -> pd.DataFrame:
+    """Read all LC files containing 'DAD' in filename and concatenate."""
     fs = get_files(path)
     # Filter fs --> Files which contain DAD within their name
     fs = [f for f in fs if "DAD" in os.path.basename(f)]
@@ -80,16 +97,19 @@ def read_lc(path: str) -> pd.DataFrame:
     return df
 
 
+# ---------------------------------------------------------------------------
 # MS Data
+# ---------------------------------------------------------------------------
+
 @typechecked
-def get_point_counts(path: str) -> np.ma.MaskedArray:
+def _get_point_counts(path: str) -> np.ma.MaskedArray:
     with nc.Dataset(path, "r") as dataset:
         res = dataset.variables["point_count"][:]
     return res
 
 
 @typechecked
-def get_ms_data(path: str) -> pd.DataFrame:
+def _get_ms_data(path: str) -> pd.DataFrame:
     with nc.Dataset(path, "r") as dataset:
         mz_values = dataset.variables["mass_values"][:]
         intensities = dataset.variables["intensity_values"][:]
@@ -97,14 +117,14 @@ def get_ms_data(path: str) -> pd.DataFrame:
 
 
 @typechecked
-def get_scan_time(path: str) -> np.ma.MaskedArray:
+def _get_scan_time(path: str) -> np.ma.MaskedArray:
     with nc.Dataset(path, "r") as dataset:
         time = dataset.variables["scan_acquisition_time"][:]
     return time / 60
 
 
 @typechecked
-def split_data(
+def _split_data(
     data: pd.DataFrame, point_counts: np.ma.MaskedArray
 ) -> List[pd.DataFrame]:
     end_indices = np.cumsum(point_counts)
@@ -114,7 +134,7 @@ def _split_data(
 
 
 @typechecked
-def normalise(data_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
+def _normalise(data_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
     return [
         df.assign(intensities=df["intensities"] * (100 / df["intensities"].max()))
         for df in data_list
@@ -125,15 +145,15 @@ def _normalise(data_list: List[pd.DataFrame]) -> List[pd.DataFrame]:
 def read_ms(path: str) -> List[pd.DataFrame]:
     fs = get_files(path)
     fs_ms = [f for f in fs if "spectra" in os.path.basename(f)]
-    data_minus = get_ms_data(fs_ms[0])
-    point_counts_minus = get_point_counts(fs_ms[0])
-    time_minus = get_scan_time(fs_ms[0])
-    df_minus = normalise(split_data(data_minus, point_counts_minus))
-
-    data_plus = get_ms_data(fs_ms[1])
-    point_counts_plus = get_point_counts(fs_ms[1])
-    time_plus = get_scan_time(fs_ms[1])
-    df_plus = normalise(split_data(data_plus, point_counts_plus))
+    data_minus = _get_ms_data(fs_ms[0])
+    point_counts_minus = _get_point_counts(fs_ms[0])
+    time_minus = _get_scan_time(fs_ms[0])
+    df_minus = _normalise(_split_data(data_minus, point_counts_minus))
+
+    data_plus = _get_ms_data(fs_ms[1])
+    point_counts_plus = _get_point_counts(fs_ms[1])
+    time_plus = _get_scan_time(fs_ms[1])
+    df_plus = _normalise(_split_data(data_plus, point_counts_plus))
 
     df_minus = pd.concat([df.assign(time=t) for df, t in zip(df_minus, time_minus)])
     df_plus = pd.concat([df.assign(time=t) for df, t in zip(df_plus, time_plus)])
diff --git a/binary_parser/xray/__init__.py b/binary_parser/xray/__init__.py
new file mode 100644
index 0000000..40c9115
--- /dev/null
+++ b/binary_parser/xray/__init__.py
@@ -0,0 +1,4 @@
+from binary_parser.xray.bruker_xray import read_raw
+
+__all__ = ["read_raw"]
+
diff --git a/build/lib.linux-x86_64-cpython-312/xray/bruker_xray.py b/binary_parser/xray/bruker_xray.py
similarity index 96%
rename from build/lib.linux-x86_64-cpython-312/xray/bruker_xray.py
rename to binary_parser/xray/bruker_xray.py
index 009540d..d164664 100644
--- a/build/lib.linux-x86_64-cpython-312/xray/bruker_xray.py
+++ b/binary_parser/xray/bruker_xray.py
@@ -1,7 +1,5 @@
-import parser_xray as px
+import binary_parser.helper.parser_xray as px
 import pandas as pd
-from typeguard import typechecked
-from typing import List
 
 search_for = {
     "GONIOMETER_RADIUS": 217.5,
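The get_files hunk above cuts off before the body of `natkey`. A conventional natural-sort key consistent with the new docstring would look like this (assumed implementation, not taken from the diff):

import re

def natkey(p: str):
    # Split runs of digits so "DAD1B10.cdf" sorts after "DAD1B2.cdf".
    return [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", p)]

print(sorted(["scan10.cdf", "scan2.cdf", "scan1.cdf"], key=natkey))
# ['scan1.cdf', 'scan2.cdf', 'scan10.cdf']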
-from typing import List - - -@typechecked -def get_files(path: str) -> List[str]: - fs = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".cdf")] - assert len(fs) > 0, "No files found" - return fs - - -# Attributes -@typechecked -def get_attr(path: str): - with nc.Dataset(path, "r") as dataset: - attr = {key: dataset.getncattr(key) for key in dataset.ncattrs()} - return attr - - -@typechecked -def read_attr(path: str) -> pd.DataFrame: - fs = get_files(path) - attrs_lc = [pd.DataFrame([get_attr(fs[x])]) for x in range(len(fs))] - attrs_lc = pd.concat(attrs_lc, ignore_index=True) - return attrs_lc - - -# LC Data -@typechecked -def get_lc_data(path: str) -> pd.DataFrame: - with nc.Dataset(path, "r") as dataset: - detector_signals = dataset.variables["ordinate_values"][:] - global_atts = {key: dataset.getncattr(key) for key in dataset.ncattrs()} - detector = global_atts.get("detector_name", "") - run_time_length = dataset.variables["actual_run_time_length"][...] - - data = pd.DataFrame( - { - "RetentionTime": np.linspace(0, run_time_length, num=len(detector_signals)), - "DetectorSignal": detector_signals, - } - ) - data.attrs["detector"] = detector - return data - - -@typechecked -def process_detector_info(df_list: List[pd.DataFrame]) -> List[pd.DataFrame]: - for df in df_list: - detector_name = df.attrs.get("detector", "") - wl_match = ( - re.search(r"\d+", detector_name.split(",")[1]) - if "," in detector_name - else None - ) - wl = float(wl_match.group()) if wl_match else None - df["wavelength"] = wl - return df_list - - -@typechecked -def read_lc(path: str) -> pd.DataFrame: - fs = get_files(path) - # Filter fs --> Files which contain DAD within their name - fs = [f for f in fs if "DAD" in os.path.basename(f)] - df = [get_lc_data(fs[x]) for x in range(len(fs))] - df = process_detector_info(df) - df = pd.concat(df, ignore_index=True) - return df - - -# MS Data -@typechecked -def get_point_counts(path: str) -> List[int]: - with nc.Dataset(path, "r") as dataset: - return dataset.variables["point_count"][:] - - -@typechecked -def get_ms_data(path: str) -> pd.DataFrame: - with nc.Dataset(path, "r") as dataset: - mz_values = dataset.variables["mass_values"][:] - intensities = dataset.variables["intensity_values"][:] - return pd.DataFrame({"mz": mz_values, "intensities": intensities}) - - -@typechecked -def get_scan_time(path: str) -> List[float]: - with nc.Dataset(path, "r") as dataset: - time = dataset.variables["scan_acquisition_time"][:] - return time / 60 - - -@typechecked -def split_data(data: pd.DataFrame, point_counts: List[int]) -> List[pd.DataFrame]: - end_indices = np.cumsum(point_counts) - start_indices = np.insert(end_indices[:-1], 0, 0) - return [data.iloc[start:end] for start, end in zip(start_indices, end_indices)] - - -@typechecked -def normalise(data_list: List[pd.DataFrame]) -> List[pd.DataFrame]: - return [ - df.assign(intensities=df["intensities"] * (100 / df["intensities"].max())) - for df in data_list - ] - - -@typechecked -def read_ms(path: str) -> List[pd.DataFrame]: - fs = get_files(path) - fs_ms = [f for f in fs if "spectra" in os.path.basename(f)] - data_minus = get_ms_data(fs_ms[0]) - point_counts_minus = get_point_counts(fs_ms[0]) - time_minus = get_scan_time(fs_ms[0]) - df_minus = normalise(split_data(data_minus, point_counts_minus)) - - data_plus = get_ms_data(fs_ms[1]) - point_counts_plus = get_point_counts(fs_ms[1]) - time_plus = get_scan_time(fs_ms[1]) - df_plus = normalise(split_data(data_plus, point_counts_plus)) - - df_minus = 
pd.concat([df.assign(time=t) for df, t in zip(df_minus, time_minus)]) - df_plus = pd.concat([df.assign(time=t) for df, t in zip(df_plus, time_plus)]) - return [df_minus, df_plus] diff --git a/build/lib.linux-x86_64-cpython-312/BinaryParser/setup.py b/build/lib.linux-x86_64-cpython-312/BinaryParser/setup.py deleted file mode 100644 index 63173cb..0000000 --- a/build/lib.linux-x86_64-cpython-312/BinaryParser/setup.py +++ /dev/null @@ -1,57 +0,0 @@ -from pybind11.setup_helpers import Pybind11Extension, build_ext -from setuptools import setup, find_packages - -__version__ = "0.0.1" - -ext_modules = [ - Pybind11Extension( - "parser_hplc", - ["src/parser_hplc.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), - Pybind11Extension( - "parser_ms", - ["src/parser_ms.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), - Pybind11Extension( - "parser_xray", - ["src/parser_xray.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), -] - -setup( - name="BinaryParser", - version=__version__, - author="Konrad Krämer", - author_email="konrad.kraemer@kit.edu", - description="Parsing binary files", - long_description="", - ext_modules=ext_modules, - extras_require={"test": "pytest"}, - cmdclass={"build_ext": build_ext}, - zip_safe=False, - python_requires=">=3.7", - # packages=find_packages(), - packages=(["BinaryParser"] + ["openlab"] + find_packages()), - # package_dir={"BinaryParser": "."}, - package_dir={ - "BinaryParser": ".", - "BinaryParser.openlab": "./openlab", - }, - setup_requires=["pybind11"], - install_requires=[ - "pybind11", - "pandas", - "numpy", - "typeguard", - "plotly", - "matplotlib", - "seaborn", - "netCDF4", - ], -) diff --git a/build/lib.linux-x86_64-cpython-312/BinaryParser/test_file.py b/build/lib.linux-x86_64-cpython-312/BinaryParser/test_file.py deleted file mode 100644 index 4a66672..0000000 --- a/build/lib.linux-x86_64-cpython-312/BinaryParser/test_file.py +++ /dev/null @@ -1,12 +0,0 @@ -import BinaryParser as bp -import pandas as pd -import plotly.express as px - -file_path = "/home/konrad/Documents/BinaryParser/Chemstation/ChemStationData/LCMS_DatenAgilent_SVS/SVS_1025F1.D/MSD1.MS" -df = bp.read_chemstation_file(file_path) -df.to_csv("output.csv", index=False) -tic_df = df.groupby("retention_time", as_index=False)["intensity"].sum() -fig = px.line(tic_df, x="retention_time", y="intensity", - title="Total Ion Chromatogram (TIC)", - labels={"retention_time": "Retention Time (min)", "intensity": "Total Ion Intensity"}) -fig.show() diff --git a/build/lib.linux-x86_64-cpython-312/chemstation/__init__.py b/build/lib.linux-x86_64-cpython-312/chemstation/__init__.py deleted file mode 100644 index fbadbd5..0000000 --- a/build/lib.linux-x86_64-cpython-312/chemstation/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ - -from .read_ms_file import read_chemstation_file - -__all__ = [ - 'read_chemstation_file', -] diff --git a/build/lib.linux-x86_64-cpython-312/hplc/__init__.py b/build/lib.linux-x86_64-cpython-312/hplc/__init__.py deleted file mode 100644 index c95f498..0000000 --- a/build/lib.linux-x86_64-cpython-312/hplc/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .read_files import read_chromatograms, plot_chromatograms, read_uv, plot_uv - -__all__ = [ - 'read_chromatograms', - 'plot_chromatograms', - 'read_uv', - 'plot_uv' -] \ No newline at end of file diff --git a/build/lib.linux-x86_64-cpython-312/hplc/read_files.py 
b/build/lib.linux-x86_64-cpython-312/hplc/read_files.py deleted file mode 100644 index ce15916..0000000 --- a/build/lib.linux-x86_64-cpython-312/hplc/read_files.py +++ /dev/null @@ -1,118 +0,0 @@ -import parser_hplc as ph -import pandas as pd -import numpy as np -import plotly.graph_objs as go -import plotly.express as px -import re -import sys -from os import listdir -from os.path import isfile, join -from typeguard import typechecked -from typing import List - - -@typechecked -def check_identical_lists(lst: List[List[float]]) -> bool: - if not lst: - return False - first_sublist: List[float] = lst[0] - for sublist in lst[1:]: - if sublist != first_sublist: - return False - return True - - -@typechecked -def read_time(file_path: str, length: int) -> List[float]: - offsetTime = int("0000011a", 16) - time: List[float] = ph.readTime(file_path, offsetTime) - step_size: float = (time[1] - time[0]) / (length - 1) - res: List[float] = [time[0] + i * step_size for i in range(length)] - return res - - -@typechecked -def read_file_info(file_path: str) -> int: - offsetFileInfo = int("00001080", 16) - res = ph.readUint8(file_path, offsetFileInfo) - res = [x if x != "\x00" else "" for x in res] - res = "".join(res) - matches = int(re.findall(r"Sig=(\d+),", res)[0]) - return matches - - -@typechecked -def scale_data(file_path: str, l: List[int]) -> List[float]: - intercept: float = ph.readDouble(file_path, 4724) - slope: float = ph.readDouble(file_path, 4732) - res: List[float] = [float(i) * slope + intercept for i in l] - return res - - -@typechecked -def read_chromatograms(path: str) -> pd.DataFrame: - files: List[str] = [ - path + "/" + f - for f in listdir(path) - if isfile(join(path, f)) and f.endswith(".ch") - ] - wavelengths: List[str] = ["Wavelength_" + str(read_file_info(i)) for i in files] - offset: int = int("00001800", 16) - result: List[List[int]] = [ph.DeltaCompresion(i, offset, 12) for i in files] - result_scaled: List[List[float]] = [ - scale_data(files[i], result[i]) for i in range(0, len(result)) - ] - times: List[List[float]] = [read_time(i, len(result[0])) for i in files] - if not check_identical_lists(times): - raise ValueError("File Error") - time: List[float] = times[0] - df: pd.DataFrame = pd.DataFrame(result_scaled).transpose() - df.columns = wavelengths - df["time"] = time - return df - - -@typechecked -def plot_chromatograms(path: str): - df = read_chromatograms(path) - time = df["time"] - data = df.drop(columns=["time"]) - wavelengths = df.columns[:-1] - df_melted = df.melt(id_vars="time", var_name="Wavelengths", value_name="Data") - fig = px.line_3d( - df_melted, x="time", y="Wavelengths", z="Data", color="Wavelengths" - ) - fig.update_traces(marker=dict(size=5)) - fig.show() - - -@typechecked -def read_uv(path: str) -> pd.DataFrame: - uv = ph.UVClass(path) - time: pd.DataFrame = pd.DataFrame(uv.getTime()) - wavelengths: List[int] = uv.getWavelengths().astype("int").tolist() - data: pd.DataFrame = pd.DataFrame(uv.getData()) - data.columns = ["Wavelength_" + str(i) for i in wavelengths] - data["time"] = time - df_melted = data.melt(id_vars="time", var_name="Wavelengths", value_name="Data") - max_data = df_melted["Data"].max() - df_melted["Normalized_Data"] = df_melted["Data"] / max_data - df_unmelted = df_melted.pivot_table( - index="time", columns="Wavelengths", values="Normalized_Data" - ).reset_index() - return df_unmelted - - -@typechecked -def plot_uv(path: str): - df = read_uv(path) - time = df["time"] - data = df.drop(columns=["time"]) - wavelengths = df.columns[:-1] 
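
A note on the `read_uv` logic just above: the whole UV surface is normalised by its global maximum through a melt/pivot round-trip. The toy example below restates that step under made-up data (column names follow the deleted code). A plain column-wise division would achieve the same result; the sketch keeps the original's shape for comparison.

```python
# Global max-normalisation of a wide wavelength table via melt + pivot_table,
# mirroring the deleted read_uv (toy data; column names as in the original).
import pandas as pd

data = pd.DataFrame({
    "Wavelength_210": [1.0, 4.0],
    "Wavelength_254": [2.0, 8.0],
    "time": [0.0, 1.0],
})
melted = data.melt(id_vars="time", var_name="Wavelengths", value_name="Data")
melted["Normalized_Data"] = melted["Data"] / melted["Data"].max()
wide = melted.pivot_table(index="time", columns="Wavelengths",
                          values="Normalized_Data").reset_index()
assert wide["Wavelength_254"].max() == 1.0
```
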
- trace = go.Surface(x=wavelengths, y=time, z=data.values) - fig = go.Figure(data=[trace]) - fig.show() - - -# path = "/home/konrad/Documents/GitHub/chromatogramsR/X-Vials/X3346.D/dad1.uv" -# plot_uv(path) diff --git a/build/lib.linux-x86_64-cpython-312/openlab/__init__.py b/build/lib.linux-x86_64-cpython-312/openlab/__init__.py deleted file mode 100644 index c1b4c44..0000000 --- a/build/lib.linux-x86_64-cpython-312/openlab/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .openlab import read_attr -from .openlab import read_lc -from .openlab import read_ms - -__all__ = ["read_attr", "read_lc", "read_ms"] diff --git a/build/lib.linux-x86_64-cpython-312/openlab/openlab.py b/build/lib.linux-x86_64-cpython-312/openlab/openlab.py deleted file mode 100644 index 709c82b..0000000 --- a/build/lib.linux-x86_64-cpython-312/openlab/openlab.py +++ /dev/null @@ -1,134 +0,0 @@ -import os -import netCDF4 as nc -import pandas as pd -import numpy as np -import re -from typeguard import typechecked -from typing import List - - -@typechecked -def get_files(path: str) -> List[str]: - fs = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".cdf")] - assert len(fs) > 0, "No files found" - return fs - - -# Attributes -@typechecked -def get_attr(path: str): - with nc.Dataset(path, "r") as dataset: - attr = {key: dataset.getncattr(key) for key in dataset.ncattrs()} - return attr - - -@typechecked -def read_attr(path: str) -> pd.DataFrame: - fs = get_files(path) - attrs_lc = [pd.DataFrame([get_attr(fs[x])]) for x in range(len(fs))] - attrs_lc = pd.concat(attrs_lc, ignore_index=True) - return attrs_lc - - -# LC Data -@typechecked -def get_lc_data(path: str) -> pd.DataFrame: - with nc.Dataset(path, "r") as dataset: - detector_signals = dataset.variables["ordinate_values"][:] - global_atts = {key: dataset.getncattr(key) for key in dataset.ncattrs()} - detector = global_atts.get("detector_name", "") - run_time_length = dataset.variables["actual_run_time_length"][...] 
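
For orientation inside `get_lc_data` here: the reader synthesises the retention-time axis purely from the run length and the number of detector samples, i.e. the ordinate values are assumed to be equally spaced in time. The construction that follows this note builds exactly that axis; in isolation, with illustrative values:

```python
# The detector readings are equally spaced over the actual run time, so the
# time axis is a linspace; e.g. 5 samples over a 10-minute run (made-up data).
import numpy as np
import pandas as pd

signals = [0.1, 0.4, 0.9, 0.4, 0.1]   # detector readings (illustrative)
run_time_length = 10.0                # minutes (illustrative)
trace = pd.DataFrame({
    "RetentionTime": np.linspace(0, run_time_length, num=len(signals)),
    "DetectorSignal": signals,
})
assert trace["RetentionTime"].iloc[-1] == run_time_length
```
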
- - data = pd.DataFrame( - { - "RetentionTime": np.linspace(0, run_time_length, num=len(detector_signals)), - "DetectorSignal": detector_signals, - } - ) - data.attrs["detector"] = detector - return data - - -@typechecked -def process_detector_info(df_list: List[pd.DataFrame]) -> List[pd.DataFrame]: - for df in df_list: - detector_name = df.attrs.get("detector", "") - wl_match = ( - re.search(r"\d+", detector_name.split(",")[1]) - if "," in detector_name - else None - ) - wl = float(wl_match.group()) if wl_match else None - df["wavelength"] = wl - return df_list - - -@typechecked -def read_lc(path: str) -> pd.DataFrame: - fs = get_files(path) - # Filter fs --> Files which contain DAD within their name - fs = [f for f in fs if "DAD" in os.path.basename(f)] - df = [get_lc_data(fs[x]) for x in range(len(fs))] - df = process_detector_info(df) - df = pd.concat(df, ignore_index=True) - return df - - -# MS Data -@typechecked -def get_point_counts(path: str) -> np.ma.MaskedArray: - with nc.Dataset(path, "r") as dataset: - res = dataset.variables["point_count"][:] - return res - - -@typechecked -def get_ms_data(path: str) -> pd.DataFrame: - with nc.Dataset(path, "r") as dataset: - mz_values = dataset.variables["mass_values"][:] - intensities = dataset.variables["intensity_values"][:] - return pd.DataFrame({"mz": mz_values, "intensities": intensities}) - - -@typechecked -def get_scan_time(path: str) -> np.ma.MaskedArray: - with nc.Dataset(path, "r") as dataset: - time = dataset.variables["scan_acquisition_time"][:] - return time / 60 - - -@typechecked -def split_data( - data: pd.DataFrame, point_counts: np.ma.MaskedArray -) -> List[pd.DataFrame]: - end_indices = np.cumsum(point_counts) - start_indices = np.insert(end_indices[:-1], 0, 0) - res = [data.iloc[start:end] for start, end in zip(start_indices, end_indices)] - return res - - -@typechecked -def normalise(data_list: List[pd.DataFrame]) -> List[pd.DataFrame]: - return [ - df.assign(intensities=df["intensities"] * (100 / df["intensities"].max())) - for df in data_list - ] - - -@typechecked -def read_ms(path: str) -> List[pd.DataFrame]: - fs = get_files(path) - fs_ms = [f for f in fs if "spectra" in os.path.basename(f)] - data_minus = get_ms_data(fs_ms[0]) - point_counts_minus = get_point_counts(fs_ms[0]) - time_minus = get_scan_time(fs_ms[0]) - df_minus = normalise(split_data(data_minus, point_counts_minus)) - - data_plus = get_ms_data(fs_ms[1]) - point_counts_plus = get_point_counts(fs_ms[1]) - time_plus = get_scan_time(fs_ms[1]) - df_plus = normalise(split_data(data_plus, point_counts_plus)) - - df_minus = pd.concat([df.assign(time=t) for df, t in zip(df_minus, time_minus)]) - df_plus = pd.concat([df.assign(time=t) for df, t in zip(df_plus, time_plus)]) - return [df_minus, df_plus] diff --git a/build/lib.linux-x86_64-cpython-312/parser_hplc.cpython-312-x86_64-linux-gnu.so b/build/lib.linux-x86_64-cpython-312/parser_hplc.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index a9a9368..0000000 Binary files a/build/lib.linux-x86_64-cpython-312/parser_hplc.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/build/lib.linux-x86_64-cpython-312/parser_ms.cpython-312-x86_64-linux-gnu.so b/build/lib.linux-x86_64-cpython-312/parser_ms.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index ff2ea23..0000000 Binary files a/build/lib.linux-x86_64-cpython-312/parser_ms.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/build/lib.linux-x86_64-cpython-312/parser_xray.cpython-312-x86_64-linux-gnu.so 
b/build/lib.linux-x86_64-cpython-312/parser_xray.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index 90efcf5..0000000 Binary files a/build/lib.linux-x86_64-cpython-312/parser_xray.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/build/lib.linux-x86_64-cpython-312/xray/__init__.py b/build/lib.linux-x86_64-cpython-312/xray/__init__.py deleted file mode 100644 index 1a21900..0000000 --- a/build/lib.linux-x86_64-cpython-312/xray/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .bruker_xray import read_raw - -__all__ = ["read_raw"] - diff --git a/build/temp.linux-x86_64-cpython-312/src/parser_hplc.o b/build/temp.linux-x86_64-cpython-312/src/parser_hplc.o deleted file mode 100644 index 61ae8c8..0000000 Binary files a/build/temp.linux-x86_64-cpython-312/src/parser_hplc.o and /dev/null differ diff --git a/build/temp.linux-x86_64-cpython-312/src/parser_ms.o b/build/temp.linux-x86_64-cpython-312/src/parser_ms.o deleted file mode 100644 index a66ce62..0000000 Binary files a/build/temp.linux-x86_64-cpython-312/src/parser_ms.o and /dev/null differ diff --git a/build/temp.linux-x86_64-cpython-312/src/parser_xray.o b/build/temp.linux-x86_64-cpython-312/src/parser_xray.o deleted file mode 100644 index 9337a11..0000000 Binary files a/build/temp.linux-x86_64-cpython-312/src/parser_xray.o and /dev/null differ diff --git a/chemstation/__init__.py b/chemstation/__init__.py deleted file mode 100644 index fbadbd5..0000000 --- a/chemstation/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ - -from .read_ms_file import read_chemstation_file - -__all__ = [ - 'read_chemstation_file', -] diff --git a/chemstation/read_ms_file.py b/chemstation/read_ms_file.py deleted file mode 100644 index f037b2e..0000000 --- a/chemstation/read_ms_file.py +++ /dev/null @@ -1,36 +0,0 @@ -import parser_ms as pm -import pandas as pd -import plotly.graph_objs as go -import plotly.express as px -from typeguard import typechecked -from typing import List - - -@typechecked -def convert_cycles_to_dfs(cycles: List[dict]) -> List[pd.DataFrame]: - """Convert Chemstation LC-MS cycles into a list of Pandas DataFrames.""" - cycle_dfs = [] - for i, cycle in enumerate(cycles): - df = pd.DataFrame({ - "mz": cycle["mz"], - "intensity": cycle["intensity"], - # Repeat for each row - "retention_time": [cycle["retention_time"]] * len(cycle["mz"]) - }) - df["cycle_id"] = i - cycle_dfs.append(df) - return cycle_dfs - - -@typechecked -def merge_cycles_into_df(cycles: List[dict]) -> pd.DataFrame: - """Convert all cycles into a single Pandas DataFrame with cycle_id.""" - cycle_dfs = convert_cycles_to_dfs(cycles) - return pd.concat(cycle_dfs, ignore_index=True) - - -@typechecked -def read_chemstation_file(file_path: str) -> pd.DataFrame: - cycles = pm.read_cycles(file_path) - cycle_dfs = convert_cycles_to_dfs(cycles) - return merge_cycles_into_df(cycles) diff --git a/hplc/__init__.py b/hplc/__init__.py deleted file mode 100644 index c95f498..0000000 --- a/hplc/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .read_files import read_chromatograms, plot_chromatograms, read_uv, plot_uv - -__all__ = [ - 'read_chromatograms', - 'plot_chromatograms', - 'read_uv', - 'plot_uv' -] \ No newline at end of file diff --git a/openlab/__init__.py b/openlab/__init__.py deleted file mode 100644 index c1b4c44..0000000 --- a/openlab/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .openlab import read_attr -from .openlab import read_lc -from .openlab import read_ms - -__all__ = ["read_attr", "read_lc", "read_ms"] diff --git 
a/parser_hplc.cpython-312-x86_64-linux-gnu.so b/parser_hplc.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index a9a9368..0000000 Binary files a/parser_hplc.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/parser_ms.cpython-312-x86_64-linux-gnu.so b/parser_ms.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index ff2ea23..0000000 Binary files a/parser_ms.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/parser_xray.cpython-312-x86_64-linux-gnu.so b/parser_xray.cpython-312-x86_64-linux-gnu.so deleted file mode 100755 index 90efcf5..0000000 Binary files a/parser_xray.cpython-312-x86_64-linux-gnu.so and /dev/null differ diff --git a/pyproject.toml b/pyproject.toml index 2b482a3..b423483 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,41 @@ - [build-system] -requires = ["setuptools", "wheel", "pybind11"] +requires = ["setuptools"] build-backend = "setuptools.build_meta" + +[project] +name = "BinaryParser" +version = "0.1.2" +authors = [{ name = "Konrad Krämer", email = "konrad.kraemer@kit.edu" }, { name = "Martin Starman", email = "martin.starman@kit.edu" }, { name = "Nicole Jung", email = "nicole.jung@kit.edu" }] +requires-python = ">= 3.12" +description = "Parsing binary files" +license = "AGPL-3.0-or-later" +license-files = ["LICENSE"] +readme = "README.md" +keywords = ["chemistry", "convert", "file", "format"] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] + +dependencies = [ + "pandas==2.3.3", + "numpy==2.4.1", + "typeguard==4.4.4", + "plotly==6.5.2", + "matplotlib==3.10.8", + "seaborn==0.13.2", + "netCDF4==1.7.4", +] + +[project.optional-dependencies] +dev = [ + "pytest", +] + +[project.urls] +homepage = "https://github.com/ComPlat/BinaryParser" +repository = "https://github.com/ComPlat/BinaryParser" + +[tool.setuptools.packages.find] +include = ["binary_parser*"] + diff --git a/setup.py b/setup.py deleted file mode 100644 index 63173cb..0000000 --- a/setup.py +++ /dev/null @@ -1,57 +0,0 @@ -from pybind11.setup_helpers import Pybind11Extension, build_ext -from setuptools import setup, find_packages - -__version__ = "0.0.1" - -ext_modules = [ - Pybind11Extension( - "parser_hplc", - ["src/parser_hplc.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), - Pybind11Extension( - "parser_ms", - ["src/parser_ms.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), - Pybind11Extension( - "parser_xray", - ["src/parser_xray.cpp"], - define_macros=[("VERSION_INFO", __version__)], - extra_compile_args=["-std=c++17"], - ), -] - -setup( - name="BinaryParser", - version=__version__, - author="Konrad Krämer", - author_email="konrad.kraemer@kit.edu", - description="Parsing binary files", - long_description="", - ext_modules=ext_modules, - extras_require={"test": "pytest"}, - cmdclass={"build_ext": build_ext}, - zip_safe=False, - python_requires=">=3.7", - # packages=find_packages(), - packages=(["BinaryParser"] + ["openlab"] + find_packages()), - # package_dir={"BinaryParser": "."}, - package_dir={ - "BinaryParser": ".", - "BinaryParser.openlab": "./openlab", - }, - setup_requires=["pybind11"], - install_requires=[ - "pybind11", - "pandas", - "numpy", - "typeguard", - "plotly", - "matplotlib", - "seaborn", - "netCDF4", - ], -) diff --git a/src/parser_hplc.cpp b/src/parser_hplc.cpp deleted file mode 100644 index 517eaad..0000000 --- a/src/parser_hplc.cpp +++ /dev/null @@ -1,297 +0,0 @@ -#include 
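
The extension module whose deletion begins here decoded Agilent `.ch` files as big-endian scalars at fixed byte offsets; its Python caller, removed above, used offsets 4724 and 4732 for the intercept/slope pair. Assuming only that layout, the same single-value read can be expressed with the standard library. This is a sketch of the technique, not a drop-in replacement for the pybind11 reader:

```python
# Read one big-endian double at a fixed byte offset, as readDouble() did
# (the C++ byte-swapped with __builtin_bswap64 after a native-endian read).
import struct

def read_be_double(path: str, offset: int) -> float:
    with open(path, "rb") as fh:
        fh.seek(offset)
        return struct.unpack(">d", fh.read(8))[0]

# Example (placeholder file name; offsets taken from the deleted scale_data):
# intercept = read_be_double("dad1.ch", 4724)
# slope     = read_be_double("dad1.ch", 4732)
```
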
-#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - -template void endianSwap16(T &x) { - x = (((x) >> 8) & 0xFF) | (((x) & 0xFF) << 8); -} - -template void endianSwap32(T &x) { - x = ((x >> 24) & 0xFF) | ((x << 8) & 0xFF0000) | ((x >> 8) & 0xFF00) | - (x << 24); -} - -void endianSwapU32(uint32_t &x) { - x = ((x << 24) & 0xFF000000) | ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | ((x >> 24) & 0x000000FF); -} - -uint16_t endianSwapU16(uint16_t value) { - return ((value & 0xFF) << 8) | ((value >> 8) & 0xFF); -} - -pybind11::list DeltaCompresion(std::string filepath, int offset, int n) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t sizeFile = file.tellg(); - std::vector res(sizeFile / 2); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - - int16_t buffer1 = 0; - int32_t buffer2 = 0; - int16_t buffer3 = 0; - int32_t buffer4 = 0; - - int iter = 0; - while (currentPos < sizeFile) { - file.read(reinterpret_cast(&buffer1), sizeof(int16_t)); - endianSwap16(buffer1); - buffer2 = buffer4; - - if (buffer1 << 12 == 0) { - res.resize(iter); - break; - } - - for (int i = 0; i < (buffer1 & 4095); i++) { - file.read(reinterpret_cast(&buffer3), sizeof(int16_t)); - endianSwap16(buffer3); - if (buffer3 != -32768) { - buffer2 = buffer2 + (int32_t)buffer3; - res[iter] = buffer2; - iter++; - } else { - file.read(reinterpret_cast(&buffer2), sizeof(int32_t)); - endianSwap32(buffer2); - res[iter] = buffer2; - iter++; - } - } - buffer4 = buffer2; - currentPos = file.tellg(); - file.seekg(currentPos, std::ios::beg); - } - return pybind11::cast(res); -} - -double readDouble(std::string &filepath, int offset) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - double buffer = 0; - - for (int i = 0; i < 1; i++) { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(double)); - uint64_t *ptr = reinterpret_cast(&buffer); - *ptr = __builtin_bswap64(*ptr); - } - file.close(); - return buffer; -} - -double readInt(std::string &filepath, int offset) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - int32_t buffer = 0; - - for (int i = 0; i < 1; i++) { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(int32_t)); - } - file.close(); - return buffer; -} - -pybind11::list readUint8(std::string &filepath, int offset) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - uint8_t buffer = 0; - std::vector res; - - for (int i = 0; i < 40; i++) { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(uint8_t)); - res.push_back(std::string(1, static_cast(buffer))); - } - file.close(); - return pybind11::cast(res); -} - -pybind11::list readTime(std::string &filepath, int offset) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t 
currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - int32_t buffer = 0; - std::vector res(2); - for (int i = 0; i < 2; i++) { - file.read(reinterpret_cast(&buffer), sizeof(int32_t)); - endianSwap32(buffer); - res[i] = static_cast(buffer) / 60000.0; - } - file.close(); - return pybind11::cast(res); -} - -size_t updatePos(std::ifstream &file, int offset) { - size_t currentPos = file.tellg(); - file.seekg(currentPos + offset, std::ios::beg); - currentPos = file.tellg(); - return currentPos; -} - -double readInt32(std::string &filepath, int offset) { - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - int32_t buffer = 0; - - for (int i = 0; i < 1; i++) { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(int32_t)); - } - file.close(); - endianSwap32(buffer); - return buffer; -} - -struct UVClass { - UVClass(std::string filepath_) : filepath(filepath_) { - int offset = 0x1002; - int nscansOffset = 0x116; - int nscans = readInt32(filepath, nscansOffset); - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - std::runtime_error("Error opening file"); - size_t sizeFile = file.tellg(); - size_t currentPos = 0; - uint16_t buffer1 = 0; - uint32_t buffer2 = 0; - uint16_t buffer3 = 0; - uint16_t buffer4 = 0; - uint16_t buffer5 = 0; - int16_t buffer6 = 0; - int32_t buffer7 = 0; - time.resize(nscans); - ndata.resize(nscans); - - for (int i = 0; i < nscans; i++) { - file.seekg(currentPos + offset, std::ios::beg); - file.read(reinterpret_cast(&buffer1), sizeof(uint16_t)); // 2 - offset += buffer1; - file.read(reinterpret_cast(&buffer2), sizeof(uint32_t)); // 4 - time[i] = static_cast(buffer2) / 60000.0; - file.read(reinterpret_cast(&buffer3), sizeof(uint16_t)); // 2 - file.read(reinterpret_cast(&buffer4), sizeof(uint16_t)); // 2 - file.read(reinterpret_cast(&buffer5), sizeof(uint16_t)); // 2 - for (int wv = buffer3; wv < buffer4; wv += buffer5) { - double current_w = static_cast(wv) / 20.0; - auto it = std::find(wavelengths.begin(), wavelengths.end(), current_w); - if (it == wavelengths.end()) { - wavelengths.push_back(current_w); - } - } - auto max_wavelength_it = - std::max_element(wavelengths.begin(), wavelengths.end()); - int max_index = std::distance(wavelengths.begin(), max_wavelength_it); - std::vector wv_index_map; - for (int val = max_index + 1; val < wavelengths.size(); ++val) { - wv_index_map.push_back(val); - } - for (int val = 0; val <= max_index; ++val) { - wv_index_map.push_back(val); - } - ndata[i].resize(wavelengths.size()); - for (int j = 0; j < wv_index_map.size(); j++) { - file.read(reinterpret_cast(&buffer6), sizeof(int16_t)); // 2 - if (buffer6 == -32768) { - file.read(reinterpret_cast(&buffer7), sizeof(int32_t)); // 4 - } else { - buffer7 += buffer6; - } - ndata[i][j] = buffer7; // / 2000.0; // correct? - } - } - file.close(); - } - - py::array_t getTime() const { return py::cast(time); } - - py::array_t getWavelengths() const { return py::cast(wavelengths); } - - py::array_t getData() const { - std::size_t nRows = ndata.size(); - std::size_t nCols = ndata.empty() ? 
0 : ndata[0].size(); - pybind11::array_t npArray({nRows, nCols}); - auto ptr = npArray.mutable_data(); - for (std::size_t i = 0; i < nRows; ++i) { - for (std::size_t j = 0; j < nCols; ++j) { - ptr[i * nCols + j] = ndata[i][j]; - } - } - return npArray; - } - - std::string filepath; - std::vector time; - std::vector wavelengths; - std::vector> ndata; -}; - -PYBIND11_MODULE(parser_hplc, m) { - py::class_(m, "UVClass") - .def(py::init()) - .def("getTime", &UVClass::getTime) - .def("getWavelengths", &UVClass::getWavelengths) - .def("getData", &UVClass::getData); - - m.doc() = R"pbdoc( - Pybind11 example plugin - ----------------------- - - .. currentmodule:: parser_hplc - - .. autosummary:: - :toctree: _generate - - readInt - readDouble - DeltaCompresionCpp - )pbdoc"; - m.def("DeltaCompresion", &DeltaCompresion, R"pbdoc( - read content of ch file and conduct delta compression on data - )pbdoc"); - m.def("readDouble", &readInt, R"pbdoc( - Reads a double at a specific location of a ch file - )pbdoc"); - m.def("readInt", &readInt, R"pbdoc( - Reads an int32_t at a specific location of a ch file - )pbdoc"); - m.def("readUint8", &readUint8, R"pbdoc( - Reads an uint8_t at a specific location of a ch file - )pbdoc"); - m.def("readTime", &readTime, R"pbdoc( - Reads the time of a ch file - )pbdoc"); - -#ifdef VERSION_INFO - m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); -#else - m.attr("__version__") = "dev"; -#endif -} diff --git a/src/parser_ms.cpp b/src/parser_ms.cpp deleted file mode 100644 index 48b29da..0000000 --- a/src/parser_ms.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// TODO: find min mz and max mz in meta data -// TODO: read meta data -// NOTE: Reading data of MSD1.MS file from -// Agilent ChemStation -#include -#include -#include -#include -#include -#include -#include -#include - -uint16_t endianSwapU16(uint16_t value) { - return ((value & 0xFF) << 8) | ((value >> 8) & 0xFF); -} - -void endianSwapU32(uint32_t &x) { - x = ((x << 24) & 0xFF000000) | ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | ((x >> 24) & 0x000000FF); -} - -std::vector ReadFile(const std::string &file_path) { - std::ifstream file(file_path, std::ios::binary | std::ios::ate); - if (!file.is_open()) { - throw std::runtime_error("Error opening file"); - } - std::size_t size = file.tellg(); - if (size == 0) { - throw std::runtime_error("File is empty"); - } - std::vector buffer(size); - file.seekg(0, std::ios::beg); - file.read(buffer.data(), size); - if (!file) { - throw std::runtime_error("Error reading file"); - } - file.close(); - return buffer; -} - -std::uint16_t CastToUint16(std::vector &buffer, std::size_t offset) { - std::uint16_t res = *reinterpret_cast(&buffer[offset]); - return endianSwapU16(res); -} - -std::uint32_t CastToUint32(std::vector &buffer, std::size_t offset) { - std::uint32_t res = *reinterpret_cast(&buffer[offset]); - endianSwapU32(res); - return res; -} - -std::size_t NumberOfCycles(std::vector &buffer) { - int data_start = 0x116; - return CastToUint32(buffer, data_start); -} - -std::size_t FindDataStart(std::vector &buffer) { - int data_start = 0x10A; - int offset_correction = CastToUint16(buffer, data_start); - int where = offset_correction * 2 - 2; - return where; -} - -std::vector ConvertMZIntensity(std::vector &data) { - std::vector mz_intensity; - mz_intensity.resize(data.size()); - for (std::size_t i = 0; i < data.size(); i++) { - if (i % 2 != 0) { // Intensity - uint16_t head_bits = data[i] >> 14; // Shift right by 14 bits - uint16_t tail_bits = - data[i] & 0x3FFF; // Extract tail: 
0x3FFF = 0011111111111111 (14 bits) - mz_intensity[i] = std::pow(8, head_bits) * tail_bits; - } else { // MZ - mz_intensity[i] = static_cast(data[i]) / 20; - } - } - return mz_intensity; -} - -struct Cycle { - std::vector mz; - std::vector intensity; - double retention_time; - - // Convert Cycle to a Python dictionary - std::map to_dict() const { - return {{"mz", pybind11::cast(mz)}, - {"intensity", pybind11::cast(intensity)}, - {"retention_time", pybind11::cast(retention_time)}}; - } -}; - -void ReadCycleData(Cycle &cycle, std::vector &buffer, - std::size_t data_start, std::size_t cycle_size) { - std::vector data; - data.resize(cycle_size); - for (std::size_t i = 0; i < cycle_size; i++) { - data[i] = CastToUint16(buffer, data_start); - data_start += 2; - } - std::vector mz_intensity = ConvertMZIntensity(data); - cycle.mz.resize(mz_intensity.size() / 2); - cycle.intensity.resize(mz_intensity.size() / 2); - for (std::size_t i = 0; i < mz_intensity.size(); i++) { - if (i % 2 == 0) { - cycle.mz[i / 2] = mz_intensity[i]; - } else { - cycle.intensity[i / 2] = mz_intensity[i]; - } - } -} - -std::vector readCycles(const std::string &file_path) { - std::vector buffer = ReadFile(file_path); - std::size_t data_start = FindDataStart(buffer); - std::size_t number_of_cycles = NumberOfCycles(buffer); - std::vector cycles; - cycles.resize(number_of_cycles); - std::size_t counter = data_start; - for (std::size_t i = 0; i < number_of_cycles; i++) { - if (counter >= buffer.size()) { - throw std::runtime_error("Error extracting data"); - } - counter += 2; - std::size_t time = CastToUint32(buffer, counter); - counter += 10; - std::size_t temp = counter; - std::size_t cycle_size = CastToUint16(buffer, counter); - counter += 6; - ReadCycleData(cycles[i], buffer, counter, cycle_size * 2); - cycles[i].retention_time = static_cast(time) / 60000; - counter += cycle_size * 4; - counter += 10; - } - return cycles; -} - -namespace py = pybind11; - -std::vector> -py_readCycles(const std::string &file_path) { - std::vector cycles = readCycles(file_path); - std::vector> result; - for (const auto &cycle : cycles) { - result.push_back(cycle.to_dict()); - } - return result; -} - -PYBIND11_MODULE(parser_ms, m) { - m.doc() = "Chemstation MS data extraction module"; - m.def("read_cycles", &py_readCycles, - "Extract cycles from an Chemstation MS file"); -} diff --git a/src/parser_xray.cpp b/src/parser_xray.cpp deleted file mode 100644 index 5206e27..0000000 --- a/src/parser_xray.cpp +++ /dev/null @@ -1,159 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - -template -void endianSwap16(T &x) -{ - x = (((x) >> 8) & 0xFF) | (((x) & 0xFF) << 8); -} - -template -void endianSwap32(T &x) -{ - x = ((x >> 24) & 0xFF) | ((x << 8) & 0xFF0000) | ((x >> 8) & 0xFF00) | (x << 24); -} - -void endianSwapU32(uint32_t &x) -{ - x = ((x << 24) & 0xFF000000) | ((x << 8) & 0x00FF0000) | ((x >> 8) & 0x0000FF00) | ((x >> 24) & 0x000000FF); -} - -uint16_t endianSwapU16(uint16_t value) -{ - return ((value & 0xFF) << 8) | ((value >> 8) & 0xFF); -} - -std::vector readInt32(std::string &filepath, int offset) -{ - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) - throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - file.seekg(currentPos + offset, std::ios::beg); - int32_t buffer = 0; - int n = 1000; - std::vector res(n); - for (int i = 0; i < n; 
i++) - { - auto pos = file.tellg(); - buffer = 0; - file.read(reinterpret_cast(&buffer), sizeof(int32_t)); - endianSwap32(buffer); - res[i] = buffer; - } - file.close(); - return res; -} - -pybind11::list readDoubles(std::string &filepath, int offset) -{ - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - size_t fileSize = file.tellg(); - size_t size = fileSize / sizeof(double); - file.seekg(currentPos + offset, std::ios::beg); - double buffer = 0; - std::vector res(size); - - for (int i = 0; i < size; i++) - { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(double)); - res[i] = buffer; - } - file.close(); - return pybind11::cast(res); -} - -pybind11::list readFloates(std::string &filepath, int offset) -{ - std::ifstream file(filepath, std::ios::binary | std::ios::ate); - if (!file.is_open()) throw std::runtime_error("Error opening file"); - size_t currentPos = 0; - size_t fileSize = file.tellg(); - size_t size = fileSize / sizeof(double); - file.seekg(currentPos + offset, std::ios::beg); - float buffer = 0; - std::vector res(size); - - for (int i = 0; i < size; i++) - { - auto pos = file.tellg(); - file.read(reinterpret_cast(&buffer), sizeof(float)); - res[i] = buffer; - } - file.close(); - return pybind11::cast(res); -} - - -void readChars(std::string &filepath) { - std::ifstream file(filepath, std::ios::binary); - file.seekg(0, std::ios::end); - size_t fileSize = file.tellg(); - file.seekg(0, std::ios::beg); - std::vector buffer(fileSize); - file.read(buffer.data(), fileSize); - file.close(); - - int address = 0x00000000; - size_t n = 8; - size_t size = buffer.size(); - for(size_t i = 0; i < (size / n); i++) { - for(size_t j = 0; j < n; j++) { - std::cout << std::hex << std::setfill('0') << std::setw(3) << address << " "; - address++; - } - std::cout << std::endl; - for(size_t j = 0; j < n; j++) { - std::cout << std::dec << "'" << buffer[i*n + j] << "'" << " "; - address++; - } - std::cout << std::endl; - } -} - -PYBIND11_MODULE(parser_xray, m) -{ - - m.doc() = R"pbdoc( - Pybind11 example plugin - ----------------------- - - .. currentmodule:: parser_xray - - .. 
autosummary:: - :toctree: _generate - - test - )pbdoc"; - m.def("readChars", &readChars, R"pbdoc( - Read content of file as chars - )pbdoc"); - m.def("readDoubles", &readDoubles, R"pbdoc( - Read content of file as doubles - )pbdoc"); - m.def("readFloates", &readFloates, R"pbdoc( - Read content of file as floates - )pbdoc"); - -#ifdef VERSION_INFO - m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); -#else - m.attr("__version__") = "dev"; -#endif -} diff --git a/tests/chemstation_test.py b/tests/chemstation_test.py index 5bb72a4..bbe601e 100644 --- a/tests/chemstation_test.py +++ b/tests/chemstation_test.py @@ -1,4 +1,4 @@ -import BinaryParser as bp +import binary_parser as bp import pandas as pd import numpy as np @@ -57,7 +57,6 @@ def compare_spectras(df, spectra_true, time): def test_svs1025f1(): file_path = "./tests/Chemstation/SVS_1025F1.D/MSD1.MS" df = bp.read_chemstation_file(file_path) - # Compute TIC tic_df = df.groupby("retention_time", as_index=False)["intensity"].sum() tic_true = pd.read_csv( "./tests/Chemstation/TIC_SVS1025F1.CSV", @@ -77,8 +76,6 @@ def test_svs1025f1(): compare_spectras(df, spectra_true, time) -test_svs1025f1() - def test_scs776roh(): file_path = "./tests/Chemstation/SVS-776ROH.D/MSD1.MS" @@ -100,6 +97,3 @@ def test_scs776roh(): delimiter=",", encoding="utf-16", header=None ) compare_spectras(df, spectra_true, time) - - -test_scs776roh() diff --git a/tests/chemstation_visualisation.py b/tests/chemstation_visualisation.py index 8980d5e..81947fc 100644 --- a/tests/chemstation_visualisation.py +++ b/tests/chemstation_visualisation.py @@ -1,6 +1,5 @@ import matplotlib.pyplot as plt -import BinaryParser as bp -import pandas as pd +import binary_parser as bp file_path = "./tests/Chemstation/SVS_1025F1.D/MSD1.MS" df = bp.read_chemstation_file(file_path) diff --git a/tests/hplc_test.py b/tests/hplc_test.py index 0a55177..c1be20b 100644 --- a/tests/hplc_test.py +++ b/tests/hplc_test.py @@ -1,4 +1,4 @@ -import BinaryParser as bp +import binary_parser as bp def test_read_chromatograms(): diff --git a/tests/openlab_test.py b/tests/openlab_test.py index 445897a..c331025 100755 --- a/tests/openlab_test.py +++ b/tests/openlab_test.py @@ -1,4 +1,4 @@ -import BinaryParser as bp +import binary_parser.openlab as bp path = "./tests/OpenLab/" @@ -8,7 +8,6 @@ def test_read_attr(): assert attr.shape == (12, 49) assert attr["detector_unit"][1] == "mAU" -test_read_attr() def test_read_ls(): @@ -17,7 +16,6 @@ def test_read_ls(): assert data.columns.tolist() == ["RetentionTime", "DetectorSignal", "wavelength"] assert all(data["wavelength"].unique() == [210, 230, 254, 280, 366, 450, 550, 580]) -test_read_ls() def test_read_ms(): @@ -26,4 +24,3 @@ def test_read_ms(): assert ms[0].shape == (1358778, 3) assert ms[1].shape == (1324471, 3) -test_read_ms() diff --git a/xray/__init__.py b/xray/__init__.py deleted file mode 100644 index 1a21900..0000000 --- a/xray/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .bruker_xray import read_raw - -__all__ = ["read_raw"] - diff --git a/xray/bruker_xray.py b/xray/bruker_xray.py deleted file mode 100644 index 009540d..0000000 --- a/xray/bruker_xray.py +++ /dev/null @@ -1,97 +0,0 @@ -import parser_xray as px -import pandas as pd -from typeguard import typechecked -from typing import List - -search_for = { - "GONIOMETER_RADIUS": 217.5, - "FIXED_DIVSLIT": 0.6, - "SAMPLESLIT": 0.0, - "DETSLIT": 10.39, - "ANTISLIT": 6.17, - "START": 20.0, - "THETA": 10.0, - "THETA2": 20.0, - "TIMESTARTED": 14, - "TEMP_RATE": -1, - "TEMP_DELAY": -1, - "KV": 35, - "MA": 
45, - "WL1": 1.540600, - "WL2": 1.544390, - "WL3": 1.392220, -} - -res = px.readDoubles( - "/home/konrad/Documents/GitHub/RProjects/chromatogramsR/Bruker/PD.raw", 0 -) -df = pd.DataFrame(res) -df.columns = ["data"] - -for key, value in search_for.items(): - result_df = df[df["data"].eq(value)] - if not result_df.empty: - print(f"Match found for {key} at index {result_df.index[0]}") - -print() -print() -print() - -res = px.readFloates( - "/home/konrad/Documents/GitHub/RProjects/chromatogramsR/Bruker/PD.raw", 0 -) -df = pd.DataFrame(res) -df.columns = ["data"] - -for key, value in search_for.items(): - result_df = df[df["data"].eq(value)] - if not result_df.empty: - print(f"Match found for {key} at index {result_df.index[0]}") - - -# data starts at 0x420 --> 1056 --> in float index 264 --> 263 in python -# data is read as float until end of file -print() -print() -print() - - -path = "/home/konrad/Documents/GitHub/RProjects/chromatogramsR/Bruker/WeitereDaten/XRD/7_80_3_001651_Cu_SSZ13_05_7.raw" -# _WL1=1.540600 -# _WL2=1.544390 -# _WL3=0.00000 -# _WLRATIO=0.500000 -# _START=7.000000 -# _THETA=3.500000 -# _2THETA=7.000000 - -# xxd -# 000003c8: 0000 5c00 0000 0200 ..\..... --> 968 -# 000003d0: 0000 3254 6865 7461 ..2Theta --> 976 -# 000003d8: 0000 0000 0000 0000 ........ -# 000003e0: 0000 0000 0000 0000 ........ -# 000003e8: 0000 0000 0000 0000 ........ -# 000003f0: 0000 0000 0000 0000 ........ -# 000003f8: 0000 0100 0000 0000 ........ -# 00000400: 0000 0000 1c40 0000 .....@.. -# 00000408: 0000 0000 0000 0000 ........ -# 00000410: 0000 0000 0000 0000 ........ -# 00000418: 0000 0000 0000 0000 ........ -# 00000420: 0000 3200 0000 5c00 ..2...\. - - -res = px.readDoubles(path, 0) -df = pd.DataFrame(res) -df.columns = ["data"] -print(df.iloc[100:130]) -df = df[(df.round() != 0.0) & (df < 1000.0) & (df > 0.0)].dropna() -df = df.iloc[0:60] - - -res = px.readFloates(path, 0) - -df = pd.DataFrame(res) -print(df.iloc[240:250]) -df.columns = ["data"] -df = df[(df.round() != 0.0) & (df < 1000.0) & (df > 0.0)].dropna() -df = df.iloc[0:60]
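
The exploration script deleted last, `xray/bruker_xray.py`, located Bruker `.raw` header fields by reinterpreting the file's bytes as doubles or 32-bit floats and searching for the known constants in `search_for`. A compact numpy restatement of that probe follows; the file path is a placeholder, and `np.isclose` replaces the original's exact equality test to tolerate float32 rounding:

```python
# Reinterpret a raw binary file as numeric values and return the indices
# where a known metadata constant occurs (the deleted script's technique).
import numpy as np

def find_value(path: str, value: float, dtype: str = "<f8", atol: float = 1e-6):
    data = np.fromfile(path, dtype=dtype)          # "<f8" doubles, "<f4" floats
    return np.flatnonzero(np.isclose(data, value, atol=atol))

# Example (placeholder path; 217.5 is GONIOMETER_RADIUS from search_for):
# idx = find_value("PD.raw", 217.5, dtype="<f4")
```
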