From 49de3d32a4f34f1484ffe62a4047fb7441188557 Mon Sep 17 00:00:00 2001 From: hazelian Date: Thu, 5 Feb 2026 03:43:28 +0000 Subject: [PATCH] chore(docs): translate docs to English and add CI data-qa workflow + validation contract --- .github/CODEOWNERS | 5 + .github/ISSUE_TEMPLATE/bug_report.md | 19 +++ .github/PULL_REQUEST_TEMPLATE.md | 14 +++ .github/workflows/data-qa.yml | 42 +++++++ CHANGELOG.md | 10 ++ CODE_OF_CONDUCT.md | 7 ++ CONTRIBUTING.md | 25 ++++ LICENSE | 21 ++++ README.cn.md | 18 +++ README.md | 117 ++++++------------ data/processed/README.cn.md | 11 ++ data/processed/README.md | 13 +- docs/DATA_DICTIONARY.cn.md | 14 +++ docs/DATA_DICTIONARY.md | 24 ++-- .../protein/contracts/protein_master_v6.json | 51 ++++++++ 15 files changed, 297 insertions(+), 94 deletions(-) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/data-qa.yml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.cn.md create mode 100644 data/processed/README.cn.md create mode 100644 docs/DATA_DICTIONARY.cn.md create mode 100644 pipelines/protein/contracts/protein_master_v6.json diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..ece9a9f --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,5 @@ +# Code owners for repository +# Format: path pattern followed by one or more @owner handles +/pipelines/ @hazelian0619 +/data/processed/ @hazelian0619 +/docs/ @hazelian0619 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..b8b8fdc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,19 @@ +--- +name: Bug report +about: Create a report to help us improve +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. 
+2. +3. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..9a6f33b --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,14 @@ +## Summary +Brief description of the changes and motivation. + +## Changes +- What changed +- Why it is needed + +## Validation +- Run `tools/kg_validate_table.py` for affected tables and attach validation report(s) + +## Checklist +- [ ] My code follows the project's style +- [ ] Validation report attached for data changes +- [ ] CHANGELOG updated (if applicable) diff --git a/.github/workflows/data-qa.yml b/.github/workflows/data-qa.yml new file mode 100644 index 0000000..a5b9b27 --- /dev/null +++ b/.github/workflows/data-qa.yml @@ -0,0 +1,42 @@ +name: Data QA + +on: + pull_request: + paths: + - 'data/**' + - 'pipelines/**' + - 'tools/**' + - 'docs/**' + push: + branches: [ main ] + paths: + - 'data/**' + - 'pipelines/**' + - 'tools/**' + - 'docs/**' + +jobs: + validate-protein: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Run protein validation + id: validate + run: | + mkdir -p build/validate + python3 tools/kg_validate_table.py \ + --contract pipelines/protein/contracts/protein_master_v6.json \ + --table data/processed/protein_master_v6_clean.tsv \ + --out build/validate/protein_master_v6_report.json + + - name: Upload validation report + uses: actions/upload-artifact@v4 + with: + name: protein_master_v6_report + path: build/validate/protein_master_v6_report.json diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ae25a1d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +## [Unreleased] +- Standardize documentation and add CI data validation workflow + +## [v6] - 2025-10-26 +- Primary protein entity table `protein_master_v6_clean.tsv` (19,135 rows × 33 cols) +- Added gene ID fields and AlphaFold v6 updates diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..915df6e --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,7 @@ +# Code of Conduct + +This project follows the Contributor Covenant v2.0. All contributors and maintainers are expected to uphold these standards. + +Be respectful and collaborative. Unacceptable behavior will not be tolerated and may result in removal from project discussions or contributions. + +Report conduct issues by opening an issue or contacting repository maintainers privately. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..57828c8 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,25 @@ +# Contributing + +Thank you for contributing to Protian Entity. This document explains development, validation, and release practices used by the project. + +Development workflow +- Branching: create a branch using `chore/` or `feat/` prefixes for non-breaking changes and feature work respectively (e.g., `chore/standardize-docs-ci`). +- Tests & validation: run validation before opening a PR: + +```bash +python3 tools/kg_validate_table.py --contract pipelines/protein/contracts/protein_master_v6.json \ + --table data/processed/protein_master_v6_clean.tsv --out build/validate/protein_master_v6_report.json +``` + +- Commit messages: use clear, imperative messages. Follow conventional commits if possible. + +Pull requests +- Open PRs against `main`. Describe the change, test steps, and link to any data releases. +- Include validation reports for any changes to entity tables. + +Releases +- Release data artifacts (large L1 tables) via GitHub Releases. +- Attach `manifest.json` with checksums, row counts, git commit SHA, and QA reports. 
+ +Contacts +- For maintenance and code ownership, see `.github/CODEOWNERS` or raise an issue. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a600463 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 hazelian0619 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/README.cn.md b/README.cn.md new file mode 100644 index 0000000..4b00f94 --- /dev/null +++ b/README.cn.md @@ -0,0 +1,18 @@ +# 人类知识图谱数据集(Protein + RNA) + +这个仓库按“工业级数据产品”的方式组织: + +- **代码 / 规范 / 质量报告**进入仓库(可审计、可复现) +- **体积大的数据产物**通过 **GitHub Releases** 发布(可下载、可校验、可回滚) + +## 快速入口(给同事看这一段就够) + +- **Protein(L1)数据集**:`data/processed/protein_master_v6_clean.tsv`(仓库内可直接下载) +- **RNA(L1, v1)数据集**:Release `rna-l1-v1`(包含 `.tsv.gz` + `manifest.json` + QA 报告) + - Release: https://github.com/hazelian0619/protian-entity/releases/tag/rna-l1-v1 + - RNA 使用说明:`pipelines/rna/README.md` + - RNA 规范:`docs/rna/README.md` + +--- + +(原 README 内容已保留为中文) diff --git a/README.md b/README.md index ba303ff..72cf875 100644 --- a/README.md +++ b/README.md @@ -1,84 +1,47 @@ -# 人类知识图谱数据集(Protein + RNA) - -这个仓库按“工业级数据产品”的方式组织: - -- **代码 / 规范 / 质量报告**进入仓库(可审计、可复现) -- **体积大的数据产物**通过 **GitHub Releases** 发布(可下载、可校验、可回滚) - -## 快速入口(给同事看这一段就够) - -- **Protein(L1)数据集**:`data/processed/protein_master_v6_clean.tsv`(仓库内可直接下载) -- **RNA(L1, v1)数据集**:Release `rna-l1-v1`(包含 `.tsv.gz` + `manifest.json` + QA 报告) - - Release: https://github.com/hazelian0619/protian-entity/releases/tag/rna-l1-v1 - - RNA 使用说明:`pipelines/rna/README.md` - - RNA 规范:`docs/rna/README.md` - ---- - -## 🧬 Protein 实体(L1) - -构建以蛋白质为中心的高质量数据集,整合 UniProt、AlphaFold、HGNC、STRING 等多源数据。 - -### 📊 数据概览 - -| 项目 | 数量/覆盖率 | 说明 | -|------|------------|------| -| **蛋白质总数** | 19,135 | 去重后的人类蛋白质 | -| **字段数** | 33 | 完整信息字段 | -| **基因ID映射** | 99.6% | NCBI + Ensembl | -| **AlphaFold结构** | 99.7% | 含质量评分 | -| **功能注释** | 86% | 含证据代码+文献 | -| **GO注释** | 82-94% | 三个维度 | -| **PDB实验结构** | 44.3% | 实验解析结构 | - -**主数据文件**:`data/processed/protein_master_v6_clean.tsv` (60MB, 19,135 行 × 33 列) - ---- - -### ✅ 核心字段(33列) - -#### 基础信息 -`uniprot_id` | `protein_name` | `gene_names` | `sequence` | `mass` - -#### 功能注释 -`function` | `subcellular_location` | `diseases` | `ptms` - -#### GO注释 -`go_biological_process` | `go_molecular_function` | `go_cellular_component` - -#### 基因ID 
-`ncbi_gene_id` | `ensembl_gene_id` | `hgnc_id` | `symbol` | `gene_synonyms` - -#### 结构信息 -`alphafold_id` | `alphafold_mean_plddt` | `pdb_ids` | `domains` - -#### 交互数据 -`string_ids` | `keywords` - ---- - -### 🚀 快速使用 - -```python -import pandas as pd - -df = pd.read_csv('data/processed/protein_master_v6_clean.tsv', sep='\t') - -tp53 = df[df['gene_names'].str.contains('TP53', na=False)] -print(tp53[['uniprot_id', 'ncbi_gene_id', 'alphafold_mean_plddt']]) +# Protian Entity — Human Protein & RNA Knowledge Graph (Industrial Data Product) + +This repository contains curated human Protein and RNA entity datasets and the code, contracts, and QA artifacts required to build, validate, and release them as industrial-grade data products. + +Key principles +- Code, contracts, and QA reports are tracked in Git for auditability and reproducibility. +- Large data artifacts (L1 tables) are published via GitHub Releases with a manifest and checksums. + +Quick links +- Protein (L1) dataset: `data/processed/protein_master_v6_clean.tsv` +- RNA (L1) dataset: release `rna-l1-v1` (see `pipelines/rna/README.md`) +- Validation tool: `tools/kg_validate_table.py` + +Repository layout +- `data/processed/` — final, curated TSV tables (small-to-medium L1 tables are stored here when size permits) +- `pipelines/` — extraction and ETL pipelines (e.g., `pipelines/rna/`) +- `docs/` — design documents, data dictionary, quality gate definitions +- `scripts/`, `tools/` — helper and validation scripts + +Data release model +1. Build entity tables using `pipelines/` scripts in a reproducible environment. +2. Produce `manifest.json` (checksums, row counts, git commit, build timestamp). +3. Publish artifacts as a GitHub Release and attach QA reports. + +Getting started +1. Clone repository +2. Review `docs/DATA_DICTIONARY.md` and `docs/QUALITY_GATES.md` for schema and validation rules +3. 
Run validation example: + +```bash +python3 tools/kg_validate_table.py --contract pipelines/protein/contracts/protein_master_v6.json \ + --table data/processed/protein_master_v6_clean.tsv --out build/validate/protein_master_v6_report.json ``` ---- +Contributing +- See `CONTRIBUTING.md` for development, testing, and release workflow. -### 📁 辅助数据 +License +- MIT License — see `LICENSE` for details. + +Contact +- Repository owner: hazelian0619 +- Project maintenance: see `.github/CODEOWNERS` or `CONTRIBUTING.md` for maintainers and contact instructions. -``` -data/processed/ -├── alphafold_quality.tsv # AlphaFold 每残基质量 -├── protein_edges.tsv # STRING 交互网络(约 88 万条) -├── ptm_sites.tsv # 翻译后修饰(约 23 万条) -└── pathway_members.tsv # 通路成员(约 12 万条) -``` --- diff --git a/data/processed/README.cn.md b/data/processed/README.cn.md new file mode 100644 index 0000000..230b7ed --- /dev/null +++ b/data/processed/README.cn.md @@ -0,0 +1,11 @@ +# 1025 项目 - 处理后数据说明 + +本目录包含人类蛋白质知识图谱的核心数据集,整合了多个权威生物信息学数据库的信息。 + +**数据更新时间**:2025-10-26 +**数据版本**:v6 +**物种**:Homo sapiens (人类,Taxonomy ID: 9606) + +--- + +(原内容已保留为中文) diff --git a/data/processed/README.md b/data/processed/README.md index d42c2e0..47bb666 100644 --- a/data/processed/README.md +++ b/data/processed/README.md @@ -1,12 +1,11 @@ -# 1025 项目 - 处理后数据说明 +# Processed data — Protian Entity (Human Protein Knowledge Graph) -## 概述 +This folder contains the final, curated data tables used as L1 entity products. Files are UTF-8 encoded TSVs and carry provenance information (`source`, `fetch_date`, `source_version`). 
-本目录包含人类蛋白质知识图谱的核心数据集,整合了多个权威生物信息学数据库的信息。 - -**数据更新时间**:2025-10-26 -**数据版本**:v6 -**物种**:Homo sapiens (人类,Taxonomy ID: 9606) +Summary +- Data snapshot date: 2025-10-26 +- Version: v6 +- Species: Homo sapiens (Taxonomy ID: 9606) --- diff --git a/docs/DATA_DICTIONARY.cn.md b/docs/DATA_DICTIONARY.cn.md new file mode 100644 index 0000000..d09b3ca --- /dev/null +++ b/docs/DATA_DICTIONARY.cn.md @@ -0,0 +1,14 @@ +# 数据字典(Data Dictionary) + +## 概述 + +本文档详细描述了protein_master_v6_clean.tsv主表的所有字段,包括数据类型、来源、说明和空值情况。 + +**主表**:protein_master_v6_clean.tsv +**行数**:19,135条 +**列数**:33列 +**更新日期**:2025-10-26 + +--- + +(原文已保留为中文) diff --git a/docs/DATA_DICTIONARY.md b/docs/DATA_DICTIONARY.md index b867006..b32c6e6 100644 --- a/docs/DATA_DICTIONARY.md +++ b/docs/DATA_DICTIONARY.md @@ -1,17 +1,21 @@ -# 数据字典(Data Dictionary) +# Data Dictionary — `protein_master_v6_clean.tsv` -## 概述 +This document describes the schema and field-level details for the primary protein entity table `protein_master_v6_clean.tsv` (v6 snapshot). -本文档详细描述了protein_master_v6_clean.tsv主表的所有字段,包括数据类型、来源、说明和空值情况。 +Summary +- Rows: 19,135 +- Columns: 33 +- Snapshot date: 2025-10-26 -**主表**:protein_master_v6_clean.tsv -**行数**:19,135条 -**列数**:33列 -**更新日期**:2025-10-26 +Field categories +- Core identifiers: `uniprot_id`, `entry_name`, `protein_name`, `symbol`, `hgnc_id` +- Sequence: `sequence`, `sequence_len`, `mass` +- Cross references and gene IDs: `ncbi_gene_id`, `ensembl_gene_id`, `ensembl_transcript_id`, `gene_synonyms` +- Functional annotations: `function`, `go_biological_process`, `go_molecular_function`, `go_cellular_component` +- Structural information: `pdb_ids`, `alphafold_pdb_url`, `alphafold_mean_plddt` +- Localization, PTMs, and other annotations: `subcellular_location`, `ptms`, `diseases`, `domains`, `isoforms` ---- - -## 字段详细说明 +Full field-level descriptions and examples follow in the sections below, which remain in Chinese; the original Chinese overview is preserved in `docs/DATA_DICTIONARY.cn.md`. 
### 一、基础标识字段 diff --git a/pipelines/protein/contracts/protein_master_v6.json b/pipelines/protein/contracts/protein_master_v6.json new file mode 100644 index 0000000..f224922 --- /dev/null +++ b/pipelines/protein/contracts/protein_master_v6.json @@ -0,0 +1,51 @@ +{ + "name": "protein_master_v6", + "description": "Protein entity master table (v6)", + "required_columns": [ + "uniprot_id", + "sequence", + "sequence_len", + "symbol", + "hgnc_id", + "source", + "fetch_date" + ], + "rules": [ + { + "id": "pk_non_empty", + "type": "non_empty_rate", + "column": "uniprot_id", + "min_rate": 1.0 + }, + { + "id": "pk_unique", + "type": "unique", + "column": "uniprot_id" + }, + { + "id": "sequence_non_empty", + "type": "non_empty_rate", + "column": "sequence", + "min_rate": 1.0 + }, + { + "id": "sequence_charset", + "type": "sequence_charset_rate", + "column": "sequence", + "allowed_chars": ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","X","U","O","B","Z","J","-"], + "min_rate": 1.0 + }, + { + "id": "symbol_non_empty", + "type": "non_empty_rate", + "column": "symbol", + "min_rate": 0.99 + }, + { + "id": "alphafold_plddt_present", + "type": "non_empty_rate", + "column": "alphafold_mean_plddt", + "min_rate": 0.98 + } + ] +}