From 49de3d32a4f34f1484ffe62a4047fb7441188557 Mon Sep 17 00:00:00 2001
From: hazelian <hazelian0619@163.com>
Date: Thu, 5 Feb 2026 03:43:28 +0000
Subject: [PATCH] chore(docs): translate docs to English and add CI data-qa
 workflow + validation contract

---
 .github/CODEOWNERS                            |   5 +
 .github/ISSUE_TEMPLATE/bug_report.md          |  19 +++
 .github/PULL_REQUEST_TEMPLATE.md              |  14 +++
 .github/workflows/data-qa.yml                 |  42 +++++++
 CHANGELOG.md                                  |  10 ++
 CODE_OF_CONDUCT.md                            |   7 ++
 CONTRIBUTING.md                               |  25 ++++
 LICENSE                                       |  21 ++++
 README.cn.md                                  |  18 +++
 README.md                                     | 117 ++++++------------
 data/processed/README.cn.md                   |  11 ++
 data/processed/README.md                      |  13 +-
 docs/DATA_DICTIONARY.cn.md                    |  14 +++
 docs/DATA_DICTIONARY.md                       |  24 ++--
 .../protein/contracts/protein_master_v6.json  |  51 ++++++++
 15 files changed, 297 insertions(+), 94 deletions(-)
 create mode 100644 .github/CODEOWNERS
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md
 create mode 100644 .github/workflows/data-qa.yml
 create mode 100644 CHANGELOG.md
 create mode 100644 CODE_OF_CONDUCT.md
 create mode 100644 CONTRIBUTING.md
 create mode 100644 LICENSE
 create mode 100644 README.cn.md
 create mode 100644 data/processed/README.cn.md
 create mode 100644 docs/DATA_DICTIONARY.cn.md
 create mode 100644 pipelines/protein/contracts/protein_master_v6.json
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..ece9a9f
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,5 @@
+# Code owners for repository
+# Format: <file pattern> <owner>
+/pipelines/ @hazelian0619
+/data/processed/ @hazelian0619
+/docs/ @hazelian0619
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..b8b8fdc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,19 @@
+---
+name: Bug report
+about: Create a report to help us improve
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. 
+2. 
+3. 
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..9a6f33b
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,14 @@
+## Summary
+Brief description of the changes and motivation.
+
+## Changes
+- What changed
+- Why it is needed
+
+## Validation
+- Run `tools/kg_validate_table.py` for affected tables and attach validation report(s)
+
+## Checklist
+- [ ] My code follows the project's style
+- [ ] Validation report attached for data changes
+- [ ] CHANGELOG updated (if applicable)
diff --git a/.github/workflows/data-qa.yml b/.github/workflows/data-qa.yml
new file mode 100644
index 0000000..a5b9b27
--- /dev/null
+++ b/.github/workflows/data-qa.yml
@@ -0,0 +1,42 @@
+name: Data QA  # checks the protein master table against its JSON contract
+
+on:
+  pull_request:
+    paths:  # only run when data, pipeline, tooling or docs files change
+      - 'data/**'
+      - 'pipelines/**'
+      - 'tools/**'
+      - 'docs/**'
+  push:
+    branches: [main]
+    paths:  # same filter as pull_request above
+      - 'data/**'
+      - 'pipelines/**'
+      - 'tools/**'
+      - 'docs/**'
+
+jobs:
+  validate-protein:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # quoted so YAML keeps it a string
+
+      - name: Run protein validation
+        run: |
+          mkdir -p build/validate
+          python3 tools/kg_validate_table.py \
+            --contract pipelines/protein/contracts/protein_master_v6.json \
+            --table data/processed/protein_master_v6_clean.tsv \
+            --out build/validate/protein_master_v6_report.json
+
+      - name: Upload validation report
+        if: ${{ !cancelled() }}  # still upload when the validation step fails
+        uses: actions/upload-artifact@v4
+        with:
+          name: protein_master_v6_report
+          path: build/validate/protein_master_v6_report.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..ae25a1d
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,10 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## [Unreleased]
+- Standardize documentation and add CI data validation workflow
+
+## [v6] - 2025-10-26
+- Primary protein entity table `protein_master_v6_clean.tsv` (19,135 rows × 33 cols)
+- Added gene ID fields and AlphaFold v6 updates
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..915df6e
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,7 @@
+# Code of Conduct
+
+This project follows the Contributor Covenant v2.0. All contributors and maintainers are expected to uphold these standards.
+
+Be respectful and collaborative. Unacceptable behavior will not be tolerated and may result in removal from project discussions or contributions.
+
+Report conduct issues by opening an issue or contacting repository maintainers privately.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..57828c8
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,25 @@
+# Contributing
+
+Thank you for contributing to Protian Entity. This document explains development, validation, and release practices used by the project.
+
+Development workflow
+- Branching: create a branch with a `chore/` prefix for maintenance work (docs, CI, tooling) or a `feat/` prefix for feature work (e.g., `chore/standardize-docs-ci`).
+- Tests & validation: run validation before opening a PR:
+
+```bash
+python3 tools/kg_validate_table.py --contract pipelines/protein/contracts/protein_master_v6.json \
+  --table data/processed/protein_master_v6_clean.tsv --out build/validate/protein_master_v6_report.json
+```
+
+- Commit messages: use clear, imperative messages. Follow conventional commits if possible.
+
+Pull requests
+- Open PRs against `main`. Describe the change, test steps, and link to any data releases.
+- Include validation reports for any changes to entity tables.
+
+Releases
+- Release data artifacts (large L1 tables) via GitHub Releases.
+- Attach `manifest.json` (checksums, row counts, git commit SHA) together with the QA reports.
+
+Contacts
+- For maintenance and code ownership see `.github/CODEOWNERS` or raise an issue.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a600463
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 hazelian0619
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.cn.md b/README.cn.md
new file mode 100644
index 0000000..4b00f94
--- /dev/null
+++ b/README.cn.md
@@ -0,0 +1,18 @@
+# 人类知识图谱数据集（Protein + RNA）
+
+这个仓库按“工业级数据产品”的方式组织：
+
+- **代码 / 规范 / 质量报告**进入仓库（可审计、可复现）
+- **体积大的数据产物**通过 **GitHub Releases** 发布（可下载、可校验、可回滚）
+
+## 快速入口（给同事看这一段就够）
+
+- **Protein（L1）数据集**：`data/processed/protein_master_v6_clean.tsv`（仓库内可直接下载）
+- **RNA（L1, v1）数据集**：Release `rna-l1-v1`（包含 `.tsv.gz` + `manifest.json` + QA 报告）
+  - Release: https://github.com/hazelian0619/protian-entity/releases/tag/rna-l1-v1
+  - RNA 使用说明：`pipelines/rna/README.md`
+  - RNA 规范：`docs/rna/README.md`
+
+---
+
+（本文件仅保留原中文 README 的概述与快速入口；完整说明请参阅英文版 `README.md`。）
diff --git a/README.md b/README.md
index ba303ff..72cf875 100644
--- a/README.md
+++ b/README.md
@@ -1,84 +1,47 @@
-# 人类知识图谱数据集（Protein + RNA）
-
-这个仓库按“工业级数据产品”的方式组织：
-
-- **代码 / 规范 / 质量报告**进入仓库（可审计、可复现）
-- **体积大的数据产物**通过 **GitHub Releases** 发布（可下载、可校验、可回滚）
-
-## 快速入口（给同事看这一段就够）
-
-- **Protein（L1）数据集**：`data/processed/protein_master_v6_clean.tsv`（仓库内可直接下载）
-- **RNA（L1, v1）数据集**：Release `rna-l1-v1`（包含 `.tsv.gz` + `manifest.json` + QA 报告）
-  - Release: https://github.com/hazelian0619/protian-entity/releases/tag/rna-l1-v1
-  - RNA 使用说明：`pipelines/rna/README.md`
-  - RNA 规范：`docs/rna/README.md`
-
----
-
-## 🧬 Protein 实体（L1）
-
-构建以蛋白质为中心的高质量数据集，整合 UniProt、AlphaFold、HGNC、STRING 等多源数据。
-
-### 📊 数据概览
-
-| 项目 | 数量/覆盖率 | 说明 |
-|------|------------|------|
-| **蛋白质总数** | 19,135 | 去重后的人类蛋白质 |
-| **字段数** | 33 | 完整信息字段 |
-| **基因ID映射** | 99.6% | NCBI + Ensembl |
-| **AlphaFold结构** | 99.7% | 含质量评分 |
-| **功能注释** | 86% | 含证据代码+文献 |
-| **GO注释** | 82-94% | 三个维度 |
-| **PDB实验结构** | 44.3% | 实验解析结构 |
-
-**主数据文件**：`data/processed/protein_master_v6_clean.tsv` (60MB, 19,135 行 × 33 列)
-
----
-
-### ✅ 核心字段（33列）
-
-#### 基础信息
-`uniprot_id` | `protein_name` | `gene_names` | `sequence` | `mass`
-
-#### 功能注释
-`function` | `subcellular_location` | `diseases` | `ptms`
-
-#### GO注释
-`go_biological_process` | `go_molecular_function` | `go_cellular_component`
-
-#### 基因ID
-`ncbi_gene_id` | `ensembl_gene_id` | `hgnc_id` | `symbol` | `gene_synonyms`
-
-#### 结构信息
-`alphafold_id` | `alphafold_mean_plddt` | `pdb_ids` | `domains`
-
-#### 交互数据
-`string_ids` | `keywords`
-
----
-
-### 🚀 快速使用
-
-```python
-import pandas as pd
-
-df = pd.read_csv('data/processed/protein_master_v6_clean.tsv', sep='\t')
-
-tp53 = df[df['gene_names'].str.contains('TP53', na=False)]
-print(tp53[['uniprot_id', 'ncbi_gene_id', 'alphafold_mean_plddt']])
+# Protian Entity — Human Protein & RNA Knowledge Graph (Industrial Data Product)
+
+This repository contains curated human Protein and RNA entity datasets and the code, contracts, and QA artifacts required to build, validate, and release them as industrial-grade data products.
+
+Key principles
+- Code, contracts, and QA reports are tracked in Git for auditability and reproducibility.
+- Large data artifacts (L1 tables) are published via GitHub Releases with a manifest and checksums.
+
+Quick links
+- Protein (L1) dataset: `data/processed/protein_master_v6_clean.tsv`
+- RNA (L1) dataset: release [`rna-l1-v1`](https://github.com/hazelian0619/protian-entity/releases/tag/rna-l1-v1) (see `pipelines/rna/README.md`)
+- Validation tool: `tools/kg_validate_table.py`
+
+Repository layout
+- `data/processed/` — final, curated TSV tables (small-to-medium L1 tables are stored here when size permits)
+- `pipelines/` — extraction and ETL pipelines (e.g., `pipelines/rna/`)
+- `docs/` — design documents, data dictionary, quality gate definitions
+- `scripts/`, `tools/` — helper and validation scripts
+
+Data release model
+1. Build entity tables using `pipelines/` scripts in a reproducible environment.
+2. Produce `manifest.json` (checksums, row counts, git commit, build timestamp).
+3. Publish artifacts as a GitHub Release and attach QA reports.
+
+Getting started
+1. Clone repository
+2. Review `docs/DATA_DICTIONARY.md` and `docs/QUALITY_GATES.md` for schema and validation rules
+3. Run validation example:
+
+```bash
+python3 tools/kg_validate_table.py --contract pipelines/protein/contracts/protein_master_v6.json \
+  --table data/processed/protein_master_v6_clean.tsv --out build/validate/protein_master_v6_report.json
 ```
 
----
+Contributing
+- See `CONTRIBUTING.md` for development, testing, and release workflow.
 
-### 📁 辅助数据
+License
+- MIT License — see `LICENSE` for details.
+
+Contact
+- Repository owner: hazelian0619
+- Project maintenance: see `.github/CODEOWNERS` or `CONTRIBUTING.md` for maintainers and contact instructions
 
-```
-data/processed/
-├── alphafold_quality.tsv       # AlphaFold 每残基质量
-├── protein_edges.tsv           # STRING 交互网络（约 88 万条）
-├── ptm_sites.tsv               # 翻译后修饰（约 23 万条）
-└── pathway_members.tsv         # 通路成员（约 12 万条）
-```
 
 ---
 
diff --git a/data/processed/README.cn.md b/data/processed/README.cn.md
new file mode 100644
index 0000000..230b7ed
--- /dev/null
+++ b/data/processed/README.cn.md
@@ -0,0 +1,11 @@
+# 1025 项目 - 处理后数据说明
+
+本目录包含人类蛋白质知识图谱的核心数据集，整合了多个权威生物信息学数据库的信息。
+
+**数据更新时间**：2025-10-26
+**数据版本**：v6
+**物种**：Homo sapiens (人类，Taxonomy ID: 9606)
+
+---
+
+（本文件仅保留原中文概述；其余内容请参阅 `data/processed/README.md`。）
diff --git a/data/processed/README.md b/data/processed/README.md
index d42c2e0..47bb666 100644
--- a/data/processed/README.md
+++ b/data/processed/README.md
@@ -1,12 +1,11 @@
-# 1025 项目 - 处理后数据说明
+# Processed data — Protian Entity (Human Protein Knowledge Graph)
 
-## 概述
+This folder contains the final, curated data tables used as L1 entity products. Files are UTF-8 encoded TSVs and carry provenance information (`source`, `fetch_date`, `source_version`).
 
-本目录包含人类蛋白质知识图谱的核心数据集，整合了多个权威生物信息学数据库的信息。
-
-**数据更新时间**：2025-10-26
-**数据版本**：v6
-**物种**：Homo sapiens (人类，Taxonomy ID: 9606)
+Summary
+- Data snapshot date: 2025-10-26
+- Version: v6
+- Species: Homo sapiens (Taxonomy ID: 9606)
 
 ---
 
diff --git a/docs/DATA_DICTIONARY.cn.md b/docs/DATA_DICTIONARY.cn.md
new file mode 100644
index 0000000..d09b3ca
--- /dev/null
+++ b/docs/DATA_DICTIONARY.cn.md
@@ -0,0 +1,14 @@
+# 数据字典（Data Dictionary）
+
+## 概述
+
+本文档详细描述了protein_master_v6_clean.tsv主表的所有字段，包括数据类型、来源、说明和空值情况。
+
+**主表**：protein_master_v6_clean.tsv
+**行数**：19,135条
+**列数**：33列
+**更新日期**：2025-10-26
+
+---
+
+（本文件仅保留原中文概述；字段详细说明见 `docs/DATA_DICTIONARY.md`。）
diff --git a/docs/DATA_DICTIONARY.md b/docs/DATA_DICTIONARY.md
index b867006..b32c6e6 100644
--- a/docs/DATA_DICTIONARY.md
+++ b/docs/DATA_DICTIONARY.md
@@ -1,17 +1,21 @@
-# 数据字典（Data Dictionary）
+# Data Dictionary — `protein_master_v6_clean.tsv`
 
-## 概述
+This document describes the schema and field-level details for the primary protein entity table `protein_master_v6_clean.tsv` (v6 snapshot).
 
-本文档详细描述了protein_master_v6_clean.tsv主表的所有字段，包括数据类型、来源、说明和空值情况。
+Summary
+- Rows: 19,135
+- Columns: 33
+- Snapshot date: 2025-10-26
 
-**主表**：protein_master_v6_clean.tsv
-**行数**：19,135条
-**列数**：33列
-**更新日期**：2025-10-26
+Field categories
+- Core identifiers: `uniprot_id`, `entry_name`, `protein_name`, `symbol`, `hgnc_id`
+- Sequence: `sequence`, `sequence_len`, `mass`
+- Cross references and gene IDs: `ncbi_gene_id`, `ensembl_gene_id`, `ensembl_transcript_id`, `gene_synonyms`
+- Functional annotations: `function`, `go_biological_process`, `go_molecular_function`, `go_cellular_component`
+- Structural information: `pdb_ids`, `alphafold_pdb_url`, `alphafold_mean_plddt`
+- Localization, PTM, disease, domain, and isoform annotations: `subcellular_location`, `ptms`, `diseases`, `domains`, `isoforms`
 
----
-
-## 字段详细说明
+Full field-level descriptions and examples follow in the sections below (still in Chinese); `docs/DATA_DICTIONARY.cn.md` preserves only the original Chinese overview header.
 
 ### 一、基础标识字段
 
diff --git a/pipelines/protein/contracts/protein_master_v6.json b/pipelines/protein/contracts/protein_master_v6.json
new file mode 100644
index 0000000..f224922
--- /dev/null
+++ b/pipelines/protein/contracts/protein_master_v6.json
@@ -0,0 +1,51 @@
+{
+  "name": "protein_master_v6",
+  "description": "Protein entity master table (v6)",
+  "required_columns": [
+    "uniprot_id",
+    "sequence",
+    "sequence_len",
+    "symbol",
+    "hgnc_id",
+    "source",
+    "fetch_date"
+  ],
+  "rules": [
+    {
+      "id": "pk_non_empty",
+      "type": "non_empty_rate",
+      "column": "uniprot_id",
+      "min_rate": 1.0
+    },
+    {
+      "id": "pk_unique",
+      "type": "unique",
+      "column": "uniprot_id"
+    },
+    {
+      "id": "sequence_non_empty",
+      "type": "non_empty_rate",
+      "column": "sequence",
+      "min_rate": 1.0
+    },
+    {
+      "id": "sequence_charset",
+      "type": "sequence_charset_rate",
+      "column": "sequence",
+      "allowed_chars": ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","X","U","O","B","Z","J","-"],
+      "min_rate": 1.0
+    },
+    {
+      "id": "symbol_non_empty",
+      "type": "non_empty_rate",
+      "column": "symbol",
+      "min_rate": 0.99
+    },
+    {
+      "id": "alphafold_plddt_present",
+      "type": "non_empty_rate",
+      "column": "alphafold_mean_plddt",
+      "min_rate": 0.98
+    }
+  ]
+}