Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 3 additions & 0 deletions PyTorch/build-in/Detection/DETR/DETR/coverage.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
all api: ['_cdist_forward', '_has_compatible_shallow_copy_type', '_index_put_impl_', '_local_scalar_dense', '_log_softmax', '_log_softmax_backward_data', '_reshape_alias', '_softmax', '_softmax_backward_data', '_unique2', 'abs', 'add', 'add_', 'addcdiv', 'addcmul', 'addmm', 'alias', 'any', 'arange', 'as_strided', 'as_strided_', 'baddbmm', 'bmm', 'cat', 'clamp', 'clip_grad_norm_', 'clone', 'contiguous', 'convolution', 'convolution_backward', 'copy_', 'copy_stride', 'cos', 'cumsum', 'div', 'dropout', 'dropout_backward', 'dropout_forward', 'empty', 'empty_strided', 'eq', 'fill_', 'floor_divide', 'ge', 'gt', 'index', 'lerp', 'lt', 'masked_fill_', 'max_pool2d', 'maximum', 'maxpool2d_forward', 'mean', 'minimum', 'mm', 'mul', 'native_batch_norm', 'native_batch_norm_backward', 'native_layer_norm', 'native_layer_norm_backward', 'neg', 'nll_loss_backward', 'nll_loss_forward', 'nonzero', 'normal_', 'pow', 'relu', 'relu_', 'repeat', 'resize_', 'rsqrt', 'sgn', 'sigmoid', 'sigmoid_backward', 'sin', 'sqrt', 'sub', 'sum', 'threshold_backward', 'unfold', 'uniform_', 'upsample_nearest2d', 'view', 'where', 'zero_'], total: 85
fallback op: [], total: 0
coverage rate: 100.00%
1 change: 1 addition & 0 deletions PyTorch/build-in/Detection/DETR/DETR/detr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config_path: configs/detr/detr_r18_8xb2-500e_coco.py
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
29 changes: 29 additions & 0 deletions PyTorch/build-in/Detection/DETR/DETR/detr_loss.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
=== CUDA ===
38.231200 36.005900 25.152300 33.938700 27.764200 26.467700 26.076700 23.761000 22.786200 23.913200
20.859900 22.397500 23.919500 17.078800 18.648800 21.657000 22.126200 20.751300 21.690100 20.243800
16.501200 17.695200 25.384200 22.926800 20.861700 19.734600 17.584400 23.120200 27.424900 15.223700
17.800800 14.525000 19.361100 17.989900 15.531300 15.874200 17.008200 17.509000 12.476900 14.244500
15.951300 22.558300 15.652600 14.586600 36.734000 22.218000 13.832100 20.740700 15.046400 22.039400
24.241800 14.041000 21.103900 19.932600 19.211300 29.035500 16.062000 20.241100 15.616700 26.986000
20.122300 19.437700 18.781600 16.944800 16.304300 14.098600 17.264200 11.881000 12.672500 13.087700
26.163100 14.416000 21.086400 14.339600 30.819200 14.830100 17.145700 21.762800 22.290500 20.639500
16.783000 12.142900 22.276100 21.663000 20.148700 23.399100 16.065500 25.793900 19.534500 15.481100
18.417200 29.240400 26.813200 20.714700 16.573500 15.920300 11.736500 15.671800 21.566800 15.415500

=== SDAA ===
38.155500 35.847700 25.290900 34.077600 28.056500 27.333400 27.245700 24.674800 24.031000 24.543300
21.577000 22.813300 24.258400 17.953600 19.588300 21.946500 22.920500 21.330800 21.934800 20.683500
17.127100 18.251500 25.342100 23.220600 21.176300 20.921600 17.864500 22.862900 27.780200 16.012300
18.127900 15.174900 19.547200 18.099600 15.118700 16.839100 17.463400 18.308400 13.398400 14.442000
15.778000 22.878100 15.702200 14.105500 36.312000 22.291200 14.447000 20.712900 14.950300 21.473900
24.162500 15.334700 19.957200 19.858300 19.447600 27.797600 17.807300 19.956200 16.368500 25.730600
19.670400 19.475100 18.741900 16.230200 16.254400 14.425900 17.572300 11.999400 14.841200 14.084200
25.596800 14.013400 20.505500 13.881800 30.370800 14.669300 17.191400 21.878000 21.697400 20.155200
16.238800 12.159200 21.770300 21.984000 18.893000 21.576300 16.560900 25.402600 18.790300 15.143900
17.865500 28.471400 25.204800 20.280100 17.794100 17.128200 12.210300 17.919500 22.253800 16.508700

=== RESULT ===
MeanRelativeError: 0.011905069239380396
MeanAbsoluteError: 0.16299300000000003
Rule,mean_relative_error 0.011905069239380396
pass mean_relative_error=np.float64(0.011905069239380396) <= 0.05 or mean_absolute_error=np.float64(0.16299300000000003) <= 0.0002
1 change: 1 addition & 0 deletions PyTorch/build-in/Detection/DETR/DETR/run
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bash ../sdaaTest.sh detr 8 0
291 changes: 291 additions & 0 deletions PyTorch/build-in/Detection/DETR/DETR/weloTrainStep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import argparse
import random
import yaml
import numpy as np
import torch
import shutil

from mmengine.config import Config, DictAction
from mmengine.runner import Runner
from mmengine.hooks import Hook
from mmengine.registry import HOOKS
from mmdet.utils import register_all_modules

# ==============================================================================
# 1. 基础路径配置
# ==============================================================================
# ------------------------------------------------------------------
# Base path resolution: prefer a vendored mmdetection checkout living
# next to this script; otherwise fall back to the fixed dev checkout.
# ------------------------------------------------------------------
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
LOCAL_MMDET = os.path.join(CURRENT_DIR, '../mmdetection')

if os.path.exists(LOCAL_MMDET):
    MMDET_ROOT = LOCAL_MMDET
    print(f"[Mode] 检测到本地 mmdetection,进入【独立/打包模式】")
else:
    MMDET_ROOT = '/data/application/wangwl/Detection/mmdetection'
    print(f"[Mode] 未检测到本地 mmdetection,进入【开发模式】")

# ==============================================================================
# 2. 辅助工具
# ==============================================================================
def load_model_yaml(model_name):
    """Return the 'config_path' entry from <model_name>.yml next to this script.

    Exits the process with status 1 when the YAML file does not exist.
    """
    yaml_file = os.path.join(CURRENT_DIR, f"{model_name}.yml")
    if os.path.exists(yaml_file):
        with open(yaml_file, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f).get('config_path')
    print(f"❌ [Error] 找不到配置文件: {yaml_file}")
    sys.exit(1)

def auto_find_dataset_root(base_path):
    """Locate the COCO dataset root holding annotations/instances_train2017.json.

    Checks *base_path* directly first, then walks the tree and returns the
    parent of the first directory containing the annotation file; falls back
    to the (absolute) *base_path* when nothing is found.
    """
    ann_name = 'instances_train2017.json'
    abs_base = os.path.abspath(base_path)
    if os.path.exists(os.path.join(abs_base, 'annotations', ann_name)):
        return abs_base
    hits = (os.path.dirname(root)
            for root, _, files in os.walk(abs_base)
            if ann_name in files)
    return next(hits, abs_base)

def check_device_availability():
    """Abort the process (exit code 1) unless torch can see a CUDA device."""
    if torch.cuda.is_available():
        return
    print("❌ [Error] 未检测到 CUDA 设备")
    sys.exit(1)

# ==============================================================================
# 3. 环境设置
# ==============================================================================
# Reproducibility / single-threaded math settings, applied at import time.
_ENV_DEFAULTS = {
    "CUBLAS_WORKSPACE_CONFIG": ":4096:8",  # deterministic cuBLAS workspaces
    "PYTHONHASHSEED": "12345",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
}
os.environ.update(_ENV_DEFAULTS)

def set_global_seed(seed: int = 42):
    """Seed the python, numpy and torch RNGs and force deterministic cuDNN."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"[Setup] Global Seed: {seed}")

# ==============================================================================
# 4. 日志 Hook
# ==============================================================================
@HOOKS.register_module()
class SdaaLogHook(Hook):
    """Per-iteration loss/LR logger that writes to stdout and a plain text file.

    Args:
        log_file: path of the text log; parent directories are created.
        total_steps: total training iterations, recorded in the header line.
    """

    def __init__(self, log_file='./train.log', total_steps=100):
        self.log_file = os.path.abspath(log_file)
        self.total_steps = total_steps
        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
        # Truncate the log and write a header so every run starts clean.
        with open(self.log_file, 'w') as f:
            f.write(f"==== Training Start (Total Steps: {total_steps}) ====\n")
        print(f"==== Epoch 0 (IterBased Mode) | Log: {self.log_file} ====", flush=True)

    def after_train_iter(self, runner, batch_idx, data_batch=None, outputs=None):
        current_step = runner.iter + 1
        # BUGFIX: `outputs` may be None depending on the loop implementation;
        # the original did `'loss' in outputs` unguarded and raised TypeError.
        loss = outputs['loss'].item() if outputs and 'loss' in outputs else 0.0
        lr = runner.optim_wrapper.get_lr()['lr'][0]
        log_str = f"Iter[{current_step}] step_train_loss {loss:.4f} lr {lr:.2e}"
        print(log_str, flush=True)
        with open(self.log_file, 'a') as f:
            f.write(log_str + "\n")

# ==============================================================================
# 5. Config 构建
# ==============================================================================
def build_auto_config(config_rel_path, args, final_work_dir):
    """Build an mmengine Config for a fixed-step (iteration-based) run.

    Loads the base config from MMDET_ROOT, rewires it for a 2-class
    (dog/cat) COCO-format dataset, installs the SdaaLogHook and a
    step-based checkpoint hook, and applies the optional AMP /
    auto-scale-lr / cfg-options switches.

    Args:
        config_rel_path: config path relative to MMDET_ROOT.
        args: parsed CLI namespace (see parse_args).
        final_work_dir: absolute work directory for this run.

    Returns:
        The fully patched mmengine Config.
    """
    cfg = Config.fromfile(os.path.join(MMDET_ROOT, config_rel_path))

    # Adjust head class counts for the 2-class (dog/cat) dataset.
    if hasattr(cfg.model, 'roi_head'):
        cfg.model.roi_head.bbox_head.num_classes = 2
    if hasattr(cfg.model, 'bbox_head'):
        cfg.model.bbox_head.num_classes = 2

    if args.load_weights:
        print(f"[Init] Backbone initialized with: {args.load_weights}")
        cfg.model.backbone.init_cfg = dict(type='Pretrained', checkpoint=args.load_weights)
    else:
        # No pretrained weights supplied: random backbone init. For
        # alignment runs it is recommended to always pass --load-weights.
        cfg.model.backbone.init_cfg = None
    # BUGFIX: the original unconditionally reset init_cfg to None right
    # after this branch, silently discarding --load-weights for the backbone.

    metainfo = dict(classes=('dog', 'cat'))

    # Annotation path, defined once and shared by dataloaders and evaluator.
    # NOTE(review): val/test also read the *train* annotations — confirm this
    # overfit/alignment setup is intentional.
    ann_file_rel = 'annotations/instances_train2017.json'
    ann_file_full = os.path.join(args.datapath, ann_file_rel)

    # Reuse the pipelines from the base config; they must end with
    # PackDetInputs so the dataset yields model-ready samples.
    train_pipeline = cfg.train_pipeline
    test_pipeline = cfg.test_pipeline

    def get_dataloader_cfg(shuffle=False, pipeline=None):
        """Dataloader dict sharing dataset/annotation settings across splits."""
        return dict(
            batch_size=args.batch_size,
            num_workers=0,
            pin_memory=not args.no_pin_memory,
            persistent_workers=False,
            sampler=dict(type='DefaultSampler', shuffle=shuffle),
            dataset=dict(
                type='CocoDataset',
                data_root=args.datapath,
                metainfo=metainfo,
                ann_file=ann_file_rel,
                data_prefix=dict(img='train2017/'),
                pipeline=pipeline  # required: without it the dataset yields raw dicts
            )
        )

    cfg.train_dataloader = get_dataloader_cfg(True, train_pipeline)
    cfg.val_dataloader = get_dataloader_cfg(False, test_pipeline)
    cfg.test_dataloader = cfg.val_dataloader

    # ---------------- no-validate ----------------
    if args.no_validate:
        cfg.train_cfg = dict(type='IterBasedTrainLoop', max_iters=args.steps)
        cfg.val_dataloader = None
        cfg.val_evaluator = None
        cfg.test_dataloader = None
        cfg.test_evaluator = None
        cfg.val_cfg = None
        cfg.test_cfg = None
        print("[Config] Validation disabled")
    else:
        # Validate exactly once, at the end of training.
        cfg.train_cfg = dict(type='IterBasedTrainLoop', max_iters=args.steps, val_interval=args.steps)
        cfg.val_evaluator = dict(
            type='CocoMetric',
            metric='bbox',
            ann_file=ann_file_full  # CocoMetric needs a resolvable path itself
        )
        cfg.test_evaluator = cfg.val_evaluator

    cfg.work_dir = final_work_dir
    cfg.experiment_name = '.'
    cfg.log_level = 'WARNING'  # keep mmengine's own logging quiet
    cfg.default_hooks.logger = dict(type='LoggerHook', interval=1)

    # Route the per-step loss log either to the caller-provided file or to a
    # default inside the work dir.
    if getattr(args, 'log_file', None):
        target_log_file = args.log_file
        os.makedirs(os.path.dirname(target_log_file), exist_ok=True)
    else:
        target_log_file = os.path.join(final_work_dir, 'train_loss.txt')

    cfg.custom_hooks = [
        dict(
            type='SdaaLogHook',
            total_steps=args.steps,
            log_file=target_log_file
        )
    ]

    # Single checkpoint at the very last iteration.
    cfg.default_hooks.checkpoint = dict(
        type='CheckpointHook',
        interval=args.steps,
        by_epoch=False,
        max_keep_ckpts=1
    )

    # ---------------- AMP ----------------
    if args.amp:
        cfg.optim_wrapper.type = 'AmpOptimWrapper'
        cfg.optim_wrapper.loss_scale = 'dynamic'

    # ---------------- auto-scale-lr ----------------
    if args.auto_scale_lr:
        cfg.auto_scale_lr = dict(enable=True, base_batch_size=16)
        print("[Config] Auto scale LR enabled")

    # ---------------- cfg-options ----------------
    if args.cfg_options:
        print(f"[Config] Apply cfg-options: {args.cfg_options}")
        cfg.merge_from_dict(args.cfg_options)

    return cfg

# ==============================================================================
# 6. 参数解析
# ==============================================================================
def parse_args():
    """Parse CLI arguments for a single fixed-step training run."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', required=True,
                        help='model key; the script reads <model>.yml for the config path')
    parser.add_argument('--steps', type=int, default=100,
                        help='total training iterations')
    parser.add_argument('--batch-size', type=int, default=2)
    parser.add_argument('--val-num', type=int, default=50,
                        help='reserved; currently unused downstream')
    parser.add_argument('--datapath', type=str, default='./data')
    # BUGFIX: the default was the bool False on a str-typed option; use None
    # so the attribute is consistently Optional[str] (both are falsy, so all
    # `if args.load_weights:` checks behave identically).
    parser.add_argument('--load-weights', type=str, default=None)
    parser.add_argument('--name', type=str, default='train')
    parser.add_argument('--seed', type=int, default=2025)
    parser.add_argument('--amp', action='store_true')

    # Extra switches forwarded into the mmengine config.
    parser.add_argument('--no-validate', action='store_true')
    parser.add_argument('--auto-scale-lr', action='store_true')
    parser.add_argument('--no-pin-memory', action='store_true')
    parser.add_argument('--cfg-options', nargs='+', action=DictAction)

    return parser.parse_args()

# ==============================================================================
# 7. Main
# ==============================================================================
def to_camel_case(snake_str):
    """Convert snake_case to CamelCase, e.g. 'faster_rcnn' -> 'FasterRcnn'."""
    return ''.join(map(str.title, snake_str.split('_')))

def main():
    """Entry point: seed, resolve paths, build the config and run training."""
    args = parse_args()
    set_global_seed(args.seed)
    register_all_modules(init_default_scope=False)
    check_device_availability()

    # Resolve the base config (from <model>.yml) and the dataset root.
    config_rel_path = load_model_yaml(args.model)
    args.datapath = auto_find_dataset_root(args.datapath)

    # Work dir lives under ./result/<Name> relative to the invocation cwd.
    work_root = os.getcwd()
    final_work_dir = os.path.join(work_root, 'result', args.name.capitalize())
    os.makedirs(final_work_dir, exist_ok=True)

    # Log file name: <ModelCamel><Name>.log inside the work dir.
    model_camel = to_camel_case(args.model)
    run_name_cap = args.name.capitalize()
    log_name = f"{model_camel}{run_name_cap}.log"

    # model_root_dir = os.path.join(CURRENT_DIR, model_camel)
    # args.log_file = os.path.join(model_root_dir, log_name)
    args.log_file = os.path.join(final_work_dir, log_name)

    cfg = build_auto_config(config_rel_path, args, final_work_dir)
    # cfg.optim_wrapper.optimizer.lr = 1e-4

    if args.load_weights:
        cfg.load_from = args.load_weights
        print(f"[Load] weights from {args.load_weights}")

    runner = Runner.from_cfg(cfg)

    if not args.load_weights:
        # Snapshot the random initialization (into the current directory) so
        # a later run on another device can start from identical weights.
        save_path = './random_init_weights.pth'
        print(f"[Init] Saving random initialization to: {save_path}")
        torch.save(runner.model.state_dict(), save_path)
        # NOTE(review): the message below says "Exiting" but execution falls
        # through to runner.train() — confirm whether an early return was
        # intended here.
        print("[Init] Done. Exiting...")

    runner.train()


if __name__ == '__main__':
    main()
34 changes: 34 additions & 0 deletions PyTorch/build-in/Detection/DETR/mmdetection/.circleci/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
version: 2.1

# this allows you to use CircleCI's dynamic configuration feature
setup: true

# the path-filtering orb is required to continue a pipeline based on
# the path of an updated fileset
orbs:
  path-filtering: circleci/path-filtering@0.1.2

workflows:
  # the always-run workflow is always triggered, regardless of the pipeline parameters.
  always-run:
    jobs:
      # the path-filtering/filter job determines which pipeline
      # parameters to update.
      - path-filtering/filter:
          name: check-updated-files
          # 3-column, whitespace-delimited mapping. One mapping per
          # line:
          # <regex path-to-test> <parameter-to-set> <value-of-pipeline-parameter>
          mapping: |
            mmdet/.* lint_only false
            requirements/.* lint_only false
            tests/.* lint_only false
            tools/.* lint_only false
            configs/.* lint_only false
            .circleci/.* lint_only false
          base-revision: dev-3.x
          # this is the path of the configuration we should trigger once
          # path filtering and pipeline parameter value updates are
          # complete. In this case, we are using the parent dynamic
          # configuration itself.
          config-path: .circleci/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Build args pin the base PyTorch image; override at build time if needed.
ARG PYTORCH="1.8.1"
ARG CUDA="10.2"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

# To fix GPG key error when running apt-get update
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# Build tooling (ninja) plus shared libraries OpenCV needs at runtime.
RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx
Loading