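# NOTE: the three lines below were captured from the GitHub Actions web UI
# together with the workflow; they are not part of the workflow definition.
#   Skip to content
#   first commit17
#   first commit17 #20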

name: "Markdown to PDF Converter with Tiled Watermark"

# Triggers: pushes and pull requests targeting main / v* branches, every tag
# push, and manual runs from the Actions tab.
on:
  push:
    branches: ["main", "v*"]
    tags: ["*"]
  pull_request:
    branches: ["main", "v*"]
  workflow_dispatch:

# Token scopes granted to every job in this workflow.
permissions:
  # Needed by the release step to create a release and upload its asset.
  contents: write
  # NOTE(review): no step visible in this workflow deploys to Pages or
  # requests an OIDC token - confirm these two scopes are still required.
  pages: write
  id-token: write
# Single job: cleans the Markdown sources, renders them to PDF with
# pandoc/xelatex, and publishes the results.
jobs:
  convert-md-to-pdf:
    runs-on: ubuntu-latest
    steps:
# fetch-depth 0 disables the shallow clone, so full history is available.
- name: "Checkout repository"
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
# Toolchain for the conversion: pandoc, a XeLaTeX distribution with CJK
# support and fonts, Pygments (used by minted), plus a few CLI helpers.
- name: "Install dependencies"
  run: |
    sudo apt-get update
    sudo apt-get install -y \
      pandoc \
      texlive-xetex \
      texlive-latex-extra \
      texlive-fonts-recommended \
      texlive-latex-recommended \
      texlive-lang-chinese \
      fonts-noto-cjk \
      fonts-noto-cjk-extra \
      fonts-wqy-microhei \
      fonts-wqy-zenhei \
      lmodern \
      tree \
      jq \
      python3-pip \
      python3-pygments
    # NOTE(review): the script generated by this workflow imports only
    # os/re/sys - confirm these two packages are still needed.
    pip3 install beautifulsoup4 markdown-it-py
# Writes the heredoc that follows to convert_html_clean.py; the script rewrites the repository's Markdown files in place.
- name: "Clean HTML to Markdown converter with fixed link handling"
run: |
cat > convert_html_clean.py << 'EOF'
import os
import re
import sys
def protect_code_blocks(content):
    """Replace fenced code, inline code and block quotes with placeholders.

    The three kinds of span are stashed in that order, so a later stash may
    capture placeholders produced by an earlier one (for example a quote
    block that contains inline code).

    Args:
        content: Markdown text.

    Returns:
        A tuple ``(protected_content, code_blocks, inline_code_blocks,
        quote_blocks)``. Placeholders have the form ``@@@<KIND>_<index>@@@``
        and index into the corresponding list.
    """
    def stash(pattern, store, label, text):
        # Swap every match for "@@@<label>_<n>@@@" and remember the original.
        def swap(match):
            store.append(match.group(0))
            return f"@@@{label}_{len(store)-1}@@@"
        return pattern.sub(swap, text)

    code_blocks = []
    inline_code_blocks = []
    quote_blocks = []

    code_pattern = re.compile(r'```(?:[a-zA-Z0-9_+-]*\n)?.*?\n```', re.DOTALL)
    inline_pattern = re.compile(r'`[^`\n]+`')
    # A quote block is a run of consecutive lines that start with '>'.
    quote_pattern = re.compile(r'^(?:>.*\n?)+', re.MULTILINE)

    protected_content = stash(code_pattern, code_blocks, "CODE_BLOCK", content)
    protected_content = stash(inline_pattern, inline_code_blocks, "INLINE_CODE", protected_content)
    protected_content = stash(quote_pattern, quote_blocks, "QUOTE_BLOCK", protected_content)
    return protected_content, code_blocks, inline_code_blocks, quote_blocks
def restore_code_blocks(content, code_blocks, inline_code_blocks, quote_blocks):
    """Put the spans stashed by ``protect_code_blocks`` back into the text.

    Protection runs code -> inline -> quote, so a stashed quote block can
    itself contain inline/code placeholders. Restoration therefore has to run
    in the reverse order (quote -> inline -> code); restoring quotes last
    would leave those nested placeholders in the output.

    Args:
        content: Text containing ``@@@<KIND>_<index>@@@`` placeholders.
        code_blocks: Originals for ``CODE_BLOCK`` placeholders.
        inline_code_blocks: Originals for ``INLINE_CODE`` placeholders.
        quote_blocks: Originals for ``QUOTE_BLOCK`` placeholders.

    Returns:
        The text with every placeholder replaced by its original span.
    """
    restored_content = content
    for label, originals in (
        ("QUOTE_BLOCK", quote_blocks),
        ("INLINE_CODE", inline_code_blocks),
        ("CODE_BLOCK", code_blocks),
    ):
        for index, original in enumerate(originals):
            restored_content = restored_content.replace(f"@@@{label}_{index}@@@", original)
    return restored_content
def process_images_in_content(content):
    """Normalise images on every line that is not part of a pipe table.

    A line that starts with '|' and contains a second '|' opens a table; the
    table lasts until the next blank line. Table lines and blank lines are
    copied unchanged; every other line goes through
    ``process_non_table_images``.

    Args:
        content: Markdown text.

    Returns:
        The text with non-table lines rewritten.
    """
    processed_lines = []
    in_table = False
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped.startswith('|') and '|' in line[1:]:
            in_table = True
            processed_lines.append(line)
        elif stripped == '':
            in_table = False
            processed_lines.append(line)
        elif in_table:
            processed_lines.append(line)
        else:
            processed_lines.append(process_non_table_images(line))
    return '\n'.join(processed_lines)
def process_non_table_images(line):
    """Convert HTML ``<img>`` tags to Markdown images and isolate each image.

    Steps, in order:
      1. ``<img ...>`` becomes ``![alt](src)`` wrapped in newlines; a leading
         ``./`` or ``/`` is removed from ``src``.
      2. Every Markdown image (including those produced by step 1) is wrapped
         in newlines.
      3. Pandoc ``{width=...}`` attributes are removed, together with any
         whitespace in front of them.

    Args:
        line: A single line of Markdown.

    Returns:
        The rewritten line (it may now contain newline characters).
    """
    img_pattern = re.compile(r'<img\s+([^>]+)>', re.IGNORECASE)
    md_img_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    def replace_img_tag(match):
        attrs_str = match.group(1)
        src_match = re.search(r'src\s*=\s*["\']([^"\']+)["\']', attrs_str)
        alt_match = re.search(r'alt\s*=\s*["\']([^"\']*)["\']', attrs_str)
        src = src_match.group(1) if src_match else ''
        alt = alt_match.group(1) if alt_match else ''
        if src:
            # Make the path relative: drop a leading "./" and then a leading "/".
            src = re.sub(r'^\./', '', src)
            src = re.sub(r'^/', '', src)
        return f'\n![{alt}]({src})\n'

    def ensure_img_newlines(match):
        return f'\n![{match.group(1)}]({match.group(2)})\n'

    line = img_pattern.sub(replace_img_tag, line)
    line = md_img_pattern.sub(ensure_img_newlines, line)
    line = re.sub(r'\s*\{width=[^}]+\}', '', line)
    return line
def remove_image_labels_and_figures(content):
    """Drop figure-caption lines and strip ``<center>`` tags.

    Rules, applied per line in this order:
      * A line containing a Markdown image is kept as is; up to three caption
        lines directly after it are dropped.
      * A stand-alone caption line ("图1: ...", "图片2: ...", "图 3 : ...") is
        dropped.
      * A pipe-table row is kept as is.
      * ``<center>图N: ...</center>`` is dropped.
      * Anything else is kept with ``<center>`` / ``</center>`` removed.

    Args:
        content: Markdown text.

    Returns:
        The filtered text.
    """
    # Both the ASCII and the fullwidth colon are accepted after the label;
    # the character class originally listed ':' twice, which looks like a
    # fullwidth colon that was lost in transcription.
    caption_re = re.compile(r'^(图\d*[:：]\s*.*|图片\d*[:：]\s*.*|图\s*\d+\s*[:：]\s*.*)$')
    centered_caption_re = re.compile(r'^<center>图\d*[:：].*</center>$', re.IGNORECASE)
    image_re = re.compile(r'!\[.*\]\(.*\)')

    lines = content.split('\n')
    processed_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if image_re.search(line):
            processed_lines.append(line)
            i += 1
            # Skip at most three consecutive caption lines. Only `i` moves, so
            # no line is jumped over and the index can never run past the end
            # (the earlier offset-plus-index look-ahead did both).
            skipped = 0
            while skipped < 3 and i < len(lines) and caption_re.match(lines[i].strip()):
                i += 1
                skipped += 1
            continue

        if caption_re.match(stripped):
            i += 1
            continue

        if stripped.startswith('|') and '|' in stripped[1:]:
            processed_lines.append(line)
            i += 1
            continue

        if centered_caption_re.match(stripped):
            i += 1
            continue

        processed_lines.append(re.sub(r'</?center>', '', line, flags=re.IGNORECASE))
        i += 1
    return '\n'.join(processed_lines)
def fix_tilde_in_text(content):
    """Replace ``~`` between two digits with ``-`` outside of code.

    Fenced blocks and inline code spans are stashed behind placeholders
    first, so ranges such as ``5~10`` are rewritten in prose only.

    Args:
        content: Markdown text.

    Returns:
        The text with digit~digit rewritten as digit-digit outside code.
    """
    code_blocks = []
    inline_blocks = []

    def replace_code(match):
        code_blocks.append(match.group(0))
        return f"@@@CODE_BLOCK_PROTECT_{len(code_blocks)-1}@@@"

    def replace_inline(match):
        inline_blocks.append(match.group(0))
        return f"@@@INLINE_CODE_PROTECT_{len(inline_blocks)-1}@@@"

    protected_content = re.sub(r'```.*?```', replace_code, content, flags=re.DOTALL)
    protected_content = re.sub(r'`[^`\n]+`', replace_inline, protected_content)

    protected_content = re.sub(r'(?<=\d)~(?=\d)', r'-', protected_content)

    # Restore in the reverse order of stashing: an inline span may have
    # swallowed a fence placeholder, which must be expanded afterwards.
    for i, inline_block in enumerate(inline_blocks):
        protected_content = protected_content.replace(f"@@@INLINE_CODE_PROTECT_{i}@@@", inline_block)
    for i, code_block in enumerate(code_blocks):
        protected_content = protected_content.replace(f"@@@CODE_BLOCK_PROTECT_{i}@@@", code_block)
    return protected_content
def preserve_markdown_links(content):
    """Rewrite HTML anchors and ``<http...>`` autolinks as Markdown links.

    * ``<a href="X">text</a>`` becomes ``[text](X)``. For targets that are not
      http(s)/mailto, a leading ``./`` or ``/`` is removed. Anchors without an
      ``href`` collapse to their text.
    * ``<https://example.org>`` becomes ``[https://example.org](https://example.org)``.

    Existing Markdown links are left untouched.

    Args:
        content: Markdown text.

    Returns:
        The text with HTML links converted.
    """
    html_link_pattern = re.compile(r'<a\s+([^>]+)>(.*?)</a>', re.IGNORECASE | re.DOTALL)
    special_link_pattern = re.compile(r'<(https?://[^>]+)>')

    def replace_html_link(match):
        link_text = match.group(2).strip()
        href_match = re.search(r'href\s*=\s*["\']([^"\']+)["\']', match.group(1))
        if not href_match:
            return link_text
        href = href_match.group(1)
        if not href.startswith(('http://', 'https://', 'mailto:')):
            href = re.sub(r'^\./', '', href)
            href = re.sub(r'^/', '', href)
        return f'[{link_text}]({href})'

    def replace_special_link(match):
        url = match.group(1)
        return f'[{url}]({url})'

    protected_content = html_link_pattern.sub(replace_html_link, content)
    protected_content = special_link_pattern.sub(replace_special_link, protected_content)
    return protected_content
def fix_special_characters(content):
    """Rewrite a few number notations with a dash.

    * ``(0,100)`` and ``(0.100)`` become ``(0-100)``.
    * ``d:d`` (a colon between two digits) becomes ``d-d``.
    * ``(N,MMM)`` / ``(N.MM)`` - a parenthesised number followed by ``.`` or
      ``,`` and two or three digits - becomes ``(N-MMM)``.

    The parentheses are matched literally. Unescaped, they were regex groups,
    so every bare decimal such as ``3.14`` was rewritten to ``(3.14-3)`` and
    ``(0,100)`` gained a second pair of parentheses.

    NOTE(review): the ``d:d`` rule also rewrites times and host:port pairs
    (``12:30`` -> ``12-30``) - confirm that is intended.

    Args:
        content: Markdown text.

    Returns:
        The rewritten text.
    """
    content = re.sub(r'\(0,100\)', r'(0-100)', content)
    content = re.sub(r'\(0\.100\)', r'(0-100)', content)
    content = re.sub(r'(\d):(\d)', r'\1-\2', content)
    content = re.sub(r'\((\d+)[.,](\d{2,3})\)', r'(\1-\2)', content)
    return content
def fix_text_code_blocks(content):
    """Hard-wrap long lines inside ```` ```text ```` fenced blocks.

    Each line of such a block is right-stripped; a line longer than 60
    characters is cut into consecutive 50-character pieces. Other fenced
    blocks are not touched.

    Args:
        content: Markdown text.

    Returns:
        The text with ``text`` blocks re-wrapped.
    """
    text_pattern = re.compile(r'```text\n(.*?)\n```', re.DOTALL)

    def fix_text_content(match):
        fixed_lines = []
        for raw_line in match.group(1).split('\n'):
            line = raw_line.rstrip()
            if len(line) > 60:
                fixed_lines.extend(line[start:start + 50] for start in range(0, len(line), 50))
            else:
                fixed_lines.append(line)
        fixed_content = '\n'.join(fixed_lines)
        return f'```text\n{fixed_content}\n```'

    return text_pattern.sub(fix_text_content, content)
def process_markdown_file(filepath):
    """Clean one Markdown file in place.

    Pipeline: fix special characters -> wrap ``text`` code blocks -> stash
    code/inline/quote spans -> convert HTML links -> normalise images ->
    drop figure captions -> fix ``~`` ranges -> strip remaining HTML tags
    (except code/pre/span/div/p/br and the placeholders) -> restore the
    stashed spans -> write the result back to ``filepath``.

    NOTE(review): the first two steps run before code is stashed, so
    ``fix_special_characters`` also rewrites text inside code blocks.
    NOTE(review): the file is read with ``errors='ignore'``, so bytes that
    are not valid UTF-8 are dropped when the file is written back.

    Args:
        filepath: Path of the Markdown file to rewrite.

    Returns:
        True on success, False if anything failed (the error is reported on
        stderr instead of being swallowed silently).
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        content = fix_special_characters(content)
        content = fix_text_code_blocks(content)

        protected_content, code_blocks, inline_code_blocks, quote_blocks = protect_code_blocks(content)
        protected_content = preserve_markdown_links(protected_content)

        processed_content = process_images_in_content(protected_content)
        processed_content = remove_image_labels_and_figures(processed_content)
        processed_content = fix_tilde_in_text(processed_content)
        # Strip any remaining HTML tag except the whitelisted ones.
        processed_content = re.sub(r'<(?!@@@CODE_BLOCK_|@@@INLINE_CODE_|@@@QUOTE_BLOCK_)(?!\/?code>)(?!\/?pre>)(?!\/?span>)(?!\/?div>)(?!\/?p>)(?!\/?br>)[^>]+>', '', processed_content)

        final_content = restore_code_blocks(processed_content, code_blocks, inline_code_blocks, quote_blocks)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(final_content)
        return True
    except Exception as e:
        # Keep the best-effort contract (return False), but say what failed.
        print(f"Failed to process {filepath}: {e}", file=sys.stderr)
        return False
# Script entry point: clean every *.md file below the current directory,
# skipping VCS, tooling and output directories.
skip_dirs = {'.git', 'pdf_output', '__pycache__', '.github', 'node_modules', 'output'}
processed = 0
for root, dirs, files in os.walk('.'):
    # Prune in place so os.walk does not descend into skipped directories.
    dirs[:] = [d for d in dirs if d not in skip_dirs]
    for file in files:
        if file.endswith('.md'):
            filepath = os.path.join(root, file)
            if process_markdown_file(filepath):
                processed += 1
# The counter was never reported; surface it in the job log.
print(f"Processed {processed} Markdown file(s)")
EOF
# Execute the script that the heredoc above wrote to disk.
python3 convert_html_clean.py
# Writes header.tex, the LaTeX preamble that pandoc includes for every
# document: CJK fonts, link colours, code/quote styling, image scaling and
# the tiled "emakefun" watermark.
- name: "Create LaTeX template"
  run: |
    cat > header.tex << 'EOF'
    % --- CJK support and fonts ---
    \usepackage{xeCJK}
    \usepackage[UTF8]{ctex}
    \setCJKmainfont{Noto Serif CJK SC}
    \setCJKsansfont{Noto Sans CJK SC}
    \setCJKmonofont{Noto Sans Mono CJK SC}
    \usepackage[margin=2.0cm]{geometry}

    % --- Hyperlinks ---
    % hyperref colour options take a colour name, so the link colour is
    % defined once and referenced by name.
    \usepackage{xcolor}
    \usepackage{hyperref}
    \definecolor{linkblue}{rgb}{0.03,0.4,0.84}
    \hypersetup{
      colorlinks=true,
      linkcolor=linkblue,
      urlcolor=linkblue,
      citecolor=linkblue,
      pdfborder={0 0 0},
    }

    % --- Code, figures, captions ---
    \usepackage{minted}
    \usepackage{graphicx}
    \usepackage{float}
    \usepackage{placeins}
    \usepackage{mdframed}
    \usepackage{xspace}
    \usepackage{caption}
    \captionsetup[figure]{labelformat=empty, textformat=empty}
    \usepackage{subcaption}
    \captionsetup[subfigure]{labelformat=empty, textformat=empty}
    % Define colours
    \definecolor{codebg}{rgb}{0.98,0.98,0.98}
    \definecolor{framecolor}{rgb}{0.85,0.85,0.85}
    % Configure the minted style
    \setminted{
      style=tango,
      bgcolor=codebg,
      frame=single,
      framesep=6pt,
      framerule=0.8pt,
      rulecolor=framecolor,
      breaklines=true,
      breakanywhere=true,
      tabsize=2,
      fontsize=\small,
      xleftmargin=10pt,
      xrightmargin=10pt,
      numbers=none,
      autogobble,
    }
    % Place figures exactly where they appear in the source.
    \floatplacement{figure}{H}

    % --- Tables and rules ---
    \usepackage{booktabs}
    \setlength{\heavyrulewidth}{0.08em}
    \setlength{\lightrulewidth}{0.05em}
    \setlength{\aboverulesep}{0.2em}
    \setlength{\belowrulesep}{0.2em}
    \newcommand{\HRule}{\rule{\linewidth}{0.5mm}}

    % --- Paragraph and heading layout ---
    \usepackage{parskip}
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{0.5em}
    \setlength{\itemsep}{0pt}
    \setlength{\topsep}{0pt}
    \usepackage{titlesec}
    \titleformat{\section}{\Large\bfseries\sffamily}{\thesection}{1em}{}
    \titleformat{\subsection}{\large\bfseries\sffamily}{\thesubsection}{1em}{}
    \titleformat{\subsubsection}{\normalsize\bfseries\sffamily}{\thesubsubsection}{1em}{}
    \titlespacing*{\section}{0pt}{12pt}{6pt}
    \titlespacing*{\subsection}{0pt}{8pt}{4pt}
    \titlespacing*{\subsubsection}{0pt}{6pt}{2pt}
    \renewcommand{\familydefault}{\sfdefault}
    \sloppy
    \emergencystretch=3em

    % --- Tiled watermark: a 3 x 4 grid of rotated labels on every page ---
    \usepackage{eso-pic}
    \usepackage{tikz}
    \usetikzlibrary{calc}
    \newcommand\BackgroundPicture{%
      \begin{tikzpicture}[remember picture,overlay]
        \coordinate (page center) at (current page.center);
        \pgfmathsetmacro{\hcount}{3}
        \pgfmathsetmacro{\vcount}{4}
        % Grid pitch. Named \wmhstep/\wmvstep so that the LaTeX commands
        % \hspace and \vspace are not overwritten inside the picture.
        \pgfmathsetmacro{\wmhstep}{\paperwidth/(\hcount+1)}
        \pgfmathsetmacro{\wmvstep}{\paperheight/(\vcount+1)}
        \foreach \i in {1,...,\hcount} {
          \foreach \j in {1,...,\vcount} {
            \pgfmathsetmacro{\x}{-\paperwidth/2 + \i*\wmhstep}
            \pgfmathsetmacro{\y}{-\paperheight/2 + \j*\wmvstep}
            \node at ($(page center) + (\x pt, \y pt)$) [
              rotate=45,
              scale=1.5,
              text=gray!30,
              opacity=0.45,
              font=\sffamily\bfseries\large
            ] {emakefun};
          }
        }
      \end{tikzpicture}%
    }
    \AddToShipoutPictureBG{\BackgroundPicture}

    % --- Images: search paths, centring, never wider than the line ---
    \graphicspath{{./}{./resource/}}
    \makeatletter
    \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
    \makeatother
    \let\oldincludegraphics\includegraphics
    \renewcommand{\includegraphics}[2][]{%
      \begingroup
      \centering
      \oldincludegraphics[#1,width=\maxwidth]{#2}%
      \endgroup
    }

    % --- Table of contents and verbatim helpers ---
    \usepackage{titletoc}
    \setcounter{tocdepth}{-1}
    \usepackage{fvextra}
    \usepackage{csquotes}

    % --- Block quotes rendered as a shaded box ---
    \definecolor{quotebg}{RGB}{246,248,250}
    \definecolor{quoteframe}{RGB}{225,228,232}
    \newmdenv[
      backgroundcolor=quotebg,
      linecolor=quoteframe,
      linewidth=0.5pt,
      roundcorner=6pt,
      innerleftmargin=12pt,
      innerrightmargin=12pt,
      innertopmargin=8pt,
      innerbottommargin=8pt,
      skipabove=6pt,
      skipbelow=6pt,
    ]{quotebox}
    \newcommand{\quoteblock}[1]{%
      \begin{quotebox}
      #1
      \end{quotebox}
    }
    EOF
# Writes quote-filter.lua, a pandoc Lua filter.
# NOTE(review): the pandoc command in the "Convert Markdown to PDF" step
# passes no --lua-filter option, so this filter looks unused - confirm.
- name: "Create Pandoc filter"
  run: |
    cat > quote-filter.lua << 'EOF'
    -- Block quotes become a quotebox environment; horizontal rules become a
    -- padded \HRule. All other blocks pass through unchanged.
    function Block(el)
      if el.t == "BlockQuote" then
        local content = ""
        for i, item in ipairs(el.content) do
          content = content .. pandoc.write(pandoc.Pandoc({item}), "latex")
        end
        return pandoc.RawBlock('latex', '\\begin{quotebox}\n' .. content .. '\\end{quotebox}')
      end
      if el.t == "HorizontalRule" then
        return pandoc.RawBlock('latex', '\\vspace{0.5em}\\HRule\\vspace{0.5em}')
      end
      return el
    end

    -- Code blocks whose first class is "text" are emitted as lstlisting.
    -- NOTE(review): header.tex neither loads the listings package nor
    -- defines a "textstyle" style - confirm before enabling this filter.
    function CodeBlock(el)
      if el.classes[1] == "text" then
        return pandoc.RawBlock('latex',
          '\\begin{lstlisting}[style=textstyle]\n' ..
          el.text ..
          '\n\\end{lstlisting}')
      end
      return el
    end

    -- Rewrite "(0,100)", "(0.100)" and "(N,DDD)" / "(N.DDD)" with a dash.
    -- Parentheses are escaped as %( and %) so they match literally; written
    -- bare they are Lua captures, and the last pattern had unbalanced
    -- captures, which makes gsub raise a "malformed pattern" error.
    function Str(el)
      local text = el.text
      text = text:gsub('%(0,100%)', '(0-100)')
      text = text:gsub('%(0%.100%)', '(0-100)')
      text = text:gsub('%((%d+)[.,](%d%d%d)%)', '(%1-%2)')
      return pandoc.Str(text)
    end

    -- Inline code is rebuilt from its text only, dropping any attributes.
    function Code(el)
      return pandoc.Code(el.text)
    end

    -- Images lose their caption.
    function Image(el)
      el.caption = {}
      return el
    end
    EOF
# Renders every Markdown file to a PDF next to its source. A failed file is
# reported as a warning; the step fails only when nothing could be converted.
- name: "Convert Markdown to PDF"
  run: |
    # Collect the sources NUL-separated so unusual file names survive.
    mapfile -d '' -t md_files < <(
      find . -name "*.md" \
        -not -path "./.*" \
        -not -path "*/node_modules/*" \
        -not -path "./output/*" \
        -print0
    )

    converted_count=0
    failed_count=0
    CURRENT_DATE=$(date '+%Y-%m-%d')

    for md_file in "${md_files[@]}"; do
      md_file="${md_file#./}"
      [ -z "$md_file" ] && continue
      dir_name=$(dirname "$md_file")
      base_name=$(basename "$md_file" .md)
      pdf_file="$dir_name/$base_name.pdf"
      # Images are looked up next to the document, in a sibling "resource"
      # directory, and in the repository root.
      resource_paths="$dir_name:$dir_name/../resource:.:./resource"

      # "2>&1 >/dev/null" keeps pandoc's diagnostics in the log and discards
      # its standard output. Running pandoc as the `if` condition keeps a
      # failure from aborting the step under `bash -e`.
      if pandoc "$md_file" \
          --resource-path="$resource_paths" \
          -o "$pdf_file" \
          --pdf-engine=xelatex \
          --pdf-engine-opt=-shell-escape \
          --highlight-style=pygments \
          --include-in-header=header.tex \
          -V mainfont="Noto Sans CJK SC" \
          -V sansfont="Noto Sans CJK SC" \
          -V monofont="Noto Sans Mono CJK SC" \
          -V geometry:margin=2.0cm \
          -V geometry:a4paper \
          -V colorlinks=true \
          -V linkcolor=blue \
          -V urlcolor=blue \
          -V graphics \
          --wrap=auto \
          -f markdown \
          -M date="$CURRENT_DATE" \
          2>&1 >/dev/null \
        && [ -s "$pdf_file" ]; then
        converted_count=$((converted_count + 1))
      else
        failed_count=$((failed_count + 1))
        echo "::warning file=$md_file::pandoc could not convert $md_file"
      fi
    done

    echo "Converted: $converted_count, failed: $failed_count"
    if [ "$converted_count" -eq 0 ] && [ "$failed_count" -gt 0 ]; then
      echo "::error::No Markdown file could be converted to PDF"
      exit 1
    fi
# Gives every directory that holds a PDF its own "resource" directory, filled
# from the repository-level ./resource and from the parent directory's
# resource folder. Directories that already have one are left alone.
- name: "Copy resource directories"
  run: |
    # copy_resources SRC DST: merge the contents of SRC into DST, best effort.
    copy_resources() {
      if command -v rsync >/dev/null 2>&1; then
        rsync -a --exclude='resource/' "$1/" "$2/" >/dev/null 2>&1 || true
      else
        # Plain recursive copy; "cp --parents" would recreate the whole
        # source path below the destination instead of filling DST.
        mkdir -p "$2"
        cp -R "$1/." "$2/" 2>/dev/null || true
      fi
    }

    # sort -u reads the complete list before the loop starts, so directories
    # created by the loop are never fed back into it.
    while IFS= read -r pdf_dir; do
      [ -z "$pdf_dir" ] && continue
      if [ -d "$pdf_dir/resource" ]; then
        continue
      fi
      if [ -d "./resource" ] && [ "$pdf_dir" != "." ]; then
        copy_resources "./resource" "$pdf_dir/resource"
      fi
      parent_dir=$(dirname "$pdf_dir")
      if [ -d "$parent_dir/resource" ] && [ "$parent_dir" != "." ]; then
        copy_resources "$parent_dir/resource" "$pdf_dir/resource"
      fi
      # Never keep a nested resource/resource directory.
      if [ -d "$pdf_dir/resource/resource" ]; then
        rm -rf "$pdf_dir/resource/resource"
      fi
    done < <(
      find . -name "*.pdf" -type f \
        -not -path "./.*" \
        -not -path "*/node_modules/*" \
        -not -path "*/resource/*" \
        -printf '%h\n' | sort -u
    )
# Fails the job when the conversion produced no PDF at all.
- name: "Verify PDF files"
  run: |
    total_pdfs=$(find . -name "*.pdf" -type f -not -path "./.*" -not -path "*/node_modules/*" 2>/dev/null | wc -l)
    echo "Found $total_pdfs PDF file(s)"
    if [ "$total_pdfs" -eq 0 ]; then
      # Say why the step fails instead of exiting silently.
      echo "::error::No PDF files were produced"
      exit 1
    fi
# Tag builds only: stages every PDF, together with the resource directories
# that belong to it, in pdf_collection/ and zips that tree as
# <repository>-<tag>.zip in the workspace root.
- name: "Create zip archive for release"
  if: startsWith(github.ref, 'refs/tags/')
  env:
    # Passed through the environment rather than pasted into the script text.
    REPO_NAME: ${{ github.event.repository.name }}
  run: |
    TAG_NAME="${GITHUB_REF#refs/tags/}"
    mkdir -p pdf_collection

    # copy_tree SRC DST: merge the contents of SRC into DST, best effort.
    # Used instead of "cp --parents", which recreated the full source path
    # under the destination for parent-directory resources.
    copy_tree() {
      mkdir -p "$2"
      cp -R "$1/." "$2/" 2>/dev/null || true
    }

    # Snapshot the list first so files staged below are never picked up.
    mapfile -d '' -t pdf_files < <(
      find . -name "*.pdf" -type f \
        -not -path "./.*" \
        -not -path "*/node_modules/*" \
        -not -path "./pdf_collection/*" \
        -not -path "*/resource/*" \
        -print0
    )

    for pdf_file in "${pdf_files[@]}"; do
      pdf_file="${pdf_file#./}"
      [ -z "$pdf_file" ] && continue
      dir_name=$(dirname "$pdf_file")
      mkdir -p "pdf_collection/$dir_name"
      cp "$pdf_file" "pdf_collection/$pdf_file"

      # Resources that sit next to the PDF.
      if [ -d "$dir_name/resource" ]; then
        copy_tree "$dir_name/resource" "pdf_collection/$dir_name/resource"
      fi

      # Walk up at most three levels and take the first ancestor "resource"
      # directory, falling back to the repository-level one at the root.
      current_dir="$dir_name"
      for _ in 1 2 3; do
        parent_dir=$(dirname "$current_dir")
        if [ "$parent_dir" = "." ]; then
          if [ -d "./resource" ]; then
            copy_tree "./resource" "pdf_collection/$dir_name/resource"
          fi
          break
        elif [ -d "$parent_dir/resource" ]; then
          copy_tree "$parent_dir/resource" "pdf_collection/$dir_name/resource"
          break
        fi
        current_dir="$parent_dir"
      done

      # Ship the repository-level resources once at the archive root,
      # keeping their directory structure.
      if [ -d "./resource" ] && [ ! -d "pdf_collection/resource" ]; then
        copy_tree "./resource" "pdf_collection/resource"
      fi
    done

    cd pdf_collection
    zip -r "../${REPO_NAME}-${TAG_NAME}.zip" . >/dev/null
    cd ..
    rm -rf pdf_collection
# Publishes all PDFs and resource directories as a workflow artifact that is
# kept for seven days.
- name: "Upload PDF artifacts"
  uses: actions/upload-artifact@v4
  with:
    name: pdf-documents
    retention-days: 7
    path: |
      **/*.pdf
      **/resource/
# Tag builds only: creates a release named after the tag and attaches the
# archive <repository>-<tag>.zip.
- name: "Create GitHub Release"
  if: startsWith(github.ref, 'refs/tags/')
  id: create_release
  # v2 is the current major version of the action; v1 is outdated. The
  # inputs used here are unchanged.
  uses: softprops/action-gh-release@v2
  with:
    tag_name: ${{ github.ref_name }}
    name: "${{ github.ref_name }}"
    files: |
      ${{ github.event.repository.name }}-${{ github.ref_name }}.zip
    # Fail instead of publishing a release without its archive.
    fail_on_unmatched_files: true
    draft: false
    prerelease: false
    generate_release_notes: true