Markdown to PDF Converter with Tiled Watermark

first commit17 #20

Workflow file for this run

.github/workflows/markdown_to_pdf.yml at b017825

	name: "Markdown to PDF Converter with Tiled Watermark"

	# Run on pushes and pull requests targeting main or v* branches, on any tag
	# push, and on manual dispatch.
	on:
	  push:
	    branches: ["main", "v*"]
	    tags: ["*"]
	  pull_request:
	    branches: ["main", "v*"]
	  workflow_dispatch:

	# NOTE(review): no step in this workflow appears to deploy to Pages or
	# request an OIDC token; confirm whether `pages` and `id-token` are needed.
	permissions:
	  contents: write
	  pages: write
	  id-token: write

	# Single job: clean the Markdown sources, render them to PDF with
	# pandoc/XeLaTeX, then publish the results.
	jobs:
	  convert-md-to-pdf:
	    runs-on: ubuntu-latest

	    steps:
	- name: "Checkout repository"
	  uses: actions/checkout@v4
	  with:
	    # 0 fetches the complete history instead of a shallow clone.
	    fetch-depth: 0

	- name: "Install dependencies"
	  run: |
	    # pandoc + XeLaTeX toolchain, CJK language support and fonts, and a few
	    # command-line utilities.
	    sudo apt-get update
	    sudo apt-get install -y \
	      pandoc \
	      texlive-xetex \
	      texlive-latex-extra \
	      texlive-fonts-recommended \
	      texlive-latex-recommended \
	      texlive-lang-chinese \
	      fonts-noto-cjk \
	      fonts-noto-cjk-extra \
	      fonts-wqy-microhei \
	      fonts-wqy-zenhei \
	      lmodern \
	      tree \
	      jq \
	      python3-pip \
	      python3-pygments

	    # NOTE(review): the cleaner script generated in the next step imports
	    # only os, re and sys; confirm these two packages are still required.
	    pip3 install beautifulsoup4 markdown-it-py

	- name: "Clean HTML to Markdown converter with fixed link handling"
	  run: |
	    # Write the cleaner to disk. The quoted 'EOF' delimiter stops the shell
	    # from expanding anything inside the heredoc.
	    cat > convert_html_clean.py << 'EOF'
	import os
	import re
	import sys

	def protect_code_blocks(content):
	    """Replace fenced code, inline code and block quotes with placeholders.

	    The three kinds of span are stashed in that order, so a quote block that
	    is captured last may itself contain inline-code or fenced-code
	    placeholders.

	    Args:
	        content: Markdown text.

	    Returns:
	        A 4-tuple ``(protected_content, code_blocks, inline_code_blocks,
	        quote_blocks)``. Each list holds the original text of the spans that
	        were replaced; item ``i`` corresponds to the placeholder
	        ``@@@CODE_BLOCK_i@@@``, ``@@@INLINE_CODE_i@@@`` or
	        ``@@@QUOTE_BLOCK_i@@@`` respectively.
	    """
	    code_blocks = []
	    inline_code_blocks = []
	    quote_blocks = []

	    def _stash_into(store, label):
	        # Build a re.sub callback that records the match and returns its placeholder.
	        def _replace(match):
	            store.append(match.group(0))
	            return f"@@@{label}_{len(store) - 1}@@@"
	        return _replace

	    # Fenced block: opening fence with an optional language tag of any
	    # length, a lazily matched body of any length, then a closing fence on
	    # its own line.
	    code_pattern = re.compile(r'```(?:[a-zA-Z0-9_+-]*\n)?.*?\n```', re.DOTALL)
	    protected_content = code_pattern.sub(_stash_into(code_blocks, "CODE_BLOCK"), content)

	    # Inline code: a single-line backtick span.
	    inline_pattern = re.compile(r'`[^`\n]+`')
	    protected_content = inline_pattern.sub(
	        _stash_into(inline_code_blocks, "INLINE_CODE"), protected_content)

	    # Block quote: one or more consecutive lines starting with '>'.
	    quote_pattern = re.compile(r'^(?:>.*\n?)+', re.MULTILINE)
	    protected_content = quote_pattern.sub(
	        _stash_into(quote_blocks, "QUOTE_BLOCK"), protected_content)

	    return protected_content, code_blocks, inline_code_blocks, quote_blocks

	def restore_code_blocks(content, code_blocks, inline_code_blocks, quote_blocks):
	    """Put back the spans that protect_code_blocks replaced with placeholders.

	    Restoration runs in the reverse of the protection order (quotes, then
	    inline code, then fenced code). A quote block is captured after inline
	    and fenced code have already been replaced, so its saved text can
	    contain those placeholders; restoring quotes first brings them back into
	    the text so the later passes can expand them.

	    Args:
	        content: Text containing ``@@@..._i@@@`` placeholders.
	        code_blocks: Saved fenced code blocks, indexed by placeholder number.
	        inline_code_blocks: Saved inline code spans, indexed likewise.
	        quote_blocks: Saved block quotes, indexed likewise.

	    Returns:
	        The text with every known placeholder replaced by its saved span.
	    """
	    restored_content = content

	    restore_plan = (
	        ("QUOTE_BLOCK", quote_blocks),
	        ("INLINE_CODE", inline_code_blocks),
	        ("CODE_BLOCK", code_blocks),
	    )
	    for label, saved_spans in restore_plan:
	        for index, original_text in enumerate(saved_spans):
	            restored_content = restored_content.replace(
	                f"@@@{label}_{index}@@@", original_text)

	    return restored_content

	def process_images_in_content(content):
	    """Normalise images on every line that is not part of a pipe table.

	    A line whose stripped form starts with '|' and contains a second '|'
	    switches table mode on; a blank line switches it off. While table mode
	    is on, lines are passed through unchanged. All other lines go through
	    process_non_table_images.

	    Args:
	        content: Markdown text.

	    Returns:
	        The text with non-table lines rewritten, joined with newlines.
	    """
	    processed_lines = []
	    in_table = False

	    for line in content.split('\n'):
	        stripped = line.strip()
	        if stripped.startswith('|') and '|' in stripped[1:]:
	            in_table = True
	        elif stripped == '':
	            in_table = False
	        elif not in_table:
	            line = process_non_table_images(line)
	        processed_lines.append(line)

	    return '\n'.join(processed_lines)

	def process_non_table_images(line):
	    """Rewrite the images on one line as stand-alone Markdown images.

	    Steps, in order:
	      1. ``<img ...>`` tags become ``![alt](src)`` surrounded by newlines; a
	         leading ``./`` or ``/`` is removed from ``src``.
	      2. Every Markdown image (including those produced by step 1) is
	         surrounded by newlines.
	      3. ``{width=...}`` attribute blocks, with any whitespace before them,
	         are removed.

	    Args:
	        line: A single line of Markdown.

	    Returns:
	        The rewritten line, which may now contain embedded newlines.
	    """
	    img_pattern = re.compile(r'<img\s+([^>]+)>', re.IGNORECASE)

	    def replace_img_tag(match):
	        attrs_str = match.group(1)

	        # Allow any amount of whitespace (including none) around '='.
	        src_match = re.search(r'src\s*=\s*["\']([^"\']+)["\']', attrs_str)
	        alt_match = re.search(r'alt\s*=\s*["\']([^"\']*)["\']', attrs_str)

	        src = src_match.group(1) if src_match else ''
	        alt = alt_match.group(1) if alt_match else ''

	        if src:
	            src = re.sub(r'^\./', '', src)
	            src = re.sub(r'^/', '', src)

	        return f'\n![{alt}]({src})\n'

	    line = img_pattern.sub(replace_img_tag, line)

	    # ![alt](target): the parentheses are literal and must be escaped.
	    md_img_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

	    def ensure_img_newlines(match):
	        alt = match.group(1)
	        src = match.group(2)
	        return f'\n![{alt}]({src})\n'

	    line = md_img_pattern.sub(ensure_img_newlines, line)

	    line = re.sub(r'\s*\{width=[^}]+\}', '', line)

	    return line

	def remove_image_labels_and_figures(content):
	    """Drop figure-caption lines and strip ``<center>`` tags.

	    A caption line is one whose stripped text starts with 图 or 图片,
	    optionally followed by a number, then an ASCII or full-width colon
	    (for example ``图1: ...``, ``图片2：...``, ``图 3 : ...``), either bare
	    or wrapped in ``<center>...</center>``.

	    Behaviour per line:
	      * a line containing a Markdown image is kept unchanged, and up to
	        three caption lines directly after it are dropped;
	      * a caption line anywhere else is dropped;
	      * a pipe-table row is kept unchanged;
	      * any other line is kept with ``<center>`` / ``</center>`` removed.

	    Args:
	        content: Markdown text.

	    Returns:
	        The filtered text, joined with newlines.
	    """
	    image_re = re.compile(r'!\[.*\]\(.*\)')
	    caption_re = re.compile(r'^图片?\s*\d*\s*[:：].*$')
	    centered_caption_re = re.compile(
	        r'^<center>\s*图片?\s*\d*\s*[:：].*</center>$', re.IGNORECASE)

	    lines = content.split('\n')
	    processed_lines = []

	    i = 0
	    while i < len(lines):
	        line = lines[i]
	        stripped = line.strip()
	        i += 1

	        if image_re.search(line):
	            processed_lines.append(line)
	            # Consume consecutive caption lines one at a time, so no line is
	            # skipped unchecked and the index never runs past the end.
	            skipped = 0
	            while i < len(lines) and skipped < 3 and caption_re.match(lines[i].strip()):
	                i += 1
	                skipped += 1
	            continue

	        if caption_re.match(stripped) or centered_caption_re.match(stripped):
	            continue

	        if stripped.startswith('|') and '|' in stripped[1:]:
	            processed_lines.append(line)
	            continue

	        processed_lines.append(re.sub(r'</?center>', '', line, flags=re.IGNORECASE))

	    return '\n'.join(processed_lines)

	def fix_tilde_in_text(content):
	    """Replace ``~`` between two digits with ``-`` outside of code.

	    Fenced code (```` ``` ... ``` ````) and single-line inline code spans
	    are swapped for placeholders before the substitution and put back
	    afterwards, so tildes inside code are left alone.

	    Args:
	        content: Markdown text.

	    Returns:
	        The text with digit~digit rewritten as digit-digit in prose only.
	    """
	    code_blocks = []
	    inline_blocks = []

	    def replace_code(match):
	        code_blocks.append(match.group(0))
	        return f"@@@CODE_BLOCK_PROTECT_{len(code_blocks)-1}@@@"

	    def replace_inline(match):
	        inline_blocks.append(match.group(0))
	        return f"@@@INLINE_CODE_PROTECT_{len(inline_blocks)-1}@@@"

	    protected_content = re.sub(r'```.*?```', replace_code, content, flags=re.DOTALL)
	    protected_content = re.sub(r'`[^`\n]+`', replace_inline, protected_content)

	    protected_content = re.sub(r'(?<=\d)~(?=\d)', r'-', protected_content)

	    # Restore in reverse order of protection: an inline span is captured
	    # after fences were replaced, so it may contain a fence placeholder that
	    # only becomes visible once the inline span is back in the text.
	    for i, inline_block in enumerate(inline_blocks):
	        protected_content = protected_content.replace(
	            f"@@@INLINE_CODE_PROTECT_{i}@@@", inline_block)

	    for i, code_block in enumerate(code_blocks):
	        protected_content = protected_content.replace(
	            f"@@@CODE_BLOCK_PROTECT_{i}@@@", code_block)

	    return protected_content

	def preserve_markdown_links(content):
	    """Convert HTML anchors and ``<http...>`` autolinks to Markdown links.

	    * ``<a ... href="URL" ...>text</a>`` becomes ``[text](URL)``. For URLs
	      that do not start with ``http://``, ``https://`` or ``mailto:``, a
	      leading ``./`` or ``/`` is removed. An anchor without an ``href`` is
	      reduced to its text.
	    * ``<http://...>`` / ``<https://...>`` becomes ``[URL](URL)``.

	    Existing Markdown links are left untouched.

	    Args:
	        content: Markdown text, possibly containing inline HTML.

	    Returns:
	        The text with the links rewritten.
	    """
	    html_link_pattern = re.compile(r'<a\s+([^>]+)>(.*?)</a>', re.IGNORECASE | re.DOTALL)
	    special_link_pattern = re.compile(r'<(https?://[^>]+)>')

	    def replace_html_link(match):
	        attrs_str = match.group(1)
	        link_text = match.group(2).strip()

	        # Case-insensitive like the tag pattern; whitespace around '=' is optional.
	        href_match = re.search(
	            r'href\s*=\s*["\']([^"\']+)["\']', attrs_str, re.IGNORECASE)
	        if not href_match:
	            return link_text

	        href = href_match.group(1)
	        if not href.startswith(('http://', 'https://', 'mailto:')):
	            href = re.sub(r'^\./', '', href)
	            href = re.sub(r'^/', '', href)
	        return f'[{link_text}]({href})'

	    def replace_special_link(match):
	        url = match.group(1)
	        return f'[{url}]({url})'

	    protected_content = html_link_pattern.sub(replace_html_link, content)
	    protected_content = special_link_pattern.sub(replace_special_link, protected_content)

	    return protected_content

	def fix_special_characters(content):
	    """Rewrite a few separator characters as hyphens.

	    * ``（0,100）`` and ``（0.100）`` become ``（0-100）``.
	    * A colon directly between two digits becomes ``-``. Lookarounds are
	      used so the digits are not consumed and chained values such as
	      ``12:30:45`` are converted at every colon.
	    * Inside full-width parentheses, ``N.MM`` / ``N,MMM`` (two or three
	      digits after the separator) becomes ``N-MM`` / ``N-MMM``.

	    Args:
	        content: Markdown text.

	    Returns:
	        The text with the substitutions applied.
	    """
	    content = content.replace('（0,100）', '（0-100）')
	    content = content.replace('（0.100）', '（0-100）')
	    content = re.sub(r'(?<=\d):(?=\d)', '-', content)
	    content = re.sub(r'（(\d+)[.,](\d{2,3})）', r'（\1-\2）', content)
	    return content

	def fix_text_code_blocks(content):
	    """Hard-wrap over-long lines inside ```` ```text ```` fenced blocks.

	    Each line of such a block has trailing whitespace removed. A line longer
	    than 60 characters is then cut into consecutive 50-character pieces;
	    shorter lines are kept as they are. Blocks with any other language tag
	    are not touched.

	    Args:
	        content: Markdown text.

	    Returns:
	        The text with every ``text`` block re-wrapped.
	    """
	    def _rewrap(match):
	        wrapped = []
	        for raw_line in match.group(1).split('\n'):
	            trimmed = raw_line.rstrip()
	            if len(trimmed) <= 60:
	                wrapped.append(trimmed)
	                continue
	            wrapped.extend(
	                trimmed[start:start + 50] for start in range(0, len(trimmed), 50))

	        body = '\n'.join(wrapped)
	        return f'```text\n{body}\n```'

	    return re.sub(r'```text\n(.*?)\n```', _rewrap, content, flags=re.DOTALL)

	def process_markdown_file(filepath):
	    """Clean one Markdown file in place.

	    Pipeline: fix special characters, re-wrap ``text`` code blocks, protect
	    code and quotes behind placeholders, convert HTML links, normalise
	    images, remove figure captions, fix digit~digit tildes, strip the
	    remaining HTML tags, restore the protected spans, and write the result
	    back to the same path.

	    Args:
	        filepath: Path of the Markdown file to rewrite.

	    Returns:
	        True when the file was rewritten; False when any step raised, in
	        which case the error is reported on stderr.
	    """
	    try:
	        # Undecodable bytes are dropped rather than aborting the file.
	        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
	            content = f.read()

	        # NOTE(review): these two run before code is protected, so the
	        # digit:digit rewrite also reaches code blocks - confirm intended.
	        content = fix_special_characters(content)
	        content = fix_text_code_blocks(content)

	        protected_content, code_blocks, inline_code_blocks, quote_blocks = protect_code_blocks(content)

	        protected_content = preserve_markdown_links(protected_content)

	        processed_content = process_images_in_content(protected_content)

	        processed_content = remove_image_labels_and_figures(processed_content)

	        processed_content = fix_tilde_in_text(processed_content)

	        # Strip HTML tags, except a bare <code>, <pre>, <span>, <div>, <p>
	        # or <br> (opening or closing). Only text that looks like a tag is
	        # matched: '<', an optional '/', a letter, and no further '<' before
	        # the closing '>'. Prose such as "a < b ... c > d" and the @@@
	        # placeholders therefore survive.
	        processed_content = re.sub(
	            r'<(?!/?(?:code|pre|span|div|p|br)>)/?[A-Za-z][^<>]*>',
	            '',
	            processed_content)

	        final_content = restore_code_blocks(processed_content, code_blocks, inline_code_blocks, quote_blocks)

	        with open(filepath, 'w', encoding='utf-8') as f:
	            f.write(final_content)

	        return True
	    except Exception as e:
	        # Best effort: one unreadable file must not stop the others, but the
	        # failure should be visible in the job log.
	        print(f"Failed to clean {filepath}: {e}", file=sys.stderr)
	        return False

	# Walk the working directory and clean every .md file, skipping VCS,
	# tooling and output directories.
	skip_dirs = {'.git', 'pdf_output', '__pycache__', '.github', 'node_modules', 'output'}

	processed = 0
	failed = 0
	for root, dirs, files in os.walk('.'):
	    # Prune in place so os.walk does not descend into skipped directories.
	    dirs[:] = [d for d in dirs if d not in skip_dirs]

	    for file in files:
	        if not file.endswith('.md'):
	            continue
	        filepath = os.path.join(root, file)
	        if process_markdown_file(filepath):
	            processed += 1
	        else:
	            failed += 1

	# Report the outcome; failures stay non-fatal, as before.
	print(f"Cleaned {processed} Markdown file(s); {failed} failed")
	EOF

	# Run the script written by the heredoc; it rewrites the .md files in place.
	python3 convert_html_clean.py

	- name: "Create LaTeX template"
	  run: |
	    # header.tex is handed to pandoc with --include-in-header in the
	    # conversion step. The quoted 'EOF' keeps backslashes and '$' literal.
	    cat > header.tex << 'EOF'
	% CJK support: load xeCJK and ctex, then select the Noto CJK SC families
	% for serif, sans-serif and monospaced CJK text.
	\usepackage{xeCJK}
	\usepackage[UTF8]{ctex}
	\setCJKmainfont{Noto Serif CJK SC}
	\setCJKsansfont{Noto Sans CJK SC}
	\setCJKmonofont{Noto Sans Mono CJK SC}

	% 2 cm margin on every side of the page.
	\usepackage[margin=2.0cm]{geometry}

	% Coloured links with no border box.
	% NOTE(review): hyperref colour keys usually take a colour name or the
	% [rgb]{...} form; confirm that this {rgb}{...} spelling is accepted.
	\usepackage{hyperref}
	\hypersetup{
	colorlinks=true,
	linkcolor={rgb}{0.03,0.4,0.84},
	urlcolor={rgb}{0.03,0.4,0.84},
	citecolor={rgb}{0.03,0.4,0.84},
	pdfborder={0 0 0},
	}

	% Code highlighting, graphics, float control and framed boxes.
	\usepackage{minted}
	\usepackage{graphicx}
	\usepackage{float}
	\usepackage{placeins}
	\usepackage{mdframed}
	\usepackage{xspace}

	% Figures and subfigures print neither a label nor caption text.
	\usepackage{caption}
	\captionsetup[figure]{labelformat=empty, textformat=empty}
	\usepackage{subcaption}
	\captionsetup[subfigure]{labelformat=empty, textformat=empty}

	% Define colors
	\definecolor{codebg}{rgb}{0.98,0.98,0.98}
	\definecolor{framecolor}{rgb}{0.85,0.85,0.85}

	% Configure minted style
	\setminted{
	style=tango,
	bgcolor=codebg,
	frame=single,
	framesep=6pt,
	framerule=0.8pt,
	rulecolor=framecolor,
	breaklines=true,
	breakanywhere=true,
	tabsize=2,
	fontsize=\small,
	xleftmargin=10pt,
	xrightmargin=10pt,
	numbers=none,
	autogobble,
	}

	% Place figures exactly where they appear in the source (H placement).
	% float was already loaded earlier in this header; the repeat is harmless.
	\usepackage{float}
	\floatplacement{figure}{H}

	% Table rules: line thickness and the space above/below each rule.
	\usepackage{booktabs}
	\setlength{\heavyrulewidth}{0.08em}
	\setlength{\lightrulewidth}{0.05em}
	\setlength{\aboverulesep}{0.2em}
	\setlength{\belowrulesep}{0.2em}

	% Full-width horizontal rule, 0.5 mm thick.
	\newcommand{\HRule}{\rule{\linewidth}{0.5mm}}

	% Paragraphs: no first-line indent, 0.5em between paragraphs, tight lists.
	\usepackage{parskip}
	\setlength{\parindent}{0pt}
	\setlength{\parskip}{0.5em}
	\setlength{\itemsep}{0pt}
	\setlength{\topsep}{0pt}

	% Headings: bold sans-serif in three sizes, with explicit spacing before
	% and after each level.
	\usepackage{titlesec}
	\titleformat{\section}{\Large\bfseries\sffamily}{\thesection}{1em}{}
	\titleformat{\subsection}{\large\bfseries\sffamily}{\thesubsection}{1em}{}
	\titleformat{\subsubsection}{\normalsize\bfseries\sffamily}{\thesubsubsection}{1em}{}
	\titlespacing*{\section}{0pt}{12pt}{6pt}
	\titlespacing*{\subsection}{0pt}{8pt}{4pt}
	\titlespacing*{\subsubsection}{0pt}{6pt}{2pt}

	% Use the sans-serif family as the document default.
	\renewcommand{\familydefault}{\sfdefault}

	% Relax line breaking to avoid overfull lines.
	\sloppy
	\emergencystretch=3em

	% Tiled watermark: a 3 x 4 grid of rotated, semi-transparent "emakefun"
	% labels drawn behind the content of every page.
	\usepackage{eso-pic}
	\usepackage{tikz}
	\usetikzlibrary{calc}

	\newcommand\BackgroundPicture{%
	  \begin{tikzpicture}[remember picture,overlay]
	    \coordinate (page center) at (current page.center);

	    % Grid size and the distance between neighbouring labels. The step
	    % macros have their own names: defining them as \hspace and \vspace
	    % would overwrite those LaTeX commands for everything typeset inside
	    % this picture.
	    \pgfmathsetmacro{\hcount}{3}
	    \pgfmathsetmacro{\vcount}{4}
	    \pgfmathsetmacro{\wmhstep}{\paperwidth/(\hcount+1)}
	    \pgfmathsetmacro{\wmvstep}{\paperheight/(\vcount+1)}

	    \foreach \i in {1,...,\hcount} {
	      \foreach \j in {1,...,\vcount} {
	        % Offset of this label from the page centre, in points.
	        \pgfmathsetmacro{\x}{-\paperwidth/2 + \i*\wmhstep}
	        \pgfmathsetmacro{\y}{-\paperheight/2 + \j*\wmvstep}

	        \node at ($(page center) + (\x pt, \y pt)$) [
	          rotate=45,
	          scale=1.5,
	          text=gray!30,
	          opacity=0.45,
	          font=\sffamily\bfseries\large
	        ] {emakefun};
	      }
	    }
	  \end{tikzpicture}%
	}

	% Draw the watermark in the background layer of every shipped-out page.
	\AddToShipoutPictureBG{\BackgroundPicture}

	% Look for images in the working directory and in ./resource/.
	\graphicspath{{./}{./resource/}}

	% \maxwidth: the image's natural width, capped at the current line width.
	\makeatletter
	\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
	\makeatother

	% Wrap \includegraphics so every image is requested centred and with
	% width=\maxwidth appended after any caller-supplied options.
	\let\oldincludegraphics\includegraphics
	\renewcommand{\includegraphics}[2][]{%
	\begingroup
	\centering
	\oldincludegraphics[#1,width=\maxwidth]{#2}%
	\endgroup
	}

	% Table of contents: set the depth counter to -1.
	\usepackage{titletoc}
	\setcounter{tocdepth}{-1}

	\usepackage{fvextra}
	\usepackage{csquotes}

	% quotebox: a rounded, lightly shaded frame for block quotes.
	\definecolor{quotebg}{RGB}{246,248,250}
	\definecolor{quoteframe}{RGB}{225,228,232}
	\newmdenv[
	backgroundcolor=quotebg,
	linecolor=quoteframe,
	linewidth=0.5pt,
	roundcorner=6pt,
	innerleftmargin=12pt,
	innerrightmargin=12pt,
	innertopmargin=8pt,
	innerbottommargin=8pt,
	skipabove=6pt,
	skipbelow=6pt,
	]{quotebox}

	% \quoteblock{text}: typeset its argument inside a quotebox.
	\newcommand{\quoteblock}[1]{%
	\begin{quotebox}
	#1
	\end{quotebox}
	}
	EOF

	- name: "Create Pandoc filter"
	  run: |
	    # NOTE(review): the pandoc command in the conversion step does not pass
	    # --lua-filter, so this file appears to be unused - confirm.
	    cat > quote-filter.lua << 'EOF'
	-- Block handler.
	--   HorizontalRule -> raw LaTeX: \HRule with 0.5em of space above and below.
	--   BlockQuote     -> each child block rendered to LaTeX and wrapped in
	--                     the quotebox environment.
	--   anything else  -> returned unchanged.
	function Block(el)
	  if el.t == "HorizontalRule" then
	    return pandoc.RawBlock('latex', '\\vspace{0.5em}\\HRule\\vspace{0.5em}')
	  end

	  if el.t ~= "BlockQuote" then
	    return el
	  end

	  local rendered = {}
	  for _, child in ipairs(el.content) do
	    rendered[#rendered + 1] = pandoc.write(pandoc.Pandoc({child}), "latex")
	  end

	  return pandoc.RawBlock(
	    'latex',
	    '\\begin{quotebox}\n' .. table.concat(rendered) .. '\\end{quotebox}')
	end

	-- Code blocks whose first class is "text" are emitted as a raw LaTeX
	-- Verbatim environment with line breaking enabled; all other code blocks
	-- are returned unchanged.
	-- Verbatim with breaklines comes from fvextra, which header.tex loads. The
	-- listings package and a "textstyle" listings style are not set up in
	-- header.tex, so an lstlisting[style=textstyle] block would not compile.
	function CodeBlock(el)
	  if el.classes[1] == "text" then
	    return pandoc.RawBlock('latex',
	      '\\begin{Verbatim}[breaklines=true]\n' ..
	      el.text ..
	      '\n\\end{Verbatim}')
	  end
	  return el
	end

	-- Rewrite numeric ranges written inside full-width parentheses so they use
	-- a hyphen: （0,100） and （0.100） become （0-100）, and （N.MM） /
	-- （N,MMM） (two or three digits after the separator) become （N-MM） /
	-- （N-MMM）. Returns a new Str element with the rewritten text.
	function Str(el)
	  local text = el.text
	  text = text:gsub('（0,100）', '（0-100）')
	  text = text:gsub('（0%.100）', '（0-100）')
	  -- The second capture must be closed before the closing parenthesis
	  -- character; an unclosed capture makes gsub raise a pattern error.
	  text = text:gsub('（(%d+)[.,](%d%d%d?)）', '（%1-%2）')
	  return pandoc.Str(text)
	end

	-- Inline code: rebuild the element from its text alone.
	function Code(el)
	  local plain = pandoc.Code(el.text)
	  return plain
	end

	-- Images: replace the caption with an empty list and return the element.
	function Image(el)
	el.caption = {}
	return el
	end
	EOF

	- name: "Convert Markdown to PDF"
	  run: |
	    converted_count=0
	    failed_count=0

	    CURRENT_DATE=$(date '+%Y-%m-%d')

	    # Convert every *.md outside hidden directories, node_modules and
	    # ./output. Paths are read one per line so names containing spaces or
	    # glob characters survive intact.
	    while IFS= read -r md_file; do
	      [ -z "$md_file" ] && continue

	      dir_name=$(dirname "$md_file")
	      base_name=$(basename "$md_file" .md)
	      pdf_file="$dir_name/$base_name.pdf"

	      # Image search path: the document's directory, a sibling resource/
	      # directory, the repository root and the root resource/ directory.
	      resource_paths="$dir_name:$dir_name/../resource:.:./resource"

	      # One failing document must not abort the loop, so errexit is
	      # suspended around pandoc and its status is checked by hand.
	      set +e

	      # stderr is routed to the job log and stdout is discarded; stdin is
	      # closed so nothing can consume the file list feeding this loop.
	      pandoc "$md_file" \
	        --resource-path="$resource_paths" \
	        -o "$pdf_file" \
	        --pdf-engine=xelatex \
	        --pdf-engine-opt=-shell-escape \
	        --highlight-style=pygments \
	        --include-in-header=header.tex \
	        -V mainfont="Noto Sans CJK SC" \
	        -V sansfont="Noto Sans CJK SC" \
	        -V monofont="Noto Sans Mono CJK SC" \
	        -V geometry:margin=2.0cm \
	        -V geometry:a4paper \
	        -V colorlinks=true \
	        -V linkcolor=blue \
	        -V urlcolor=blue \
	        -V graphics \
	        --wrap=auto \
	        -f markdown \
	        -M date="$CURRENT_DATE" \
	        </dev/null 2>&1 >/dev/null

	      conversion_status=$?

	      set -e

	      if [ "$conversion_status" -eq 0 ] && [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
	        converted_count=$((converted_count + 1))
	      else
	        failed_count=$((failed_count + 1))
	        echo "::warning::pandoc failed to produce $pdf_file (exit status $conversion_status)"
	      fi
	    done < <(
	      find . -name "*.md" \
	        -not -path "./.*" \
	        -not -path "*/node_modules/*" \
	        -not -path "./output/*" \
	        | sed 's/^\.\///'
	    )

	    echo "Converted: $converted_count, failed: $failed_count"

	    # Fail the step only when every attempted conversion failed.
	    if [ "$converted_count" -eq 0 ] && [ "$failed_count" -gt 0 ]; then
	      echo "::error::No Markdown file could be converted to PDF"
	      exit 1
	    fi

	- name: "Copy resource directories"
	  run: |
	    # Copy the contents of directory $1 into directory $2 (best effort).
	    # rsync leaves out nested resource/ directories; the cp fallback copies
	    # everything and relies on the clean-up at the end of the loop body.
	    copy_resources() {
	      mkdir -p "$2"
	      if command -v rsync >/dev/null 2>&1; then
	        rsync -a --exclude='resource/' "$1/" "$2/" >/dev/null 2>&1 || true
	      else
	        cp -R "$1/." "$2/" 2>/dev/null || true
	      fi
	    }

	    # Visit each directory that holds a generated PDF. Hidden directories,
	    # node_modules and anything inside a resource/ directory are ignored.
	    while IFS= read -r pdf_dir; do
	      [ -z "$pdf_dir" ] && continue

	      # A directory that already has its own resource/ needs no copy.
	      if [ -d "$pdf_dir/resource" ]; then
	        continue
	      fi

	      # Start from the repository-level resource/ directory ...
	      if [ -d "./resource" ] && [ "$pdf_dir" != "." ]; then
	        copy_resources "./resource" "$pdf_dir/resource"
	      fi

	      # ... then layer the parent directory's resource/ on top of it.
	      parent_dir=$(dirname "$pdf_dir")
	      if [ -d "$parent_dir/resource" ] && [ "$parent_dir" != "." ]; then
	        copy_resources "$parent_dir/resource" "$pdf_dir/resource"
	      fi

	      if [ -d "$pdf_dir/resource/resource" ]; then
	        rm -rf "$pdf_dir/resource/resource"
	      fi
	    done < <(
	      find . -name "*.pdf" -type f \
	        -not -path "./.*" \
	        -not -path "*/node_modules/*" \
	        -not -path "*/resource/*" \
	        -exec dirname {} \; | sort -u
	    )

	- name: "Verify PDF files"
	  run: |
	    # Count generated PDFs outside hidden directories and node_modules.
	    total_pdfs=$(find . -name "*.pdf" -type f -not -path "./.*" -not -path "*/node_modules/*" 2>/dev/null | wc -l)
	    echo "Found $total_pdfs PDF file(s)"

	    if [ "$total_pdfs" -eq 0 ]; then
	      echo "::error::No PDF files were generated"
	      exit 1
	    fi

	- name: "Create zip archive for release"
	  if: startsWith(github.ref, 'refs/tags/')
	  run: |
	    TAG_NAME="${GITHUB_REF#refs/tags/}"

	    mkdir -p pdf_collection

	    # All generated PDFs, as paths relative to the repository root.
	    PDF_FILES=$(find . -name "*.pdf" -type f \
	      -not -path "./.*" \
	      -not -path "*/node_modules/*" \
	      -not -path "./pdf_collection/*" \
	      -not -path "*/resource/*" | sed 's/^\.\///')

	    echo "$PDF_FILES" | while IFS= read -r pdf_file; do
	      [ -z "$pdf_file" ] && continue

	      dir_name=$(dirname "$pdf_file")

	      mkdir -p "pdf_collection/$dir_name"

	      cp "$pdf_file" "pdf_collection/$pdf_file"

	      # Resources that sit next to the PDF keep their relative path.
	      if [ -d "$dir_name/resource" ]; then
	        mkdir -p "pdf_collection/$dir_name/resource"
	        find "$dir_name/resource" -type f \
	          -exec cp --parents {} "pdf_collection/" \; 2>/dev/null || true
	      fi

	      # Walk up at most three levels looking for a resource/ directory to
	      # bundle with this PDF; stop at the first one found or at the root.
	      current_dir="$dir_name"
	      for i in 1 2 3; do
	        parent_dir=$(dirname "$current_dir")
	        if [ "$parent_dir" = "." ]; then
	          if [ -d "./resource" ]; then
	            mkdir -p "pdf_collection/$dir_name/resource"
	            find "./resource" -type f \
	              -exec cp --parents {} "pdf_collection/$dir_name/" \; 2>/dev/null || true
	          fi
	          break
	        elif [ -d "$parent_dir/resource" ]; then
	          mkdir -p "pdf_collection/$dir_name/resource"
	          find "$parent_dir/resource" -type f \
	            -exec cp --parents {} "pdf_collection/$dir_name/" \; 2>/dev/null || true
	          break
	        fi
	        current_dir="$parent_dir"
	      done

	      # Also keep one flat copy of the root resources at the archive root.
	      if [ -d "./resource" ]; then
	        if [ ! -d "pdf_collection/resource" ]; then
	          mkdir -p "pdf_collection/resource"
	          find "./resource" -type f \
	            -exec cp {} "pdf_collection/resource/" \; 2>/dev/null || true
	        fi
	      fi
	    done

	    # Zip from inside the staging directory so archive paths are relative to
	    # it; the subshell leaves the working directory of the step untouched.
	    (cd pdf_collection && zip -r "../${{ github.event.repository.name }}-$TAG_NAME.zip" . >/dev/null)

	    rm -rf pdf_collection

	- name: "Upload PDF artifacts"
	  uses: actions/upload-artifact@v4
	  with:
	    name: pdf-documents
	    # Every PDF at any depth, plus every resource/ directory.
	    path: |
	      **/*.pdf
	      **/resource/
	    retention-days: 7

	- name: "Create GitHub Release"
	  if: startsWith(github.ref, 'refs/tags/')
	  id: create_release
	  uses: softprops/action-gh-release@v1
	  with:
	    tag_name: "${{ github.ref_name }}"
	    name: "${{ github.ref_name }}"
	    # Same file name that the zip step writes for this tag.
	    files: |
	      ${{ github.event.repository.name }}-${{ github.ref_name }}.zip
	    draft: false
	    prerelease: false
	    generate_release_notes: true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

first commit17 #20

Workflow file

first commit17 #20

Uh oh!

Workflow file for this run