forked from santifer/career-ops
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate-pdf.mjs
More file actions
177 lines (149 loc) · 5.53 KB
/
generate-pdf.mjs
File metadata and controls
177 lines (149 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env node
/**
* generate-pdf.mjs — HTML → PDF via Playwright
*
* Usage:
* node career-ops/generate-pdf.mjs <input.html> <output.pdf> [--format=letter|a4]
*
* Requires: @playwright/test (or playwright) installed.
* Uses Chromium headless to render the HTML and produce a clean, ATS-parseable PDF.
*/
import { chromium } from 'playwright';
import { resolve, dirname } from 'path';
import { readFile, writeFile } from 'fs/promises';
import { fileURLToPath, pathToFileURL } from 'url';
// ES modules have no built-in __dirname; derive this file's directory from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
/**
 * Normalize text for ATS compatibility by converting problematic Unicode.
 *
 * ATS parsers and legacy systems often fail on em-dashes, smart quotes,
 * zero-width characters, and non-breaking spaces. These cause mojibake,
 * parsing errors, or display issues. See issue #1.
 *
 * Only text between tags is rewritten. Left byte-for-byte intact:
 *   - whole <style>…</style> and <script>…</script> elements,
 *   - everything inside a tag, including quoted attribute values that
 *     contain a literal ">",
 *   - HTML comments (<!-- … -->), even when they contain ">" or quotes.
 * A "<" that does not open a tag (not followed by a letter, "/", "!" or "?")
 * is treated as ordinary text, so prose such as "1 < 2 — 3" is still
 * normalized.
 *
 * @param {string} html - HTML source to normalize.
 * @returns {{ html: string, replacements: Object<string, number> }}
 *   The rewritten HTML plus a per-category count of characters replaced,
 *   so the caller can log what was changed.
 */
function normalizeTextForATS(html) {
  const replacements = {};
  const bump = (key, n) => { replacements[key] = (replacements[key] || 0) + n; };

  // [category, pattern, replacement] — applied in this order to every text run.
  const rules = [
    ['em-dash', /\u2014/g, '-'],
    ['en-dash', /\u2013/g, '-'],
    ['smart-double-quote', /[\u201C\u201D\u201E\u201F]/g, '"'],
    ['smart-single-quote', /[\u2018\u2019\u201A\u201B]/g, "'"],
    ['ellipsis', /\u2026/g, '...'],
    ['zero-width', /[\u200B\u200C\u200D\u2060\uFEFF]/g, ''],
    ['nbsp', /\u00A0/g, ' '],
  ];

  // Rewrite one run of body text, counting every character replaced.
  function sanitizeText(text) {
    if (!text) return text;
    return rules.reduce(
      (acc, [key, pattern, replacement]) =>
        acc.replace(pattern, () => { bump(key, 1); return replacement; }),
      text,
    );
  }

  // Characters that may follow "<" when it really starts markup.
  const tagOpener = /[A-Za-z\/!?]/;
  const whitespace = ' \t\n\r\f';

  // Return the index of the ">" that closes the markup starting at `lt`,
  // or -1 when it is never closed.
  function findTagEnd(src, lt) {
    if (src.startsWith('<!--', lt)) {
      // Searching from lt + 2 also accepts the short forms "<!-->" / "<!--->".
      const close = src.indexOf('-->', lt + 2);
      return close === -1 ? -1 : close + 2;
    }
    let quote = '';
    let prev = ''; // last non-whitespace character seen outside a quoted value
    for (let k = lt + 1; k < src.length; k += 1) {
      const ch = src[k];
      if (quote !== '') {
        if (ch === quote) {
          quote = '';
          prev = ch;
        }
        continue;
      }
      if (ch === '>') return k;
      // A quote only opens an attribute value directly after "=";
      // elsewhere (e.g. inside an unquoted value) it is a plain character.
      if ((ch === '"' || ch === "'") && prev === '=') {
        quote = ch;
        continue;
      }
      if (!whitespace.includes(ch)) prev = ch;
    }
    return -1;
  }

  // Swap style/script elements for NUL-delimited placeholders so neither the
  // tag walker nor the sanitizer can touch their contents.
  const masks = [];
  const masked = html.replace(
    /<(style|script)\b[^>]*>[\s\S]*?<\/\1>/gi,
    (match) => {
      const token = `\u0000MASK${masks.length}\u0000`;
      masks.push(match);
      return token;
    }
  );

  let out = '';
  let i = 0;
  while (i < masked.length) {
    const lt = masked.indexOf('<', i);
    if (lt === -1) { out += sanitizeText(masked.slice(i)); break; }
    out += sanitizeText(masked.slice(i, lt));
    if (!tagOpener.test(masked.charAt(lt + 1))) {
      // Stray "<" in prose: keep it and continue scanning the text after it.
      out += '<';
      i = lt + 1;
      continue;
    }
    const gt = findTagEnd(masked, lt);
    // Unterminated markup: emit the remainder untouched rather than guess.
    if (gt === -1) { out += masked.slice(lt); break; }
    out += masked.slice(lt, gt + 1);
    i = gt + 1;
  }

  // A replacer function is used so "$" sequences inside CSS/JS are not
  // interpreted as replacement patterns.
  const restored = out.replace(/\u0000MASK(\d+)\u0000/g, (_, n) => masks[Number(n)]);
  return { html: restored, replacements };
}
/**
 * Split CLI arguments into the two positional paths and the --format flag.
 *
 * @param {string[]} args - Arguments following the script name.
 * @returns {{ inputPath: (string|undefined), outputPath: (string|undefined), format: string }}
 *   `format` is lower-cased and defaults to 'a4'; extra positionals are ignored.
 */
function parseCliArgs(args) {
  const formatFlag = '--format=';
  let inputPath;
  let outputPath;
  let format = 'a4';
  for (const arg of args) {
    if (arg.startsWith(formatFlag)) {
      // Take everything after the prefix so a value containing "=" is not truncated.
      format = arg.slice(formatFlag.length).toLowerCase();
    } else if (!inputPath) {
      inputPath = arg;
    } else if (!outputPath) {
      outputPath = arg;
    }
  }
  return { inputPath, outputPath, format };
}

/**
 * Convert a directory path to a file:// URL that ends with "/".
 *
 * @param {string} dirPath - Directory path.
 * @returns {string} Percent-encoded file URL with a trailing slash.
 */
function toDirectoryFileUrl(dirPath) {
  const href = pathToFileURL(dirPath).href;
  return href.endsWith('/') ? href : `${href}/`;
}

/**
 * Rewrite CSS `url(./fonts/<file>)` references to absolute file:// URLs.
 *
 * Handles single-quoted, double-quoted and unquoted forms and always emits a
 * balanced, single-quoted `url('…')`.
 *
 * @param {string} html - HTML source containing the references.
 * @param {string} fontsDir - Absolute path of the fonts directory.
 * @returns {string} HTML with the font references rewritten.
 */
function absolutizeFontUrls(html, fontsDir) {
  // "'" is legal in a URL path but would terminate the CSS string we emit.
  const fontsBase = toDirectoryFileUrl(fontsDir).replace(/'/g, '%27');
  return html.replace(
    /url\(\s*(['"]?)\.\/fonts\/([^'")]+)\1\s*\)/g,
    (_match, _quote, fontFile) => `url('${fontsBase}${fontFile.trim()}')`,
  );
}

/**
 * Approximate the page count by counting "/Type /Page" objects in the PDF bytes.
 *
 * @param {Buffer} pdfBuffer - Raw PDF bytes.
 * @returns {number} Number of matches (0 when none are found).
 */
function countPdfPages(pdfBuffer) {
  // [^s] excludes the "/Type /Pages" tree node.
  const matches = pdfBuffer.toString('latin1').match(/\/Type\s*\/Page[^s]/g);
  return matches ? matches.length : 0;
}

/**
 * CLI entry point: read an HTML file, normalize it, render it with headless
 * Chromium and write the resulting PDF.
 *
 * Reads its arguments from process.argv
 * (`<input.html> <output.pdf> [--format=letter|a4]`). Prints usage or an
 * invalid-format message and exits with status 1 when arguments are bad.
 *
 * @returns {Promise<{ outputPath: string, pageCount: number, size: number }>}
 *   Absolute output path, approximate page count and PDF size in bytes.
 */
async function generatePDF() {
  const cli = parseCliArgs(process.argv.slice(2));
  const { format } = cli;

  if (!cli.inputPath || !cli.outputPath) {
    console.error('Usage: node generate-pdf.mjs <input.html> <output.pdf> [--format=letter|a4]');
    process.exit(1);
  }

  const inputPath = resolve(cli.inputPath);
  const outputPath = resolve(cli.outputPath);

  // Validate format
  const validFormats = ['a4', 'letter'];
  if (!validFormats.includes(format)) {
    console.error(`Invalid format "${format}". Use: ${validFormats.join(', ')}`);
    process.exit(1);
  }

  console.log(`📄 Input: ${inputPath}`);
  console.log(`📁 Output: ${outputPath}`);
  console.log(`📏 Format: ${format.toUpperCase()}`);

  // Point ./fonts/ references at the fonts directory next to this script.
  const sourceHtml = await readFile(inputPath, 'utf-8');
  const htmlWithFonts = absolutizeFontUrls(sourceHtml, resolve(__dirname, 'fonts'));

  // Normalize text for ATS compatibility (issue #1)
  const normalized = normalizeTextForATS(htmlWithFonts);
  const totalReplacements = Object.values(normalized.replacements).reduce((a, b) => a + b, 0);
  if (totalReplacements > 0) {
    const breakdown = Object.entries(normalized.replacements).map(([k, v]) => `${k}=${v}`).join(', ');
    console.log(`🧹 ATS normalization: ${totalReplacements} replacements (${breakdown})`);
  }

  const browser = await chromium.launch({ headless: true });
  let pdfBuffer;
  try {
    const page = await browser.newPage();
    // NOTE(review): Playwright documents only `timeout` and `waitUntil` for
    // page.setContent — confirm that `baseURL` has any effect here; relative
    // resources other than the rewritten fonts may not resolve.
    await page.setContent(normalized.html, {
      waitUntil: 'networkidle',
      baseURL: toDirectoryFileUrl(dirname(inputPath)),
    });
    // Wait for fonts to load
    await page.evaluate(() => document.fonts.ready);
    pdfBuffer = await page.pdf({
      format,
      printBackground: true,
      margin: {
        top: '0.6in',
        right: '0.6in',
        bottom: '0.6in',
        left: '0.6in',
      },
      preferCSSPageSize: false,
    });
  } finally {
    // Always shut Chromium down, including when rendering throws.
    await browser.close();
  }

  await writeFile(outputPath, pdfBuffer);
  const pageCount = countPdfPages(pdfBuffer);

  console.log(`✅ PDF generated: ${outputPath}`);
  console.log(`📊 Pages: ${pageCount}`);
  console.log(`📦 Size: ${(pdfBuffer.length / 1024).toFixed(1)} KB`);
  return { outputPath, pageCount, size: pdfBuffer.length };
}
// Script entry: run the generator; on rejection, report the message and exit non-zero.
const reportFailureAndExit = (failure) => {
  console.error('❌ PDF generation failed:', failure.message);
  process.exit(1);
};

generatePDF().catch(reportFailureAndExit);