diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..7b3cfad --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true +indent_style = space +indent_size = 2 + +[*.{diff,md}] +trim_trailing_whitespace = false diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b4d59e..9659964 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,4 @@ -name: CI Push +name: Push to main on: push: @@ -19,5 +19,6 @@ jobs: version: 8 - run: pnpm install - run: pnpm lint + - run: pnpm format - run: pnpm test - run: pnpm build diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 167a155..94740e1 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -30,6 +30,9 @@ jobs: - name: Run lint run: pnpm lint + - name: Run format + run: pnpm format + - name: Run tests run: pnpm test diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..f06235c --- /dev/null +++ b/.prettierignore @@ -0,0 +1,2 @@ +node_modules +dist diff --git a/.prettierrc.json b/.prettierrc.json new file mode 100644 index 0000000..eba5a68 --- /dev/null +++ b/.prettierrc.json @@ -0,0 +1,7 @@ +{ + "semi": true, + "singleQuote": true, + "trailingComma": "es5", + "printWidth": 80, + "tabWidth": 2 +} diff --git a/README.md b/README.md index 3834e90..03280ed 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Convert HTML into plain text while optionally preserving formatting and keeping - Convert HTML to plain text - Preserve formatting such as paragraphs, headings, lists, bold, italic, links, blockquotes, and tables - Optionally ignore specific tags to keep them in the output +- Wrap output by word count or character length - Handles self-closing tags and nested content - Strips unknown tags and decodes common HTML entities (` `, `&`, `<`, 
`>`) @@ -25,30 +26,60 @@ yarn add html-textify ## Usage ```ts -import { textify } from "html-textify"; +import { textify } from 'html-textify'; // Simple usage -const html = "
Hello World
"; +const html = 'Hello World
'; const plain = textify({ html }); -console.log(plain); // "**Hello** World" +console.log(plain); // "Hello **World**" // Preserve formatting but ignore certain tags -const html2 = "Paragraph bold italic
"; +const html2 = 'Paragraph bold italic
'; const result = textify({ html: html2, preserveFormatting: true, - ignoreTags: ["b", "i"], + ignoreTags: ['b', 'i'], }); console.log(result); // "Paragraph bolditalic" // Strip all tags except ignored ones -const html3 = "Paragraph highlighted
"; +const html3 = 'Paragraph highlighted
'; const stripped = textify({ html: html3, preserveFormatting: false, - ignoreTags: ["mark"], + ignoreTags: ['mark'], }); console.log(stripped); // "Paragraph highlighted" + +// Wrap by words (max 2 words per line) +const html4 = 'one two three four five
'; +const wrappedWords = textify({ + html: html4, + preserveFormatting: false, + wrapWords: 2, +}); +console.log(wrappedWords); +/* Output: +one two +three four +five +*/ + +// Wrap by characters (max 10 characters per line) +const html5 = 'This is a test sentence for wrapping.
'; +const wrappedChars = textify({ + html: html5, + preserveFormatting: false, + wrapLength: 10, +}); +console.log(wrappedChars); +/* Output: +This is a +test +sentence +for +wrapping. +*/ ``` ## API @@ -58,11 +89,13 @@ console.log(stripped); // "Paragraph highlighted" - `options.html (string)` – HTML string to convert - `options.preserveFormatting (boolean, default: true)` – Whether to keep formatting like lists, headings, blockquotes, bold/italic - `options.ignoreTags (string[], optional)` – Tags to keep intact in output (e.g., ["b", "mark"]) +- `options.wrapWords (number, optional)` – Maximum words per line (takes priority over wrapLength) +- `options.wrapLength (number, optional)` – Maximum characters per line ## Examples ```ts -import { textify } from "html-textify"; +import { textify } from 'html-textify'; const html = `Hello world
", preserveFormatting: false }); + * // => "Hello world" + * + * @example + * textify({ html: "one two three four five
", wrapWords: 2 }); + * // => "one two\nthree four\nfive" + * + * @example + * textify({ html: "one two three four five
", wrapLength: 10 }); + * // => "one two\nthree four\nfive" + */ export function textify({ html, preserveFormatting = true, ignoreTags = [], + wrapLength, + wrapWords, }: TextifyOptions): string { - // Ignore rest of the function if it's already empty - if (!html) return ""; + if (!html) return ''; + // Strip or preserve HTML formatting if (preserveFormatting) { - // Keep readable formatting html = preserveFormat({ html, ignoreTags }); } else { if (ignoreTags.length === 0) { - // Strip all tags - html = html.replace(/<[^>]+>/g, "").trim(); + html = html.replace(/<[^>]+>/g, '').trim(); } else { - // Regex to match all tags except the ignored ones const IG = new Set(ignoreTags.map((t) => t.toLowerCase())); html = html .replace(/<\/?([a-z][a-z0-9-]*)\b[^>]*>/gi, (match, tag) => - IG.has(tag.toLowerCase()) ? match : "" + IG.has(tag.toLowerCase()) ? match : '' ) .trim(); } } + + // Wrap output text (word-based wrapping takes priority) + if (wrapWords && wrapWords > 0) { + html = wrapByWords(html, wrapWords); + } else if (wrapLength && wrapLength > 0) { + html = wrapByLength(html, wrapLength); + } + return html; } diff --git a/src/textify.test.ts b/src/textify.test.ts index 1e678cc..80ed5a1 100644 --- a/src/textify.test.ts +++ b/src/textify.test.ts @@ -1,98 +1,131 @@ -import { textify } from "./index"; +import { textify } from './index'; -describe("textify", () => { - test("returns empty string if html is empty", () => { - expect(textify({ html: "" })).toBe(""); - expect(textify({ html: null as unknown as string })).toBe(""); +describe('textify', () => { + test('returns empty string if html is empty', () => { + expect(textify({ html: '' })).toBe(''); + expect(textify({ html: null as unknown as string })).toBe(''); }); - test("strips all tags except ignored ones", () => { + test('strips all tags except ignored ones', () => { const html = - "Paragraph bold
Paragraph bold
Paragraph bold italic underlined
"; + test('handles multiple ignored tags', () => { + const html = 'Paragraph bold italic underlined
'; const result = textify({ html, preserveFormatting: false, - ignoreTags: ["b", "u"], + ignoreTags: ['b', 'u'], }); - expect(result).toBe("Paragraph bold italic underlined"); + expect(result).toBe('Paragraph bold italic underlined'); }); - test("trims whitespace after stripping tags", () => { - const html = "Test
"; + test('trims whitespace after stripping tags', () => { + const html = 'Test
'; const result = textify({ html, preserveFormatting: false }); - expect(result).toBe("Test"); + expect(result).toBe('Test'); }); - test("preserveFormat has no effect when they are in ignoreTags", () => { - const html = "Paragraph bold italic
"; + test('preserveFormat has no effect when they are in ignoreTags', () => { + const html = 'Paragraph bold italic
'; const result = textify({ html, preserveFormatting: true, - ignoreTags: ["b", "i"], + ignoreTags: ['b', 'i'], }); - expect(result).toBe("Paragraph bolditalic"); + expect(result).toBe('Paragraph bolditalic'); }); - test("removes all tags when ignoreTags is empty", () => { - const html = "Text with bold tag
"; + test('case-insensitive matching for ignoreTags', () => { + const html = 'Text with bold tag
'; const result = textify({ html, preserveFormatting: false, - ignoreTags: ["b"], + ignoreTags: ['b'], }); - expect(result).toBe("Text with bold tag"); + expect(result).toBe('Text with bold tag'); }); - test("self-closing ignored tags are preserved", () => { - const html = "Line breakHello world
"; + test('preserveFormatting=true delegates to preserveFormat', () => { + const html = 'Hello world
'; const result = textify({ html, preserveFormatting: true }); // since preserveFormat handles it, just check it returns something non-empty - expect(result).not.toBe(""); + expect(result).not.toBe(''); + }); + + test('wraps text by word count when wrapWords is set', () => { + const html = 'one two three four five six seven
'; + const result = textify({ html, preserveFormatting: false, wrapWords: 3 }); + expect(result).toBe('one two three\nfour five six\nseven'); + }); + + test('wraps text by character length when wrapLength is set', () => { + const html = 'This is a test sentence for wrapping.
'; + const result = textify({ html, preserveFormatting: false, wrapLength: 10 }); + expect(result).toBe('This is a\ntest\nsentence\nfor\nwrapping.'); + }); + + test('wrapWords takes priority over wrapLength', () => { + const html = 'one two three four five
'; + const result = textify({ + html, + preserveFormatting: false, + wrapWords: 2, + wrapLength: 5, + }); + expect(result).toBe('one two\nthree four\nfive'); + }); + + test('does not wrap when wrapWords or wrapLength is zero or negative', () => { + const html = 'one two three
'; + expect(() => + textify({ html, preserveFormatting: false, wrapWords: 0 }) + ).not.toThrow(); + expect(() => + textify({ html, preserveFormatting: false, wrapLength: 0 }) + ).not.toThrow(); }); }); diff --git a/src/utils/preserveFormat.test.ts b/src/utils/preserveFormat.test.ts index b878fd5..90d04ab 100644 --- a/src/utils/preserveFormat.test.ts +++ b/src/utils/preserveFormat.test.ts @@ -1,111 +1,111 @@ -import preserveFormat from "./preserveFormat"; +import { preserveFormat } from './preserveFormat'; -describe("preserveFormat", () => { - it("should return empty string for empty input", () => { - expect(preserveFormat({ html: "" })).toBe(""); +describe('preserveFormat', () => { + it('should return empty string for empty input', () => { + expect(preserveFormat({ html: '' })).toBe(''); }); - it("should handle line breaks and paragraphs", () => { - const html = "First paragraph
First paragraph
Quote line 1"; - const expected = "> Quote line 1\n> Quote line 2"; + it('should handle blockquotes', () => { + const html = '
Quote line 2
Quote line 1'; + const expected = '> Quote line 1\n> Quote line 2'; expect(preserveFormat({ html })).toBe(expected); }); - it("should handle tables", () => { + it('should handle tables', () => { const html = `
Quote line 2
| A1 | B1 |
| A2 | B2 |
Line 1 & test
Line 2
"; - const expected = "Line 1 & test\n\nLine 2"; + it('should handle multiple newlines and HTML entities', () => { + const html = 'Line 1 & test
Line 2
'; + const expected = 'Line 1 & test\n\nLine 2'; expect(preserveFormat({ html })).toBe(expected); }); - it("should strip unknown tags", () => { - const html = "Paragraph
"; - const expected = "Paragraph
"; - expect(preserveFormat({ html, ignoreTags: ["h1", "p"] })).toBe(expected); + it('should ignore headings and paragraphs when listed in ignoreTags', () => { + const html = 'Paragraph
'; + const expected = 'Paragraph
'; + expect(preserveFormat({ html, ignoreTags: ['h1', 'p'] })).toBe(expected); }); - it("should ignore links when listed in ignoreTags", () => { + it('should ignore links when listed in ignoreTags', () => { const html = 'Click here'; const expected = 'Click here'; - expect(preserveFormat({ html, ignoreTags: ["a"] })).toBe(expected); + expect(preserveFormat({ html, ignoreTags: ['a'] })).toBe(expected); }); - it("should ignore blockquotes when listed in ignoreTags", () => { - const html = "Quote line 1"; - const expected = "
Quote line 2
Quote line 1\nQuote line 2"; - expect(preserveFormat({ html, ignoreTags: ["blockquote"] })).toBe(expected); + it('should ignore blockquotes when listed in ignoreTags', () => { + const html = '
Quote line 1'; + const expected = '
Quote line 2
Quote line 1\nQuote line 2'; + expect(preserveFormat({ html, ignoreTags: ['blockquote'] })).toBe(expected); }); - it("should ignore bold and italic when listed in ignoreTags", () => { - const html = "Bold and Italic"; - const expected = "Bold and Italic"; - expect(preserveFormat({ html, ignoreTags: ["b", "i"] })).toBe(expected); + it('should ignore bold and italic when listed in ignoreTags', () => { + const html = 'Bold and Italic'; + const expected = 'Bold and Italic'; + expect(preserveFormat({ html, ignoreTags: ['b', 'i'] })).toBe(expected); }); - it("should ignore table rows and cells when listed in ignoreTags", () => { + it('should ignore table rows and cells when listed in ignoreTags', () => { const html = `
| A1 | B1 |
| A2 | B2 |
| A1 | B1 |
| A2 | B2 |
Paragraph with bold and italic text
`) with `> ` prefix. + * - Converts tables to tab-delimited rows. + * - Decodes common HTML entities. + * - Collapses multiple newlines to a maximum of two. + * + * @param {Object} options - Options for preserving format. + * @param {string} options.html - The input HTML string to format. + * @param {string[]} [options.ignoreTags] - List of tags to leave intact (default: `[]`). + * @returns {string} The formatted plain-text representation of the HTML. + * + * @example + * preserveFormat({ html: 'Hello world
' }); + * // => 'Hello **world**' + * + * @example + * preserveFormat({ html: '' }); + * // => '- One\n- Two' + * + * @example + * preserveFormat({ html: 'Link', ignoreTags: ['a'] }); + * // => 'Link' + */ +export function preserveFormat({ html, ignoreTags = [], }: PreserveFormatOptions): string { - if (!html) return ""; + if (!html) return ''; // Normalize spaces between tags - html = html.replace(/>\s+<"); + html = html.replace(/>\s+<'); // Convert
- One
- Two
to newline - html = !ignoreTags.includes("br") ? html.replace(/
/gi, "\n") : html; + html = !ignoreTags.includes('br') ? html.replace(/
/gi, '\n') : html; // Headings and paragraphs -> double newline html = html.replace(/<\/(h[1-6]|p)>/gi, (match, tag) => - ignoreTags.includes(tag.toLowerCase()) ? match : "\n\n" + ignoreTags.includes(tag.toLowerCase()) ? match : '\n\n' ); // Bold @@ -33,7 +63,7 @@ export default function preserveFormat({ ); // Links - html = !ignoreTags.includes("a") + html = !ignoreTags.includes('a') ? html.replace( /(.*?)<\/a>/gi, (_m, href: string, text: string) => `${text} (${href})` @@ -42,32 +72,32 @@ export default function preserveFormat({ // Ordered lists html = html.replace(/(.*?)<\/ol>/gis, (match, content: string) => { - if (ignoreTags.includes("ol")) return match; // leave
as-is + if (ignoreTags.includes('ol')) return match; // leave
as-is let counter = 0; return content.replace(/
- (.*?)<\/li>/gi, (liMatch, liContent: string) => - ignoreTags.includes("li") ? liMatch : `${++counter}. ${liContent}\n` + ignoreTags.includes('li') ? liMatch : `${++counter}. ${liContent}\n` ); }); // Unordered lists html = html.replace(/
(.*?)<\/ul>/gis, (match, content: string) => { - if (ignoreTags.includes("ul")) return match; // keep whole
block + if (ignoreTags.includes('ul')) return match; // keep whole
block return content.replace(/
- (.*?)<\/li>/gi, (liMatch, liContent: string) => - ignoreTags.includes("li") ? liMatch : `- ${liContent}\n` + ignoreTags.includes('li') ? liMatch : `- ${liContent}\n` ); }); // Blockquotes - html = !ignoreTags.includes("blockquote") + html = !ignoreTags.includes('blockquote') ? html.replace( /
(.*?)<\/blockquote>/gis, (_m, content: string) => content - .replace(/
/gi, "\n") + .replace(/
/gi, '\n') .trim() - .split("\n") + .split('\n') .map((line) => `> ${line.trim()}`) - .join("\n") + .join('\n') ) : html; @@ -75,21 +105,21 @@ export default function preserveFormat({ html = html.replace( /(.*?)<\/table>/gis, (match, tableContent: string) => { - if (ignoreTags.includes("table")) return match; // keep whole table + if (ignoreTags.includes('table')) return match; // keep whole table return tableContent .replace(/
(.*?)<\/tr>/gi, (trMatch, rowContent: string) => { - if (ignoreTags.includes("tr")) return trMatch; + if (ignoreTags.includes('tr')) return trMatch; return ( rowContent .replace( / (.*?)<\/t[dh]>/gi, (cellMatch, cellContent: string) => - ignoreTags.includes("td") || ignoreTags.includes("th") + ignoreTags.includes('td') || ignoreTags.includes('th') ? cellMatch : `${cellContent}\t` ) .trim() - .replace(/\t$/, "") + "\n" + .replace(/\t$/, '') + '\n' ); }) .trim(); @@ -98,22 +128,22 @@ export default function preserveFormat({ // Remove all remaining tags if (ignoreTags.length === 0) { - html = html.replace(/<[^>]+>/g, ""); + html = html.replace(/<[^>]+>/g, ''); } else { html = html.replace(/<\/?([a-z0-9]+)[^>]*>/gi, (match, tag: string) => - ignoreTags.includes(tag.toLowerCase()) ? match : "" + ignoreTags.includes(tag.toLowerCase()) ? match : '' ); } // Decode common HTML entities html = html - .replace(/ /gi, " ") - .replace(/&/gi, "&") - .replace(/</gi, "<") - .replace(/>/gi, ">"); + .replace(/ /gi, ' ') + .replace(/&/gi, '&') + .replace(/</gi, '<') + .replace(/>/gi, '>'); // Collapse multiple newlines to max two - html = html.replace(/\n{3,}/g, "\n\n").trim(); + html = html.replace(/\n{3,}/g, '\n\n').trim(); return html; } diff --git a/src/utils/wrapByLength.test.ts b/src/utils/wrapByLength.test.ts new file mode 100644 index 0000000..22005a4 --- /dev/null +++ b/src/utils/wrapByLength.test.ts @@ -0,0 +1,45 @@ +import { wrapByLength } from './wrapByLength'; + +describe('wrapByLength', () => { + test('wraps text at given character length without breaking words', () => { + const text = 'This is a very long sentence'; + expect(wrapByLength(text, 10)).toBe('This is a\nvery long\nsentence'); + }); + + test('returns text unchanged if shorter than length', () => { + expect(wrapByLength('short text', 20)).toBe('short text'); + }); + + test('handles exact line length', () => { + expect(wrapByLength('12345 67890', 11)).toBe('12345 67890'); + }); + + test('splits multiple lines 
correctly', () => { + const text = 'one two three four five six seven eight nine'; + expect(wrapByLength(text, 13)).toBe( + 'one two three\nfour five six\nseven eight\nnine' + ); + }); + + test('ignores extra whitespace', () => { + const text = ' alpha beta gamma '; + expect(wrapByLength(text, 8)).toBe('alpha\nbeta\ngamma'); + }); + + test('handles single long word exceeding limit', () => { + const text = 'supercalifragilisticexpialidocious'; + // since function does not force-break words, it stays as is + expect(wrapByLength(text, 10)).toBe(text); + }); + + test('throws error if length is zero or negative', () => { + expect(() => wrapByLength('abc', 0)).toThrow( + 'wrap length must be greater than 0' + ); + expect(() => wrapByLength('abc', -5)).toThrow(); + }); + + test('returns empty string when input is empty', () => { + expect(wrapByLength('', 10)).toBe(''); + }); +}); diff --git a/src/utils/wrapByLength.ts b/src/utils/wrapByLength.ts new file mode 100644 index 0000000..7e871c3 --- /dev/null +++ b/src/utils/wrapByLength.ts @@ -0,0 +1,34 @@ +/** + * Wraps text into lines with a maximum number of characters. + * Breaks at word boundaries when possible. + * + * @param {string} text - The input text to wrap. + * @param {number} length - Maximum allowed characters per line. + * @returns {string} The wrapped text, with lines separated by newline characters. 
+ * + * @example + * wrapByLength("This is a very long sentence", 10); + * // => "This is a\nvery long\nsentence" + */ +export function wrapByLength(text: string, length: number): string { + if (length <= 0) { + throw new Error('wrap length must be greater than 0'); + } + + const words = text.trim().split(/\s+/); + const lines: string[] = []; + let line = ''; + + for (const word of words) { + if ((line + ' ' + word).trim().length > length) { + if (line) lines.push(line.trim()); + line = word; + } else { + line += ' ' + word; + } + } + + if (line) lines.push(line.trim()); + + return lines.join('\n'); +} diff --git a/src/utils/wrapByWords.test.ts b/src/utils/wrapByWords.test.ts new file mode 100644 index 0000000..7ad9e19 --- /dev/null +++ b/src/utils/wrapByWords.test.ts @@ -0,0 +1,31 @@ +import { wrapByWords } from './wrapByWords'; + +describe('wrapByWords', () => { + test('splits text into lines of given word count', () => { + const text = 'one two three four five six seven'; + expect(wrapByWords(text, 3)).toBe('one two three\nfour five six\nseven'); + }); + + test('handles text shorter than word count', () => { + const text = 'hello world'; + expect(wrapByWords(text, 5)).toBe('hello world'); + }); + + test('handles exact multiples', () => { + const text = 'a b c d'; + expect(wrapByWords(text, 2)).toBe('a b\nc d'); + }); + + test('ignores extra whitespace', () => { + const text = ' alpha beta gamma '; + expect(wrapByWords(text, 2)).toBe('alpha beta\ngamma'); + }); + + test('handles single word', () => { + expect(wrapByWords('word', 2)).toBe('word'); + }); + + test('returns empty string for empty input', () => { + expect(wrapByWords('', 3)).toBe(''); + }); +}); diff --git a/src/utils/wrapByWords.ts b/src/utils/wrapByWords.ts new file mode 100644 index 0000000..1117671 --- /dev/null +++ b/src/utils/wrapByWords.ts @@ -0,0 +1,21 @@ +/** + * Wraps text into lines containing a fixed number of words. + * + * @param {string} text - The input text to wrap. 
/**
 * Wraps text into lines containing at most `count` words each.
 *
 * A "word" is any run of non-whitespace characters. Leading/trailing
 * whitespace is dropped and every run of whitespace between words
 * (including existing newlines) is collapsed to a single space.
 *
 * @param text - The input text to wrap.
 * @param count - Maximum number of words per line. Must be greater than 0.
 *   Fractional values are rounded down, but never below one word per line.
 * @returns The wrapped text, with lines separated by newline characters.
 *   Empty or whitespace-only input yields an empty string.
 * @throws {Error} If `count` is zero, negative, or `NaN`.
 *
 * @example
 * wrapByWords("one two three four five", 2);
 * // => "one two\nthree four\nfive"
 */
export function wrapByWords(text: string, count: number): string {
  // `!(count > 0)` also rejects NaN. Without this guard a non-positive count
  // never advances the loop index below and the function spins forever.
  if (!(count > 0)) {
    throw new Error('wrap count must be greater than 0');
  }

  // Use a whole-number step: `slice` truncates fractional bounds, so stepping
  // by a fraction would yield uneven (or empty) lines.
  const wordsPerLine = Math.max(1, Math.floor(count));

  const words = text.trim().split(/\s+/);
  const lines: string[] = [];

  for (let i = 0; i < words.length; i += wordsPerLine) {
    lines.push(words.slice(i, i + wordsPerLine).join(' '));
  }

  return lines.join('\n');
}