diff --git a/README.md b/README.md index 3834e90..03280ed 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Convert HTML into plain text while optionally preserving formatting and keeping - Convert HTML to plain text - Preserve formatting such as paragraphs, headings, lists, bold, italic, links, blockquotes, and tables - Optionally ignore specific tags to keep them in the output +- Wrap output by word count or character length - Handles self-closing tags and nested content - Strips unknown tags and decodes common HTML entities (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`) @@ -25,30 +26,60 @@ yarn add html-textify ## Usage ```ts -import { textify } from "html-textify"; +import { textify } from 'html-textify'; // Simple usage -const html = "

Hello World

"; +const html = '

<p>Hello <b>World</b></p>

'; const plain = textify({ html }); -console.log(plain); // "**Hello** World" +console.log(plain); // "Hello **World**" // Preserve formatting but ignore certain tags -const html2 = "

Paragraph bold italic

"; +const html2 = '

Paragraph bold italic

'; const result = textify({ html: html2, preserveFormatting: true, - ignoreTags: ["b", "i"], + ignoreTags: ['b', 'i'], }); console.log(result); // "Paragraph bolditalic" // Strip all tags except ignored ones -const html3 = "

Paragraph highlighted

"; +const html3 = '

Paragraph highlighted

'; const stripped = textify({ html: html3, preserveFormatting: false, - ignoreTags: ["mark"], + ignoreTags: ['mark'], }); console.log(stripped); // "Paragraph highlighted" + +// Wrap by words (max 2 words per line) +const html4 = '

one two three four five

'; +const wrappedWords = textify({ + html: html4, + preserveFormatting: false, + wrapWords: 2, +}); +console.log(wrappedWords); +/* Output: +one two +three four +five +*/ + +// Wrap by characters (max 10 characters per line) +const html5 = '

This is a test sentence for wrapping.

'; +const wrappedChars = textify({ + html: html5, + preserveFormatting: false, + wrapLength: 10, +}); +console.log(wrappedChars); +/* Output: +This is a +test +sentence +for +wrapping. +*/ ``` ## API @@ -58,11 +89,13 @@ console.log(stripped); // "Paragraph highlighted" - `options.html (string)` – HTML string to convert - `options.preserveFormatting (boolean, default: true)` – Whether to keep formatting like lists, headings, blockquotes, bold/italic - `options.ignoreTags (string[], optional)` – Tags to keep intact in output (e.g., ["b", "mark"]) +- `options.wrapWords (number, optional)` – Maximum words per line (takes priority over wrapLength) +- `options.wrapLength (number, optional)` – Maximum characters per line ## Examples ```ts -import { textify } from "html-textify"; +import { textify } from 'html-textify'; const html = `

Title

diff --git a/package.json b/package.json index f29bbd1..183baaf 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "html-textify", - "version": "0.1.2", + "version": "1.0.0", "description": "Convert html to plain text", "main": "dist/index.js", "module": "dist/index.mjs", diff --git a/src/index.ts b/src/index.ts index a6360d5..969ce69 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,28 +1,54 @@ -import preserveFormat from './utils/preserveFormat'; +import { preserveFormat } from './utils/preserveFormat'; +import { wrapByLength } from './utils/wrapByLength'; +import { wrapByWords } from './utils/wrapByWords'; export interface TextifyOptions { html: string; preserveFormatting?: boolean; // optional, default true ignoreTags?: string[]; // optional tags to keep intact + wrapLength?: number; // max chars per line + wrapWords?: number; // max words per line } +/** + * Converts HTML to plain text with optional formatting and wrapping. + * + * @param {Object} options - Configuration options. + * @param {string} options.html - The input HTML string to convert. + * @param {boolean} [options.preserveFormatting=true] - Whether to preserve readable formatting. + * @param {string[]} [options.ignoreTags=[]] - List of HTML tags to keep intact. + * @param {number} [options.wrapLength] - Maximum characters per line (ignored if wrapWords is set). + * @param {number} [options.wrapWords] - Maximum words per line. Takes priority over wrapLength. + * @returns {string} The plain text result with optional wrapping. + * + * @example + * textify({ html: "

Hello world

", preserveFormatting: false }); + * // => "Hello world" + * + * @example + * textify({ html: "

one two three four five

", wrapWords: 2 }); + * // => "one two\nthree four\nfive" + * + * @example + * textify({ html: "

one two three four five

", wrapLength: 10 }); + * // => "one two\nthree four\nfive" + */ export function textify({ html, preserveFormatting = true, ignoreTags = [], + wrapLength, + wrapWords, }: TextifyOptions): string { - // Ignore rest of the function if it's already empty if (!html) return ''; + // Strip or preserve HTML formatting if (preserveFormatting) { - // Keep readable formatting html = preserveFormat({ html, ignoreTags }); } else { if (ignoreTags.length === 0) { - // Strip all tags html = html.replace(/<[^>]+>/g, '').trim(); } else { - // Regex to match all tags except the ignored ones const IG = new Set(ignoreTags.map((t) => t.toLowerCase())); html = html .replace(/<\/?([a-z][a-z0-9-]*)\b[^>]*>/gi, (match, tag) => @@ -31,5 +57,13 @@ export function textify({ .trim(); } } + + // Wrap output text (word-based wrapping takes priority) + if (wrapWords && wrapWords > 0) { + html = wrapByWords(html, wrapWords); + } else if (wrapLength && wrapLength > 0) { + html = wrapByLength(html, wrapLength); + } + return html; } diff --git a/src/textify.test.ts b/src/textify.test.ts index 67f3eeb..80ed5a1 100644 --- a/src/textify.test.ts +++ b/src/textify.test.ts @@ -95,4 +95,37 @@ describe('textify', () => { // since preserveFormat handles it, just check it returns something non-empty expect(result).not.toBe(''); }); + + test('wraps text by word count when wrapWords is set', () => { + const html = '

one two three four five six seven

'; + const result = textify({ html, preserveFormatting: false, wrapWords: 3 }); + expect(result).toBe('one two three\nfour five six\nseven'); + }); + + test('wraps text by character length when wrapLength is set', () => { + const html = '

This is a test sentence for wrapping.

'; + const result = textify({ html, preserveFormatting: false, wrapLength: 10 }); + expect(result).toBe('This is a\ntest\nsentence\nfor\nwrapping.'); + }); + + test('wrapWords takes priority over wrapLength', () => { + const html = '

one two three four five

'; + const result = textify({ + html, + preserveFormatting: false, + wrapWords: 2, + wrapLength: 5, + }); + expect(result).toBe('one two\nthree four\nfive'); + }); + + test('does not wrap when wrapWords or wrapLength is zero or negative', () => { + const html = '

one two three

'; + expect(() => + textify({ html, preserveFormatting: false, wrapWords: 0 }) + ).not.toThrow(); + expect(() => + textify({ html, preserveFormatting: false, wrapLength: 0 }) + ).not.toThrow(); + }); }); diff --git a/src/utils/preserveFormat.test.ts b/src/utils/preserveFormat.test.ts index 551efcd..90d04ab 100644 --- a/src/utils/preserveFormat.test.ts +++ b/src/utils/preserveFormat.test.ts @@ -1,4 +1,4 @@ -import preserveFormat from './preserveFormat'; +import { preserveFormat } from './preserveFormat'; describe('preserveFormat', () => { it('should return empty string for empty input', () => { diff --git a/src/utils/preserveFormat.ts b/src/utils/preserveFormat.ts index 5dd629b..402377e 100644 --- a/src/utils/preserveFormat.ts +++ b/src/utils/preserveFormat.ts @@ -3,7 +3,37 @@ interface PreserveFormatOptions { ignoreTags?: string[]; } -export default function preserveFormat({ +/** + * Converts HTML to a more readable plain-text format while optionally preserving certain tags. + * - Converts headings and paragraphs to double newlines. + * - Converts `
<br>` to newline. + * - Wraps bold (`<b>`, `<strong>`) in `**`. + * - Wraps italic (`<i>`, `<em>`) in `*`. + * - Converts links `<a href="url">text</a>` to `text (url)`. + * - Formats lists (`<ul>`, `<ol>`) and list items. + * - Formats blockquotes (`<blockquote>`) with `> ` prefix. + * - Converts tables to tab-delimited rows. + * - Decodes common HTML entities. + * - Collapses multiple newlines to a maximum of two. + * + * @param {Object} options - Options for preserving format. + * @param {string} options.html - The input HTML string to format. + * @param {string[]} [options.ignoreTags] - List of tags to leave intact (default: `[]`). + * @returns {string} The formatted plain-text representation of the HTML. + * + * @example + * preserveFormat({ html: '

<p>Hello <b>world</b></p>

      ' }); + * // => 'Hello **world**' + * + * @example + * preserveFormat({ html: '
<ul><li>One</li><li>Two</li></ul>
      ' }); + * // => '- One\n- Two' + * + * @example + * preserveFormat({ html: 'Link', ignoreTags: ['a'] }); + * // => 'Link' + */ +export function preserveFormat({ html, ignoreTags = [], }: PreserveFormatOptions): string { diff --git a/src/utils/wrapByLength.test.ts b/src/utils/wrapByLength.test.ts new file mode 100644 index 0000000..22005a4 --- /dev/null +++ b/src/utils/wrapByLength.test.ts @@ -0,0 +1,45 @@ +import { wrapByLength } from './wrapByLength'; + +describe('wrapByLength', () => { + test('wraps text at given character length without breaking words', () => { + const text = 'This is a very long sentence'; + expect(wrapByLength(text, 10)).toBe('This is a\nvery long\nsentence'); + }); + + test('returns text unchanged if shorter than length', () => { + expect(wrapByLength('short text', 20)).toBe('short text'); + }); + + test('handles exact line length', () => { + expect(wrapByLength('12345 67890', 11)).toBe('12345 67890'); + }); + + test('splits multiple lines correctly', () => { + const text = 'one two three four five six seven eight nine'; + expect(wrapByLength(text, 13)).toBe( + 'one two three\nfour five six\nseven eight\nnine' + ); + }); + + test('ignores extra whitespace', () => { + const text = ' alpha beta gamma '; + expect(wrapByLength(text, 8)).toBe('alpha\nbeta\ngamma'); + }); + + test('handles single long word exceeding limit', () => { + const text = 'supercalifragilisticexpialidocious'; + // since function does not force-break words, it stays as is + expect(wrapByLength(text, 10)).toBe(text); + }); + + test('throws error if length is zero or negative', () => { + expect(() => wrapByLength('abc', 0)).toThrow( + 'wrap length must be greater than 0' + ); + expect(() => wrapByLength('abc', -5)).toThrow(); + }); + + test('returns empty string when input is empty', () => { + expect(wrapByLength('', 10)).toBe(''); + }); +}); diff --git a/src/utils/wrapByLength.ts b/src/utils/wrapByLength.ts new file mode 100644 index 0000000..7e871c3 --- /dev/null +++ 
b/src/utils/wrapByLength.ts @@ -0,0 +1,34 @@ +/** + * Wraps text into lines with a maximum number of characters. + * Breaks at word boundaries when possible. + * + * @param {string} text - The input text to wrap. + * @param {number} length - Maximum allowed characters per line. + * @returns {string} The wrapped text, with lines separated by newline characters. + * + * @example + * wrapByLength("This is a very long sentence", 10); + * // => "This is a\nvery long\nsentence" + */ +export function wrapByLength(text: string, length: number): string { + if (length <= 0) { + throw new Error('wrap length must be greater than 0'); + } + + const words = text.trim().split(/\s+/); + const lines: string[] = []; + let line = ''; + + for (const word of words) { + if ((line + ' ' + word).trim().length > length) { + if (line) lines.push(line.trim()); + line = word; + } else { + line += ' ' + word; + } + } + + if (line) lines.push(line.trim()); + + return lines.join('\n'); +} diff --git a/src/utils/wrapByWords.test.ts b/src/utils/wrapByWords.test.ts new file mode 100644 index 0000000..7ad9e19 --- /dev/null +++ b/src/utils/wrapByWords.test.ts @@ -0,0 +1,31 @@ +import { wrapByWords } from './wrapByWords'; + +describe('wrapByWords', () => { + test('splits text into lines of given word count', () => { + const text = 'one two three four five six seven'; + expect(wrapByWords(text, 3)).toBe('one two three\nfour five six\nseven'); + }); + + test('handles text shorter than word count', () => { + const text = 'hello world'; + expect(wrapByWords(text, 5)).toBe('hello world'); + }); + + test('handles exact multiples', () => { + const text = 'a b c d'; + expect(wrapByWords(text, 2)).toBe('a b\nc d'); + }); + + test('ignores extra whitespace', () => { + const text = ' alpha beta gamma '; + expect(wrapByWords(text, 2)).toBe('alpha beta\ngamma'); + }); + + test('handles single word', () => { + expect(wrapByWords('word', 2)).toBe('word'); + }); + + test('returns empty string for empty input', () => 
{ + expect(wrapByWords('', 3)).toBe(''); + }); +}); diff --git a/src/utils/wrapByWords.ts b/src/utils/wrapByWords.ts new file mode 100644 index 0000000..1117671 --- /dev/null +++ b/src/utils/wrapByWords.ts @@ -0,0 +1,21 @@ +/** + * Wraps text into lines containing a fixed number of words. + * + * @param {string} text - The input text to wrap. + * @param {number} count - Maximum number of words per line. Must be greater than 0. + * @returns {string} The wrapped text, with lines separated by newline characters. + * + * @example + * wrapByWords("one two three four five", 2); + * // => "one two\nthree four\nfive" + */ +export function wrapByWords(text: string, count: number): string { + const words = text.trim().split(/\s+/); + const lines: string[] = []; + + for (let i = 0; i < words.length; i += count) { + lines.push(words.slice(i, i + count).join(' ')); + } + + return lines.join('\n'); +}