Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 41 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Convert HTML into plain text while optionally preserving formatting and keeping
- Convert HTML to plain text
- Preserve formatting such as paragraphs, headings, lists, bold, italic, links, blockquotes, and tables
- Optionally ignore specific tags to keep them in the output
- Wrap output by word count or character length
- Handles self-closing tags and nested content
- Strips unknown tags and decodes common HTML entities (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`)

Expand All @@ -25,30 +26,60 @@ yarn add html-textify
## Usage

```ts
import { textify } from "html-textify";
import { textify } from 'html-textify';

// Simple usage
const html = "<p>Hello <b>World</b></p>";
const html = '<p>Hello <b>World</b></p>';
const plain = textify({ html });
console.log(plain); // "**Hello** World"
console.log(plain); // "Hello **World**"

// Preserve formatting but ignore certain tags
const html2 = "<p>Paragraph <b>bold</b> <i>italic</i></p>";
const html2 = '<p>Paragraph <b>bold</b> <i>italic</i></p>';
const result = textify({
html: html2,
preserveFormatting: true,
ignoreTags: ["b", "i"],
ignoreTags: ['b', 'i'],
});
console.log(result); // "Paragraph <b>bold</b><i>italic</i>"

// Strip all tags except ignored ones
const html3 = "<p>Paragraph <mark>highlighted</mark></p>";
const html3 = '<p>Paragraph <mark>highlighted</mark></p>';
const stripped = textify({
html: html3,
preserveFormatting: false,
ignoreTags: ["mark"],
ignoreTags: ['mark'],
});
console.log(stripped); // "Paragraph <mark>highlighted</mark>"

// Wrap by words (max 2 words per line)
const html4 = '<p>one two three four five</p>';
const wrappedWords = textify({
html: html4,
preserveFormatting: false,
wrapWords: 2,
});
console.log(wrappedWords);
/* Output:
one two
three four
five
*/

// Wrap by characters (max 10 characters per line)
const html5 = '<p>This is a test sentence for wrapping.</p>';
const wrappedChars = textify({
html: html5,
preserveFormatting: false,
wrapLength: 10,
});
console.log(wrappedChars);
/* Output:
This is a
test
sentence
for
wrapping.
*/
```

## API
Expand All @@ -58,11 +89,13 @@ console.log(stripped); // "Paragraph <mark>highlighted</mark>"
- `options.html (string)` – HTML string to convert
- `options.preserveFormatting (boolean, default: true)` – Whether to keep formatting like lists, headings, blockquotes, bold/italic
- `options.ignoreTags (string[], optional)` – Tags to keep intact in output (e.g., ["b", "mark"])
- `options.wrapWords (number, optional)` – Maximum words per line (takes priority over wrapLength)
- `options.wrapLength (number, optional)` – Maximum characters per line

## Examples

```ts
import { textify } from "html-textify";
import { textify } from 'html-textify';

const html = `
<h1>Title</h1>
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-textify",
"version": "0.1.2",
"version": "1.0.0",
"description": "Convert html to plain text",
"main": "dist/index.js",
"module": "dist/index.mjs",
Expand Down
44 changes: 39 additions & 5 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,54 @@
import preserveFormat from './utils/preserveFormat';
import { preserveFormat } from './utils/preserveFormat';
import { wrapByLength } from './utils/wrapByLength';
import { wrapByWords } from './utils/wrapByWords';

/**
* Options accepted by `textify`.
*/
export interface TextifyOptions {
/** HTML string to convert. An empty string makes `textify` return `''` immediately. */
html: string;
/** Whether to keep readable formatting via `preserveFormat`. Optional; `textify` defaults it to `true`. */
preserveFormatting?: boolean;
/** Tags to leave intact in the output. Optional; `textify` defaults it to `[]`. */
ignoreTags?: string[];
/** Maximum characters per line. Only applied when positive and `wrapWords` is not a positive number. */
wrapLength?: number;
/** Maximum words per line. Only applied when positive; takes priority over `wrapLength`. */
wrapWords?: number;
}

/**
* Converts HTML to plain text with optional formatting and wrapping.
*
* @param {Object} options - Configuration options.
* @param {string} options.html - The input HTML string to convert.
* @param {boolean} [options.preserveFormatting=true] - Whether to preserve readable formatting.
* @param {string[]} [options.ignoreTags=[]] - List of HTML tags to keep intact.
* @param {number} [options.wrapLength] - Maximum characters per line (ignored if wrapWords is set).
* @param {number} [options.wrapWords] - Maximum words per line. Takes priority over wrapLength.
* @returns {string} The plain text result with optional wrapping.
*
* @example
* textify({ html: "<p>Hello <b>world</b></p>", preserveFormatting: false });
* // => "Hello world"
*
* @example
* textify({ html: "<p>one two three four five</p>", wrapWords: 2 });
* // => "one two\nthree four\nfive"
*
* @example
* textify({ html: "<p>one two three four five</p>", wrapLength: 10 });
* // => "one two\nthree four\nfive"
*/
export function textify({
html,
preserveFormatting = true,
ignoreTags = [],
wrapLength,
wrapWords,
}: TextifyOptions): string {
// Ignore rest of the function if it's already empty
if (!html) return '';

// Strip or preserve HTML formatting
if (preserveFormatting) {
// Keep readable formatting
html = preserveFormat({ html, ignoreTags });
} else {
if (ignoreTags.length === 0) {
// Strip all tags
html = html.replace(/<[^>]+>/g, '').trim();
} else {
// Regex to match all tags except the ignored ones
const IG = new Set(ignoreTags.map((t) => t.toLowerCase()));
html = html
.replace(/<\/?([a-z][a-z0-9-]*)\b[^>]*>/gi, (match, tag) =>
Expand All @@ -31,5 +57,13 @@ export function textify({
.trim();
}
}

// Wrap output text (word-based wrapping takes priority)
if (wrapWords && wrapWords > 0) {
html = wrapByWords(html, wrapWords);
} else if (wrapLength && wrapLength > 0) {
html = wrapByLength(html, wrapLength);
}

return html;
}
33 changes: 33 additions & 0 deletions src/textify.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,37 @@ describe('textify', () => {
// since preserveFormat handles it, just check it returns something non-empty
expect(result).not.toBe('');
});

// With preserveFormatting disabled the <p> tags are stripped first; the seven
// remaining words are then grouped three per line.
test('wraps text by word count when wrapWords is set', () => {
const html = '<p>one two three four five six seven</p>';
const result = textify({ html, preserveFormatting: false, wrapWords: 3 });
expect(result).toBe('one two three\nfour five six\nseven');
});

// Length-based wrapping: "This is a" is 9 characters and fits the limit of 10;
// every following word would push its line past 10, so each lands on its own line.
test('wraps text by character length when wrapLength is set', () => {
const html = '<p>This is a test sentence for wrapping.</p>';
const result = textify({ html, preserveFormatting: false, wrapLength: 10 });
expect(result).toBe('This is a\ntest\nsentence\nfor\nwrapping.');
});

// Both options supplied: the expected output has two words per line, which is
// only possible if wrapWords was used and wrapLength (5) was ignored.
test('wrapWords takes priority over wrapLength', () => {
const html = '<p>one two three four five</p>';
const result = textify({
html,
preserveFormatting: false,
wrapWords: 2,
wrapLength: 5,
});
expect(result).toBe('one two\nthree four\nfive');
});

test('does not wrap when wrapWords or wrapLength is zero or negative', () => {
  const html = '<p>one two three</p>';
  // What tag stripping alone produces; any wrapping would insert '\n'.
  const unwrapped = 'one two three';

  // Cover both zero and a negative limit, as the test title promises.
  // Asserting the exact output also proves nothing was thrown, and that
  // the non-positive limit really disabled wrapping rather than merely
  // not crashing.
  for (const limit of [0, -1]) {
    expect(
      textify({ html, preserveFormatting: false, wrapWords: limit })
    ).toBe(unwrapped);
    expect(
      textify({ html, preserveFormatting: false, wrapLength: limit })
    ).toBe(unwrapped);
  }
});
});
2 changes: 1 addition & 1 deletion src/utils/preserveFormat.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import preserveFormat from './preserveFormat';
import { preserveFormat } from './preserveFormat';

describe('preserveFormat', () => {
it('should return empty string for empty input', () => {
Expand Down
32 changes: 31 additions & 1 deletion src/utils/preserveFormat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,37 @@ interface PreserveFormatOptions {
ignoreTags?: string[];
}

export default function preserveFormat({
/**
* Converts HTML to a more readable plain-text format while optionally preserving certain tags.
* - Converts headings and paragraphs to double newlines.
* - Converts `<br>` to newline.
* - Wraps bold (`<b>`, `<strong>`) in `**`.
* - Wraps italic (`<i>`, `<em>`) in `*`.
* - Converts links `<a href="...">text</a>` to `text (url)`.
* - Formats lists (`<ol>`, `<ul>`) and list items.
* - Formats blockquotes (`<blockquote>`) with `> ` prefix.
* - Converts tables to tab-delimited rows.
* - Decodes common HTML entities.
* - Collapses multiple newlines to a maximum of two.
*
* @param {Object} options - Options for preserving format.
* @param {string} options.html - The input HTML string to format.
* @param {string[]} [options.ignoreTags] - List of tags to leave intact (default: `[]`).
* @returns {string} The formatted plain-text representation of the HTML.
*
* @example
* preserveFormat({ html: '<p>Hello <b>world</b></p>' });
* // => 'Hello **world**'
*
* @example
* preserveFormat({ html: '<ul><li>One</li><li>Two</li></ul>' });
* // => '- One\n- Two'
*
* @example
* preserveFormat({ html: '<a href="https://example.com">Link</a>', ignoreTags: ['a'] });
* // => '<a href="https://example.com">Link</a>'
*/
export function preserveFormat({
html,
ignoreTags = [],
}: PreserveFormatOptions): string {
Expand Down
45 changes: 45 additions & 0 deletions src/utils/wrapByLength.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { wrapByLength } from './wrapByLength';

// Unit tests for wrapByLength: greedy character-limit wrapping at word boundaries.
describe('wrapByLength', () => {
// Words are packed onto a line until adding the next one would exceed the limit.
test('wraps text at given character length without breaking words', () => {
const text = 'This is a very long sentence';
expect(wrapByLength(text, 10)).toBe('This is a\nvery long\nsentence');
});

// Input that already fits within the limit comes back as a single line.
test('returns text unchanged if shorter than length', () => {
expect(wrapByLength('short text', 20)).toBe('short text');
});

// A line whose length equals the limit exactly is still allowed (limit is inclusive).
test('handles exact line length', () => {
expect(wrapByLength('12345 67890', 11)).toBe('12345 67890');
});

// Several consecutive wraps, including lines that hit the limit of 13 exactly.
test('splits multiple lines correctly', () => {
const text = 'one two three four five six seven eight nine';
expect(wrapByLength(text, 13)).toBe(
'one two three\nfour five six\nseven eight\nnine'
);
});

// Leading and trailing whitespace must not appear in the wrapped output.
test('ignores extra whitespace', () => {
const text = ' alpha beta gamma ';
expect(wrapByLength(text, 8)).toBe('alpha\nbeta\ngamma');
});

// A word longer than the limit is kept whole on its own line.
test('handles single long word exceeding limit', () => {
const text = 'supercalifragilisticexpialidocious';
// since function does not force-break words, it stays as is
expect(wrapByLength(text, 10)).toBe(text);
});

// Non-positive limits are rejected with an Error; the message is checked for zero.
test('throws error if length is zero or negative', () => {
expect(() => wrapByLength('abc', 0)).toThrow(
'wrap length must be greater than 0'
);
expect(() => wrapByLength('abc', -5)).toThrow();
});

// Empty input yields an empty string.
test('returns empty string when input is empty', () => {
expect(wrapByLength('', 10)).toBe('');
});
});
34 changes: 34 additions & 0 deletions src/utils/wrapByLength.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
 * Greedily wraps text so that each line holds as many whole words as fit
 * within a character limit.
 *
 * The input is trimmed and split on runs of whitespace, so existing spacing
 * and line breaks are not preserved. Words are never split: a single word
 * longer than the limit is emitted on its own line, unbroken.
 *
 * @param {string} text - The input text to wrap.
 * @param {number} length - Maximum allowed characters per line.
 * @returns {string} The wrapped text, with lines separated by newline characters.
 * @throws {Error} If `length` is zero or negative.
 *
 * @example
 * wrapByLength("This is a very long sentence", 10);
 * // => "This is a\nvery long\nsentence"
 */
export function wrapByLength(text: string, length: number): string {
  if (length <= 0) {
    throw new Error('wrap length must be greater than 0');
  }

  const tokens = text.trim().split(/\s+/);
  const output: string[] = [];
  let current = '';

  tokens.forEach((token) => {
    // The line as it would read if this token were appended to it.
    const candidate = current === '' ? token : `${current} ${token}`;

    if (candidate.length > length) {
      // Token does not fit: flush the line in progress (if any) and start
      // a new line with this token, even if the token alone is too long.
      if (current !== '') output.push(current);
      current = token;
    } else {
      current = candidate;
    }
  });

  if (current !== '') output.push(current);

  return output.join('\n');
}
31 changes: 31 additions & 0 deletions src/utils/wrapByWords.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { wrapByWords } from './wrapByWords';

// Unit tests for wrapByWords: fixed word-count wrapping.
describe('wrapByWords', () => {
// Seven words at three per line: two full lines plus a one-word remainder.
test('splits text into lines of given word count', () => {
const text = 'one two three four five six seven';
expect(wrapByWords(text, 3)).toBe('one two three\nfour five six\nseven');
});

// Fewer words than the limit: everything stays on a single line.
test('handles text shorter than word count', () => {
const text = 'hello world';
expect(wrapByWords(text, 5)).toBe('hello world');
});

// Word total divisible by the limit: no trailing partial or empty line.
test('handles exact multiples', () => {
const text = 'a b c d';
expect(wrapByWords(text, 2)).toBe('a b\nc d');
});

// Leading and trailing whitespace must not produce empty words in the output.
test('ignores extra whitespace', () => {
const text = ' alpha beta gamma ';
expect(wrapByWords(text, 2)).toBe('alpha beta\ngamma');
});

// A single word is returned as-is.
test('handles single word', () => {
expect(wrapByWords('word', 2)).toBe('word');
});

// Empty input yields an empty string.
test('returns empty string for empty input', () => {
expect(wrapByWords('', 3)).toBe('');
});
});
21 changes: 21 additions & 0 deletions src/utils/wrapByWords.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Wraps text into lines containing at most a fixed number of words.
 *
 * The input is trimmed and split on runs of whitespace, so existing spacing
 * and line breaks are not preserved; words on a line are joined by a single
 * space.
 *
 * @param {string} text - The input text to wrap.
 * @param {number} count - Maximum number of words per line. Must be greater
 *   than 0; fractional values are rounded down, with a minimum of 1.
 * @returns {string} The wrapped text, with lines separated by newline characters.
 * @throws {Error} If `count` is zero, negative, or NaN.
 *
 * @example
 * wrapByWords("one two three four five", 2);
 * // => "one two\nthree four\nfive"
 */
export function wrapByWords(text: string, count: number): string {
  // Reject non-positive and NaN counts up front, mirroring wrapByLength.
  // Without this, a zero or negative count never advances the loop index
  // below, so the loop would run forever while growing `lines`.
  if (!(count > 0)) {
    throw new Error('wrap word count must be greater than 0');
  }

  // Use a whole-number step of at least 1 so every line gets the same
  // number of words and the loop always makes progress.
  const step = Math.max(1, Math.floor(count));

  const words = text.trim().split(/\s+/);
  const lines: string[] = [];

  for (let i = 0; i < words.length; i += step) {
    lines.push(words.slice(i, i + step).join(' '));
  }

  return lines.join('\n');
}