Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@ app.*.map.json
/android/fastlane/.env
/test/screenshots/goldens/
/android/fastlane/metadata/android/en-US/images/
/:memory:
176 changes: 176 additions & 0 deletions lib/parsing/content_processor.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import 'package:collection/collection.dart';
import 'package:html/dom.dart' as dom;
import 'package:html/parser.dart';

import 'paragraph.dart';

/// Converts an HTML document into a flat list of [Paragraph]s.
///
/// Processing happens in two passes: first every element matching the tag or
/// attribute blacklists is removed together with its subtree, then the
/// remaining tree is walked and each leaf element or non-blank text node is
/// turned into a [Paragraph].
class ContentProcessor {
  /// Tag names whose elements are removed before paragraphs are extracted.
  static const Set<String> _blacklistedTags = {
    'header',
    'footer',
    'script',
    'noscript',
    'style',
    'nav',
    'button',
    'aside',
  };

  /// Attribute name -> values that cause an element to be removed.
  ///
  /// For `class`, an element matches when the whole attribute value or any
  /// single whitespace-separated class token is listed. For every other
  /// attribute only the whole value is compared.
  static const Map<String, Set<String>> _blacklistedAttributes = {
    'class': {
      'sidebar',
      'right-sidebar',
      'post-footer',
      'post-header',
      'footer-widget',
      'hide-on-button',
      'highlight-and-share-wrapper',
      'related-posts',
      'adsbygoogle',
      'has_twitter',
      // A comma was missing after this entry, which silently fused it with
      // the following literal into one bogus class name.
      'social-icons',
      'comments',
      // Contains a space, so it can only match an attribute value that is
      // exactly "foot section"; it never matches a single class token.
      'foot section',
    },
    'id': {
      'sidebar',
      'right-sidebar',
      'subscription-nudge',
      'comments',
      'right-sidebar-inner',
      'related-posts',
      'adsbygoogle',
      'has-mastodon-prompt',
      'cssnav',
      'PageList1',
    },
  };

  /// Tags that produce a [BoldFeature].
  static const Set<String> _boldTags = {'b', 'strong'};

  /// Tags that produce an [ItalicFeature].
  static const Set<String> _italicTags = {'i', 'em'};

  /// Separator used to split a `class` attribute into individual tokens.
  static final RegExp _whitespace = RegExp(r'\s+');

  /// Matches `h1`..`h6`; group 1 is the heading level.
  static final RegExp _headingTag = RegExp(r'^h([1-6])$');

  /// Parses [html], strips blacklisted elements and returns the remaining
  /// content as paragraphs in document order.
  ///
  /// Returns an empty list when the parsed document has no body.
  List<Paragraph> process(String html) {
    final body = parse(html).body;
    if (body == null) {
      return <Paragraph>[];
    }

    _removeBlacklistedElements(body);
    return _parseElements(body.nodes);
  }

  /// Recursively flattens [nodes] into paragraphs.
  ///
  /// * An element with more than one child, or with any non-text child, is
  ///   not emitted itself; its children are processed instead.
  /// * A childless `img` becomes an empty-text paragraph carrying a single
  ///   [ImageFeature].
  /// * Any other leaf element, and any bare text node, becomes a paragraph
  ///   when its text is not blank. The text is stored untrimmed.
  List<Paragraph> _parseElements(List<dom.Node> nodes) {
    final paragraphs = <Paragraph>[];

    for (final node in nodes) {
      if (node is dom.Element) {
        final hasNestedContent = node.nodes.length > 1 ||
            node.nodes.firstWhereOrNull((n) => n is! dom.Text) != null;

        if (hasNestedContent) {
          paragraphs.addAll(_parseElements(node.nodes));
        } else if (node.localName == 'img') {
          final url = node.attributes['src'] ?? '';
          paragraphs.add(Paragraph(
            text: '',
            features: <SpecialFeature>[
              ImageFeature(start: 0, end: 0, url: url),
            ],
          ));
        } else if (node.text.trim().isNotEmpty) {
          paragraphs.add(Paragraph(
            text: node.text,
            features: _extractFeatures(node),
          ));
        }
      } else if (node is dom.Text) {
        if (node.text.trim().isNotEmpty) {
          paragraphs.add(Paragraph(
            text: node.text,
            features: <SpecialFeature>[],
          ));
        }
      }
    }

    return paragraphs;
  }

  /// Removes every blacklisted descendant of [element] in place.
  ///
  /// A blacklisted element is dropped with its whole subtree, so its
  /// children are not inspected.
  void _removeBlacklistedElements(dom.Element element) {
    // Iterate over a snapshot because the live node list is mutated below.
    for (final child in element.nodes.toList()) {
      if (child is! dom.Element) {
        continue;
      }
      if (_isBlacklisted(child)) {
        element.nodes.remove(child);
      } else {
        _removeBlacklistedElements(child);
      }
    }
  }

  /// Whether [element] should be stripped from the document.
  bool _isBlacklisted(dom.Element element) {
    if (_blacklistedTags.contains(element.localName)) {
      return true;
    }

    for (final entry in _blacklistedAttributes.entries) {
      final value = element.attributes[entry.key];
      if (value == null) {
        continue;
      }

      // Whole-value match (the only rule applied to `id`).
      if (entry.value.contains(value)) {
        return true;
      }

      // `class` holds a space-separated token list, so an element such as
      // class="widget sidebar" must match on the individual "sidebar" token.
      if (entry.key == 'class' &&
          value.trim().split(_whitespace).any(entry.value.contains)) {
        return true;
      }
    }

    return false;
  }

  /// Builds the features describing [element]'s own tag and its bold/italic
  /// descendants.
  ///
  /// Offsets are indices into `element.text`. Features derived from the
  /// element's own tag (link, bold, italic, heading) span the whole text.
  /// Returns an empty list when [element] is not a [dom.Element].
  List<SpecialFeature> _extractFeatures(dom.Node element) {
    final features = <SpecialFeature>[];
    if (element is! dom.Element) {
      return features;
    }

    final tag = element.localName;
    final content = element.text;
    final contentEnd = content.length;

    // Links: the anchor itself covers its entire text.
    if (tag == 'a') {
      final url = element.attributes['href'] ?? '';
      features.add(LinkFeature(start: 0, end: contentEnd, url: url));
    }

    // Bold: the element itself (leaf <b>/<strong> elements are what
    // _parseElements actually hands in) plus any bold descendants.
    if (_boldTags.contains(tag)) {
      features.add(BoldFeature(start: 0, end: contentEnd));
    }
    for (final boldTag in _boldTags) {
      for (final bold in element.getElementsByTagName(boldTag)) {
        // NOTE(review): indexOf locates the first occurrence of the text, so
        // repeated substrings may be attributed to the wrong position.
        final start = content.indexOf(bold.text);
        features.add(BoldFeature(start: start, end: start + bold.text.length));
      }
    }

    // Italic: same approach as bold.
    if (_italicTags.contains(tag)) {
      features.add(ItalicFeature(start: 0, end: contentEnd));
    }
    for (final italicTag in _italicTags) {
      for (final italic in element.getElementsByTagName(italicTag)) {
        final start = content.indexOf(italic.text);
        features.add(
          ItalicFeature(start: start, end: start + italic.text.length),
        );
      }
    }

    // Headings h1..h6 cover the whole text and record their level.
    final heading = _headingTag.firstMatch(tag ?? '');
    if (heading != null) {
      features.add(HeadingFeature(
        start: 0,
        end: contentEnd,
        level: int.parse(heading.group(1)!),
      ));
    }

    // Images carry no text, hence the empty range.
    if (tag == 'img') {
      final url = element.attributes['src'] ?? '';
      features.add(ImageFeature(start: 0, end: 0, url: url));
    }

    return features;
  }
}
47 changes: 47 additions & 0 deletions lib/parsing/paragraph.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/// A run of text together with the formatting/semantic features that apply
/// to it.
class Paragraph {
  /// The paragraph's text content.
  final String text;

  /// Features attached to this paragraph; empty by default.
  final List<SpecialFeature> features;

  /// Creates a paragraph. Const so that fixed paragraphs can be compile-time
  /// constants.
  const Paragraph({required this.text, this.features = const []});
}

/// Base class for anything attached to a range of a [Paragraph].
abstract class SpecialFeature {
  /// Start of the range this feature applies to.
  final int start;

  /// End of the range this feature applies to.
  final int end;

  /// Const so that every concrete feature can offer a const constructor.
  const SpecialFeature({required this.start, required this.end});
}

/// A hyperlink pointing at [url].
class LinkFeature extends SpecialFeature {
  /// The link target.
  final String url;

  const LinkFeature({
    required super.start,
    required super.end,
    required this.url,
  });
}

/// An image loaded from [url].
class ImageFeature extends SpecialFeature {
  /// The image source.
  final String url;

  const ImageFeature({
    required super.start,
    required super.end,
    required this.url,
  });
}

/// A highlighted range carrying the highlighted text itself.
class HighlightFeature extends SpecialFeature {
  /// The text that is highlighted.
  final String highlightText;

  const HighlightFeature({
    required super.start,
    required super.end,
    required this.highlightText,
  });
}

/// Marks a range as bold.
class BoldFeature extends SpecialFeature {
  const BoldFeature({required super.start, required super.end});
}

/// Marks a range as italic.
class ItalicFeature extends SpecialFeature {
  const ItalicFeature({required super.start, required super.end});
}

/// Marks a range as a heading of the given [level].
class HeadingFeature extends SpecialFeature {
  /// The heading level.
  final int level;

  const HeadingFeature({
    required super.start,
    required super.end,
    required this.level,
  });
}


41 changes: 41 additions & 0 deletions lib/test_parser.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// ignore_for_file: avoid_print

import 'package:http/http.dart' as http;
import 'package:html/parser.dart' as html_parser;
import 'package:know_keeper/parsing/content_processor.dart';

import 'parsing/paragraph.dart';

/// Command-line helper: downloads the page at the URL given as the single
/// argument, runs it through [ContentProcessor] and prints each resulting
/// paragraph with its features.
///
/// Prints a usage line when the argument count is not exactly one, and the
/// HTTP status code when the response is not 200.
void main(List<String> args) async {
  if (args.length != 1) {
    print('Usage: dart run test_parser.dart <url>');
    return;
  }

  final response = await http.get(Uri.parse(args.single));
  if (response.statusCode != 200) {
    print('Failed to load URL: ${response.statusCode}');
    return;
  }

  // The page is parsed here and its body markup is handed to the processor
  // as a string.
  final pageBody = html_parser.parse(response.body).body!;
  final processor = ContentProcessor();

  for (final paragraph in processor.process(pageBody.innerHtml)) {
    print('Text: ${paragraph.text}');

    for (final feature in paragraph.features) {
      print(' Feature: ${feature.runtimeType}');
      print(' Start: ${feature.start}');
      print(' End: ${feature.end}');

      // Only link and image features carry a URL.
      if (feature is LinkFeature) {
        print(' URL: ${feature.url}');
      }
      if (feature is ImageFeature) {
        print(' URL: ${feature.url}');
      }
    }
  }
}
2 changes: 1 addition & 1 deletion lib/widgets/content_widget.dart
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class ContentWidgetState extends ConsumerState<ContentWidget> {
currentIndex = highlightEnd;
} else if (element is LinkInfo) {
spans.add(TextSpan(
text: text.substring(element.startIndex, element.endIndex),
text: text.substring(element.startIndex, min(element.endIndex, text.length)),
style: const TextStyle(color: Colors.blue, decoration: TextDecoration.underline),
recognizer: _getOrCreateLinkRecognizer(element.url),
));
Expand Down
Loading
Loading