From b15cd37f09a8bb2cf53ade999533b230b523f541 Mon Sep 17 00:00:00 2001
From: Jersonal
Date: Sun, 23 Mar 2025 09:51:23 +0100
Subject: [PATCH] Updated dependencies and improved content parsing

- Updated multiple dependencies to newer versions in `pubspec.lock`.
- Added a new parser `test_parser.dart`.
- Fixed a bug that caused crashes in `content_widget.dart`.
- Added a new file `content_processor.dart` with functionality to process content from html and remove blacklisted elements.
- Added a new file `paragraph.dart` with classes to represent text and content features.
- Updated `.gitignore` to ignore memory files.
---
 .gitignore                                    |   1 +
 lib/parsing/content_processor.dart            | 196 ++++++++++++++++++
 lib/parsing/paragraph.dart                    |  55 +++++
 lib/test_parser.dart                          |  43 ++++
 lib/widgets/content_widget.dart               |   2 +-
 pubspec.lock                                  |  74 ++++----
 .../golden_screenshots_decoration.dart        |   3 -
 7 files changed, 333 insertions(+), 41 deletions(-)
 create mode 100644 lib/parsing/content_processor.dart
 create mode 100644 lib/parsing/paragraph.dart
 create mode 100644 lib/test_parser.dart

diff --git a/.gitignore b/.gitignore
index 4f5f753..25ad6aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,3 +57,4 @@ app.*.map.json
 /android/fastlane/.env
 /test/screenshots/goldens/
 /android/fastlane/metadata/android/en-US/images/
+/:memory:
diff --git a/lib/parsing/content_processor.dart b/lib/parsing/content_processor.dart
new file mode 100644
index 0000000..28bc420
--- /dev/null
+++ b/lib/parsing/content_processor.dart
@@ -0,0 +1,196 @@
+import 'package:collection/collection.dart';
+import 'package:html/dom.dart' as dom;
+import 'package:html/parser.dart';
+
+import 'paragraph.dart';
+
+/// Converts an HTML document into a flat list of [Paragraph]s.
+///
+/// Elements with a blacklisted tag name, `class` or `id` are removed before
+/// the remaining nodes are converted.
+class ContentProcessor {
+  /// Tag names whose elements are removed together with their subtree.
+  final Set<String> _blacklistedTags = {
+    'header',
+    'footer',
+    'script',
+    'noscript',
+    'style',
+    'nav',
+    'button',
+    'aside',
+  };
+
+  /// Blacklisted attribute values keyed by attribute name.
+  ///
+  /// The complete attribute value is compared, not its individual tokens.
+  final Map<String, Set<String>> _blacklistedAttributes = {
+    'class': {
+      'sidebar',
+      'right-sidebar',
+      'post-footer',
+      'post-header',
+      'footer-widget',
+      'hide-on-button',
+      'highlight-and-share-wrapper',
+      'related-posts',
+      'adsbygoogle',
+      'has_twitter',
+      'social-icons',
+      'comments',
+      'foot section'
+    },
+    'id': {
+      'sidebar',
+      'right-sidebar',
+      'subscription-nudge',
+      'comments',
+      'right-sidebar-inner',
+      'related-posts',
+      'adsbygoogle',
+      'has-mastodon-prompt',
+      'cssnav',
+      'PageList1',
+    }
+  };
+
+  /// Parses [html], strips blacklisted elements and returns the remaining
+  /// content as paragraphs in document order.
+  List<Paragraph> process(String html) {
+    final document = parse(html);
+    _removeBlacklistedElements(document.body!);
+    final paragraphs = _parseElements(document.body!.nodes);
+
+    return paragraphs;
+  }
+
+  /// Recursively flattens [nodes] into paragraphs.
+  ///
+  /// Elements with more than one child, or with a non-text child, are
+  /// descended into; everything else becomes at most one [Paragraph].
+  List<Paragraph> _parseElements(List<dom.Node> nodes) {
+    final paragraphs = <Paragraph>[];
+
+    for (var node in nodes) {
+      if (node is dom.Element) {
+        if (node.nodes.length > 1 || node.nodes.firstWhereOrNull((n) => n is! dom.Text) != null) {
+          paragraphs.addAll(_parseElements(node.nodes));
+        } else if (node.localName == 'img') {
+          final url = node.attributes['src'] ?? '';
+          final features = [
+            ImageFeature(start:0, end: 0, url: url),
+          ];
+
+          paragraphs.add(Paragraph(
+            text: '',
+            features: features,
+          ));
+        } else if (node.text.trim().isNotEmpty) {
+          final text = node.text;
+          final features = _extractFeatures(node);
+
+          paragraphs.add(Paragraph(text: text, features: features));
+        }
+      } else if (node is dom.Text) {
+        if (node.text.trim().isNotEmpty) {
+          final text = node.text;
+          final features = <SpecialFeature>[];
+
+          paragraphs.add(Paragraph(text: text, features: features));
+        }
+        // Check if the node is an image
+
+      }
+    }
+
+    return paragraphs;
+  }
+
+  /// Removes every blacklisted descendant of [element] in place.
+  void _removeBlacklistedElements(dom.Element element) {
+    for (var child in element.nodes.toList()) {
+      if (child is dom.Element) {
+        if (_isBlacklisted(child)) {
+          element.nodes.remove(child);
+        } else {
+          _removeBlacklistedElements(child);
+        }
+      }
+    }
+  }
+
+  /// Whether [element] has a blacklisted tag name or attribute value.
+  bool _isBlacklisted(dom.Element element) {
+
+    if (_blacklistedTags.contains(element.localName)) {
+      return true;
+    }
+
+    for (var attribute in _blacklistedAttributes.entries) {
+      if (element.attributes[attribute.key] != null &&
+          attribute.value.contains(element.attributes[attribute.key])) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /// Collects link, bold, italic, heading and image features for [element].
+  ///
+  /// Offsets come from `indexOf` on the element text, so they refer to the
+  /// first occurrence of the matched text.
+  List<SpecialFeature> _extractFeatures(dom.Node element) {
+    final features = <SpecialFeature>[];
+
+
+    if (element is dom.Element) {
+      // Extract links
+      if (element.localName == 'a') {
+        final link = element;
+        final start = element.text.indexOf(link.text);
+        final end = start + link.text.length;
+        final url = link.attributes['href'] ?? '';
+
+        features.add(LinkFeature(start: start, end: end, url: url));
+      }
+
+      // Extract bold text
+      element.getElementsByTagName('b').forEach((bold) {
+        final start = element.text.indexOf(bold.text);
+        final end = start + bold.text.length;
+
+        features.add(BoldFeature(start: start, end: end));
+      });
+
+      // Extract italic text
+      element.getElementsByTagName('i').forEach((italic) {
+        final start = element.text.indexOf(italic.text);
+        final end = start + italic.text.length;
+
+        // Add ItalicFeature class to the SpecialFeature hierarchy
+        features.add(ItalicFeature(start: start, end: end));
+      });
+
+      // Extract all headings
+      for (var i = 1; i <= 6; i++) {
+        if (element.localName == 'h$i') {
+          final heading = element;
+          final start = element.text.indexOf(heading.text);
+          final end = start + heading.text.length;
+
+          features.add(HeadingFeature(start: start, end: end, level: i));
+        }
+      }
+
+      // Extract images
+      if (element.localName == 'img') {
+        final url = element.attributes['src'] ?? '';
+
+        features.add(ImageFeature(start: 0, end: 0, url: url));
+      }
+    }
+
+    return features;
+  }
+}
diff --git a/lib/parsing/paragraph.dart b/lib/parsing/paragraph.dart
new file mode 100644
index 0000000..7e5baf0
--- /dev/null
+++ b/lib/parsing/paragraph.dart
@@ -0,0 +1,55 @@
+/// A block of text together with the [SpecialFeature]s that apply to it.
+class Paragraph {
+  final String text;
+  final List<SpecialFeature> features;
+
+  Paragraph({required this.text, this.features = const []});
+}
+
+/// Base class for a feature covering the range from [start] to [end].
+abstract class SpecialFeature {
+  final int start;
+  final int end;
+
+  SpecialFeature({required this.start, required this.end});
+}
+
+/// A hyperlink pointing to [url].
+class LinkFeature extends SpecialFeature {
+  final String url;
+
+  LinkFeature({required super.start, required super.end, required this.url});
+}
+
+/// An image located at [url].
+class ImageFeature extends SpecialFeature {
+  final String url;
+
+  ImageFeature({required super.start, required super.end, required this.url});
+}
+
+/// A highlight carrying its [highlightText].
+class HighlightFeature extends SpecialFeature {
+  final String highlightText;
+
+  HighlightFeature({required super.start, required super.end, required this.highlightText});
+}
+
+/// Marks a range of text as bold.
+class BoldFeature extends SpecialFeature {
+  BoldFeature({required super.start, required super.end});
+}
+
+/// Marks a range of text as italic.
+class ItalicFeature extends SpecialFeature {
+  ItalicFeature({required super.start, required super.end});
+}
+
+/// Marks a range of text as a heading of the given [level].
+class HeadingFeature extends SpecialFeature {
+  final int level;
+
+  HeadingFeature({required super.start, required super.end, required this.level});
+}
+
+
diff --git a/lib/test_parser.dart b/lib/test_parser.dart
new file mode 100644
index 0000000..18adc0a
--- /dev/null
+++ b/lib/test_parser.dart
@@ -0,0 +1,43 @@
+// ignore_for_file: avoid_print
+
+import 'package:http/http.dart' as http;
+import 'package:html/parser.dart' as html_parser;
+import 'package:know_keeper/parsing/content_processor.dart';
+
+import 'parsing/paragraph.dart';
+
+/// Downloads the page at the URL passed as the only argument and prints the
+/// paragraphs and features that [ContentProcessor] extracts from it.
+void main(List<String> args) async {
+  if (args.length != 1) {
+    print('Usage: dart run test_parser.dart <url>');
+    return;
+  }
+
+  final url = args[0];
+  final response = await http.get(Uri.parse(url));
+
+  if (response.statusCode == 200) {
+    final html = response.body;
+    final document = html_parser.parse(html);
+    final contentProcessor = ContentProcessor();
+    final paragraphs = contentProcessor.process(document.body!.innerHtml);
+
+    for (var paragraph in paragraphs) {
+      print('Text: ${paragraph.text}');
+      for (var feature in paragraph.features) {
+        print(' Feature: ${feature.runtimeType}');
+        print(' Start: ${feature.start}');
+        print(' End: ${feature.end}');
+        if (feature is LinkFeature) {
+          print(' URL: ${feature.url}');
+        }
+        if (feature is ImageFeature) {
+          print(' URL: ${feature.url}');
+        }
+      }
+    }
+  } else {
+    print('Failed to load URL: ${response.statusCode}');
+  }
+}
diff --git a/lib/widgets/content_widget.dart b/lib/widgets/content_widget.dart
index d691f34..96728b6 100644
--- a/lib/widgets/content_widget.dart
+++ b/lib/widgets/content_widget.dart
@@ -137,7 +137,7 @@ class ContentWidgetState extends ConsumerState {
         currentIndex = highlightEnd;
       } else if (element is LinkInfo) {
         spans.add(TextSpan(
-          text: text.substring(element.startIndex, element.endIndex),
+          text: text.substring(element.startIndex, min(element.endIndex, text.length)),
           style: const TextStyle(color: Colors.blue, decoration: TextDecoration.underline),
           recognizer: _getOrCreateLinkRecognizer(element.url),
         ));
diff --git a/pubspec.lock b/pubspec.lock
index fc2e6e6..2631da3 100644
--- a/pubspec.lock
+++ b/pubspec.lock
@@ -29,10 +29,10 @@ packages:
     dependency: transitive
     description:
       name: async
-      sha256: "947bfcf187f74dbc5e146c9eb9c0f10c9f8b30743e341481c1e2ed3ecc18c20c"
+      sha256: d2872f9c19731c2e5f10444b14686eb7cc85c76274bd6c16e1816bff9a3bab63
       url: "https://pub.dev"
     source: hosted
-    version: "2.11.0"
+    version: "2.12.0"
   audio_session:
     dependency: transitive
     description:
@@ -53,10 +53,10 @@ packages:
     dependency: transitive
     description:
       name: boolean_selector
-      sha256: "6cfb5af12253eaf2b368f07bacc5a80d1301a071c73360d746b7f2e32d762c66"
+      sha256: "8aab1771e1243a5063b8b0ff68042d67334e3feab9e95b9490f9a6ebf73b42ea"
       url: "https://pub.dev"
     source: hosted
-    version: "2.1.1"
+    version: "2.1.2"
   cached_network_image:
     dependency: "direct main"
     description:
@@ -85,10 +85,10 @@ packages:
     dependency: transitive
     description:
       name: characters
-      sha256: "04a925763edad70e8443c99234dc3328f442e811f1d8fd1a72f1c8ad0f69a605"
+      sha256: f71061c654a3380576a52b451dd5532377954cf9dbd272a78fc8479606670803
       url: "https://pub.dev"
     source: hosted
-    version: "1.3.0"
+    version: "1.4.0"
   checked_yaml:
     dependency: transitive
     description:
@@ -117,18 +117,18 @@ packages:
     dependency: transitive
     description:
       name: clock
-      sha256: cb6d7f03e1de671e34607e909a7213e31d7752be4fb66a86d29fe1eb14bfb5cf
+      sha256: fddb70d9b5277016c77a80201021d40a2247104d9f4aa7bab7157b7e3f05b84b
       url: "https://pub.dev"
     source: hosted
-    version: "1.1.1"
+    version: "1.1.2"
   collection:
     dependency: "direct main"
     description:
       name: collection
-      sha256: a1ace0a119f20aabc852d165077c036cd864315bd99b7eaa10a60100341941bf
+      sha256: "2f5709ae4d3d59dd8f7cd309b4e023046b57d8a6c82130785d2b0e5868084e76"
       url: "https://pub.dev"
     source: hosted
-    version: "1.19.0"
+    version: "1.19.1"
   convert:
     dependency: transitive
     description:
@@ -221,10 +221,10 @@ packages:
     dependency: transitive
     description:
       name: fake_async
-      sha256: "511392330127add0b769b75a987850d136345d9227c6b94c96a04cf4a391bf78"
+      sha256: "6a95e56b2449df2273fd8c45a662d6947ce1ebb7aafe80e550a3f68297f3cacc"
       url: "https://pub.dev"
     source: hosted
-    version: "1.3.1"
+    version: "1.3.2"
   favicon:
     dependency: "direct main"
     description:
@@ -492,18 +492,18 @@ packages:
     dependency: transitive
     description:
       name: leak_tracker
-      sha256: "7bb2830ebd849694d1ec25bf1f44582d6ac531a57a365a803a6034ff751d2d06"
+      sha256: c35baad643ba394b40aac41080300150a4f08fd0fd6a10378f8f7c6bc161acec
       url: "https://pub.dev"
     source: hosted
-    version: "10.0.7"
+    version: "10.0.8"
   leak_tracker_flutter_testing:
     dependency: transitive
     description:
       name: leak_tracker_flutter_testing
-      sha256: "9491a714cca3667b60b5c420da8217e6de0d1ba7a5ec322fab01758f6998f379"
+      sha256: f8b613e7e6a13ec79cfdc0e97638fddb3ab848452eff057653abd3edba760573
       url: "https://pub.dev"
     source: hosted
-    version: "3.0.8"
+    version: "3.0.9"
   leak_tracker_testing:
     dependency: transitive
     description:
@@ -540,10 +540,10 @@ packages:
     dependency: transitive
     description:
       name: matcher
-      sha256: d2323aa2060500f906aa31a895b4030b6da3ebdcc5619d14ce1aada65cd161cb
+      sha256: dc58c723c3c24bf8d3e2d3ad3f2f9d7bd9cf43ec6feaa64181775e60190153f2
       url: "https://pub.dev"
     source: hosted
-    version: "0.12.16+1"
+    version: "0.12.17"
   material_color_utilities:
     dependency: transitive
     description:
@@ -556,10 +556,10 @@ packages:
     dependency: transitive
     description:
       name: meta
-      sha256: bdb68674043280c3428e9ec998512fb681678676b3c54e773629ffe74419f8c7
+      sha256: e3641ec5d63ebf0d9b41bd43201a66e3fc79a65db5f61fc181f04cd27aab950c
       url: "https://pub.dev"
     source: hosted
-    version: "1.15.0"
+    version: "1.16.0"
   mime:
     dependency: transitive
     description:
@@ -604,10 +604,10 @@ packages:
     dependency: "direct main"
     description:
       name: path
-      sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af"
+      sha256: "75cca69d1490965be98c73ceaea117e8a04dd21217b37b292c9ddbec0d955bc5"
       url: "https://pub.dev"
     source: hosted
-    version: "1.9.0"
+    version: "1.9.1"
   path_parsing:
     dependency: transitive
     description:
@@ -825,10 +825,10 @@ packages:
     dependency: transitive
     description:
       name: source_span
-      sha256: "53e943d4206a5e30df338fd4c6e7a077e02254531b138a15aec3bd143c1a8b3c"
+      sha256: "254ee5351d6cb365c859e20ee823c3bb479bf4a293c22d17a9f1bf144ce86f7c"
       url: "https://pub.dev"
     source: hosted
-    version: "1.10.0"
+    version: "1.10.1"
   sprintf:
     dependency: transitive
     description:
@@ -881,10 +881,10 @@ packages:
     dependency: transitive
     description:
       name: stack_trace
-      sha256: "9f47fd3630d76be3ab26f0ee06d213679aa425996925ff3feffdec504931c377"
+      sha256: "8b27215b45d22309b5cddda1aa2b19bdfec9df0e765f2de506401c071d38d1b1"
       url: "https://pub.dev"
     source: hosted
-    version: "1.12.0"
+    version: "1.12.1"
   state_notifier:
     dependency: transitive
     description:
@@ -897,18 +897,18 @@ packages:
     dependency: transitive
     description:
       name: stream_channel
-      sha256: ba2aa5d8cc609d96bbb2899c28934f9e1af5cddbd60a827822ea467161eb54e7
+      sha256: "969e04c80b8bcdf826f8f16579c7b14d780458bd97f56d107d3950fdbeef059d"
       url: "https://pub.dev"
     source: hosted
-    version: "2.1.2"
+    version: "2.1.4"
   string_scanner:
     dependency: transitive
     description:
       name: string_scanner
-      sha256: "688af5ed3402a4bde5b3a6c15fd768dbf2621a614950b17f04626c431ab3c4c3"
+      sha256: "921cd31725b72fe181906c6a94d987c78e3b98c2e205b397ea399d4054872b43"
       url: "https://pub.dev"
     source: hosted
-    version: "1.3.0"
+    version: "1.4.1"
   synchronized:
     dependency: transitive
     description:
@@ -921,18 +921,18 @@ packages:
     dependency: transitive
     description:
       name: term_glyph
-      sha256: a29248a84fbb7c79282b40b8c72a1209db169a2e0542bce341da992fe1bc7e84
+      sha256: "7f554798625ea768a7518313e58f83891c7f5024f88e46e7182a4558850a4b8e"
       url: "https://pub.dev"
     source: hosted
-    version: "1.2.1"
+    version: "1.2.2"
   test_api:
     dependency: transitive
     description:
       name: test_api
-      sha256: "664d3a9a64782fcdeb83ce9c6b39e78fd2971d4e37827b9b06c3aa1edc5e760c"
+      sha256: fb31f383e2ee25fbbfe06b40fe21e1e458d14080e3c67e7ba0acfde4df4e0bbd
       url: "https://pub.dev"
     source: hosted
-    version: "0.7.3"
+    version: "0.7.4"
   typed_data:
     dependency: transitive
     description:
@@ -1089,10 +1089,10 @@ packages:
     dependency: transitive
     description:
       name: vm_service
-      sha256: f6be3ed8bd01289b34d679c2b62226f63c0e69f9fd2e50a6b3c1c729a961041b
+      sha256: "0968250880a6c5fe7edc067ed0a13d4bae1577fe2771dcf3010d52c4a9d3ca14"
       url: "https://pub.dev"
     source: hosted
-    version: "14.3.0"
+    version: "14.3.1"
   wakelock_plus:
     dependency: transitive
     description:
@@ -1182,5 +1182,5 @@ packages:
     source: hosted
     version: "3.1.3"
 sdks:
-  dart: ">=3.6.0 <4.0.0"
+  dart: ">=3.7.0-0 <4.0.0"
   flutter: ">=3.27.0"
diff --git a/test/screenshots/golden_screenshots_decoration.dart b/test/screenshots/golden_screenshots_decoration.dart
index 2827ce2..0837e7d 100644
--- a/test/screenshots/golden_screenshots_decoration.dart
+++ b/test/screenshots/golden_screenshots_decoration.dart
@@ -34,9 +34,6 @@ void main() {
       tester.view.physicalSize = deviceInfo.size;
       tester.view.devicePixelRatio = deviceInfo.devicePixelRatio;
 
-      print("Size: ${tester.view.physicalSize}");
-      print("Pixel ratio: ${tester.view.devicePixelRatio}");
-
       await tester.pumpWidget(
         MaterialApp(
           home: ScreenshotWrapper(