From 950bf28289820b1f8317da949a1d6a09ed323cf5 Mon Sep 17 00:00:00 2001 From: Imran Remtulla Date: Sun, 27 Apr 2025 00:10:06 -0400 Subject: [PATCH] =?UTF-8?q?Even=20more=20flexibility=20in=20the=20HTML=20s?= =?UTF-8?q?ource=20=E2=80=94=20JSON=20string=20extraction=20fallback=20(#2?= =?UTF-8?q?262)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/app_sources/html.dart | 51 ++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/lib/app_sources/html.dart b/lib/app_sources/html.dart index 6c8269c..af26cec 100644 --- a/lib/app_sources/html.dart +++ b/lib/app_sources/html.dart @@ -1,3 +1,5 @@ +import 'dart:convert'; + import 'package:easy_localization/easy_localization.dart'; import 'package:html/parser.dart'; import 'package:http/http.dart'; @@ -67,6 +69,27 @@ int compareAlphaNumeric(String a, String b) { return aParts.length.compareTo(bParts.length); } +List collectAllStringsFromJSONObject(dynamic obj) { + List extractor(dynamic obj) { + final results = []; + if (obj is String) { + results.add(obj); + } else if (obj is List) { + for (final item in obj) { + results.addAll(extractor(item)); + } + } else if (obj is Map) { + for (final value in obj.values) { + results.addAll(extractor(value)); + } + } + + return results; + } + + return extractor(obj); +} + List _splitAlphaNumeric(String s) { List parts = []; StringBuffer sb = StringBuffer(); @@ -95,6 +118,13 @@ bool _isNumeric(String s) { return s.codeUnitAt(0) >= 48 && s.codeUnitAt(0) <= 57; } +List> getLinksInLines(String lines) => RegExp( + r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?') + .allMatches(lines) + .map((match) => + MapEntry(match.group(0)!, match.group(0)?.split('/').last ?? '')) + .toList(); + // Given an HTTP response, grab some links according to the common additional settings // (those that apply to intermediate and final steps) Future>> grabLinksCommon( @@ -114,12 +144,21 @@ Future>> grabLinksCommon( .map((e) => MapEntry(ensureAbsoluteUrl(e.key, res.request!.url), e.value)) .toList(); if (allLinks.isEmpty) { - allLinks = RegExp( - r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?') - .allMatches(res.body) - .map((match) => - MapEntry(match.group(0)!, match.group(0)?.split('/').last ?? '')) - .toList(); + allLinks = getLinksInLines(res.body); + } + if (allLinks.isEmpty) { + // Getting desperate + try { + var jsonStrings = collectAllStringsFromJSONObject(jsonDecode(res.body)); + allLinks = getLinksInLines(jsonStrings.join('\n')); + if (allLinks.isEmpty) { + allLinks = getLinksInLines(jsonStrings.map((l) { + return ensureAbsoluteUrl(l, res.request!.url); + }).join('\n')); + } + } catch (e) { + // + } } List> links = []; bool skipSort = additionalSettings['skipSort'] == true;