diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java
index 0a9b257948..de2842fab6 100644
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java
@@ -79,6 +79,11 @@ public final class YoutubeParsingHelper {
     private YoutubeParsingHelper() {
     }
 
+    /**
+     * The base URL for plain YouTube.
+     */
+    public static final String YOUTUBE_BASE = "https://www.youtube.com/";
+
     /**
      * The base URL of requests of the {@code WEB} clients to the InnerTube internal API.
      */
@@ -212,6 +217,11 @@ private YoutubeParsingHelper() {
     private static final String CONTENT_PLAYBACK_NONCE_ALPHABET =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
 
+    /**
+     * Regex for extracting any JSON array.
+     */
+    private static final String JSON_ARRAY = "\\[.*\\]";
+
     /**
      * The device machine id for the iPhone 15 Pro Max,
      * used to get 60fps with the {@code iOS} client.
@@ -322,6 +332,42 @@ public static String randomVisitorData(final ContentCountry country) {
         return pb.toUrlencodedBase64();
     }
 
+    /**
+     * Requests and parses out the visitor data from the sw.js_data YT endpoint.
+     * This function does not parse it into a programmatic form, just returns the encoded string.
+     * Useful for passing into API requests which require visitorData to work.
+     * The function currently uses very brittle extraction logic.
+     * Likely to fail with future changes.
+     *
+     * @return extracted encoded visitor data string, never {@code null} or empty
+     * @throws ParsingException if the format of data is no longer a JSON array, or if no
+     *                          visitor data is found at the expected position
+     * @throws IOException when it cannot fetch the API data
+     * @throws ReCaptchaException when it cannot fetch the API data
+     */
+    public static String extractVisitorData()
+            throws ParsingException, IOException, ReCaptchaException {
+        final String url = YOUTUBE_BASE + "sw.js_data";
+        final var headers = getOriginReferrerHeaders(YOUTUBE_BASE);
+        final String response = getDownloader().get(url, headers).responseBody();
+        final JsonArray jsonArray = JsonUtils.toJsonArray(
+                Parser.matchGroup(JSON_ARRAY, response, 0));
+        // Got this particular extraction logic by finding where the visitor data
+        // lives through comparison. If the structure changes this is likely to fail.
+        final String visitorData = jsonArray
+                .getArray(0)
+                .getArray(2)
+                .getArray(0)
+                .getArray(0)
+                .getString(13);
+        // The lenient JsonArray getters do not throw on missing or mistyped entries, so a
+        // changed structure has to be detected here for callers to be able to fall back.
+        if (visitorData == null || visitorData.isEmpty()) {
+            throw new ParsingException("Could not find visitor data in sw.js_data response");
+        }
+        return visitorData;
+    }
+
     /**
      * Parses the duration string of the video expecting ":" or "." as separators
      *
@@ -1264,6 +1310,16 @@ public static JsonBuilder prepareAndroidMobileJsonBuilder(
     public static JsonBuilder prepareIosMobileJsonBuilder(
             @Nonnull final Localization localization,
             @Nonnull final ContentCountry contentCountry) {
+
+        // Try to extract the visitor data from the sw.js_data API, but otherwise
+        // fall back to randomly generating the visitor data.
+        String visitorData;
+        try {
+            visitorData = extractVisitorData();
+        } catch (ParsingException | IOException | ReCaptchaException ignored) {
+            visitorData = randomVisitorData(contentCountry);
+        }
+
         // @formatter:off
         return JsonObject.builder()
                 .object("context")
@@ -1276,7 +1332,7 @@ public static JsonBuilder prepareIosMobileJsonBuilder(
                         .value("platform", "MOBILE")
                         .value("osName", "iOS")
                         .value("osVersion", IOS_OS_VERSION)
-                        .value("visitorData", randomVisitorData(contentCountry))
+                        .value("visitorData", visitorData)
                         .value("hl", localization.getLocalizationCode())
                         .value("gl", contentCountry.getCountryCode())
                         .value("utcOffsetMinutes", 0)