From 3506d2b31632419516d4267e266d7215f6f02a6f Mon Sep 17 00:00:00 2001 From: Ryan Hirsch Date: Mon, 29 Dec 2025 20:00:00 -0600 Subject: [PATCH 1/2] update parser version --- package.json | 2 +- src/parser/index.ts | 2 ++ src/parser/xml-parser.ts | 24 ++++++++++++------------ yarn.lock | 15 +++++++++++---- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/package.json b/package.json index 4c59bf5..60c379a 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,7 @@ }, "dependencies": { "dotenv": "^10.0.0", - "fast-xml-parser": "^3.18.0", + "fast-xml-parser": "^5.3.3", "he": "^1.2.0", "node-fetch": "^2.6.1", "pino": "^7.0.5", diff --git a/src/parser/index.ts b/src/parser/index.ts index d353a38..73ad131 100644 --- a/src/parser/index.ts +++ b/src/parser/index.ts @@ -10,6 +10,8 @@ import { ParserOptions, unifiedParser } from "./unified"; import { parse, validate } from "./xml-parser"; import { FeedObject, FeedType } from "./types"; +export type { FeedObject } from "./types"; + export function parseFeed(xml: string, options?: ParserOptions): FeedObject | null { const parsedContent = validate(xml.trim()); if (parsedContent === true) { diff --git a/src/parser/xml-parser.ts b/src/parser/xml-parser.ts index 3907b13..bf52506 100644 --- a/src/parser/xml-parser.ts +++ b/src/parser/xml-parser.ts @@ -1,31 +1,31 @@ -import parser, { ValidationError } from "fast-xml-parser"; +import { XMLParser, XMLValidator, ValidationError } from "fast-xml-parser"; import he from "he"; import { XmlNode } from "./types"; const parserOptions = { attributeNamePrefix: "@_", - attrNodeName: "attr", // default is 'false' + attributesGroupName: "attr", textNodeName: "#text", ignoreAttributes: false, - ignoreNameSpace: false, + removeNSPrefix: false, allowBooleanAttributes: false, - parseNodeValue: true, + parseTagValue: true, parseAttributeValue: false, trimValues: true, - // cdataTagName: "__cdata", //default is 'false' - // cdataPositionChar: "\\c", - parseTrueNumberOnly: false, - 
arrayMode: false, // "strict" - tagValueProcessor: (val: string) => he.decode(val), // default is a=>a - stopNodes: ["parse-me-as-string"], + tagValueProcessor: (_tagName: string, tagValue: string) => he.decode(tagValue), + stopNodes: ["*.parse-me-as-string"], }; +const xmlParser = new XMLParser(parserOptions); + export function validate(xml: string): true | ValidationError { - return parser.validate(xml.trim()); + return XMLValidator.validate(xml.trim()); } export function parse(xml: string): XmlNode { // eslint-disable-next-line @typescript-eslint/no-unsafe-return - return parser.parse(xml.trim(), parserOptions); + return xmlParser.parse(xml.trim()); } + +export type { ValidationError }; diff --git a/yarn.lock b/yarn.lock index b8496d8..ea01061 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2361,10 +2361,12 @@ fast-safe-stringify@^2.0.8: resolved "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz" integrity sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA== -fast-xml-parser@^3.18.0: - version "3.18.0" - resolved "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-3.18.0.tgz" - integrity sha512-tRrwShhppv0K5GKEtuVs92W0VGDaVltZAwtHbpjNF+JOT7cjIFySBGTEOmdBslXYyWYaZwEX/g4Su8ZeKg0LKQ== +fast-xml-parser@^5.3.3: + version "5.3.3" + resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-5.3.3.tgz#84b678e44eb81207c8585795152b4b1c94738b4d" + integrity sha512-2O3dkPAAC6JavuMm8+4+pgTk+5hoAs+CjZ+sWcQLkX9+/tHRuTkQh/Oaifr8qDmZ8iEHb771Ea6G8CdwkrgvYA== + dependencies: + strnum "^2.1.0" fastify-warning@^0.2.0: version "0.2.0" @@ -4945,6 +4947,11 @@ strip-json-comments@^3.1.0, strip-json-comments@^3.1.1: resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz" integrity sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig== +strnum@^2.1.0: + version "2.1.2" + resolved 
"https://registry.yarnpkg.com/strnum/-/strnum-2.1.2.tgz#a5e00ba66ab25f9cafa3726b567ce7a49170937a" + integrity sha512-l63NF9y/cLROq/yqKXSLtcMeeyOfnSQlfMSlzFt/K73oIaD8DGaQWd7Z34X9GPiKqP5rbSh84Hl4bOlLcjiSrQ== + supports-color@^5.3.0: version "5.5.0" resolved "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz" From 52a911c3edeb0a03b81ea79ceb779fc2338b77a2 Mon Sep 17 00:00:00 2001 From: Ryan Hirsch Date: Mon, 29 Dec 2025 20:00:29 -0600 Subject: [PATCH 2/2] update readme and dev validation script --- README.md | 27 +++++ src/dev.ts | 295 ++++++++++++++++++++++++++--------------------------- 2 files changed, 172 insertions(+), 150 deletions(-) diff --git a/README.md b/README.md index 2206965..c8b980e 100644 --- a/README.md +++ b/README.md @@ -97,3 +97,30 @@ The sample feeds below were chosen for their varied nature. Including things lik ## Development Update dependencies (person enum and valid license list) via `yarn deps` or `npm run deps`. + +### Testing the Parser with Live Feeds + +Use the dev script to quickly test the parser against live podcast feeds: + +```sh +# Parse a specific feed URL +yarn dev http://mp3s.nashownotes.com/pc20rss.xml + +# Parse and save the raw XML and parsed JSON to disk +yarn dev --save http://mp3s.nashownotes.com/pc20rss.xml + +# Parse all feeds in the predefined list +yarn dev --all + +# Fetch and parse latest feeds from Podcast Index API +# (requires PI_API_KEY and PI_API_SECRET environment variables) +yarn dev --latest +``` + +The dev script will output: +- The byte count of the fetched feed +- Success/failure status +- Episode count and Podcasting 2.0 namespace support summary +- Full parsed JSON output + +To add more feeds to the predefined list, edit the `feeds` array in `src/dev.ts`. 
diff --git a/src/dev.ts b/src/dev.ts index 086fbce..7c707a0 100644 --- a/src/dev.ts +++ b/src/dev.ts @@ -1,202 +1,187 @@ /* eslint-disable no-console */ /* eslint-disable @typescript-eslint/no-explicit-any */ /* eslint-disable import/no-extraneous-dependencies */ +/* eslint-disable no-restricted-syntax */ +/* eslint-disable no-await-in-loop */ import * as fs from "fs"; import * as path from "path"; import crypto from "crypto"; import fetch from "node-fetch"; -import invariant from "tiny-invariant"; -import stringify from "fast-json-stable-stringify"; import { logger } from "./logger"; -import { parseFeed } from "./parser"; -// import { checkFeedByUri } from "./cor"; +import { parseFeed, FeedObject } from "./parser"; import { getFeedText } from "./shared"; const feeds: Array<{ name: string; url: string }> = [ - // { name: "Podcasting 2.0", url: "http://mp3s.nashownotes.com/pc20rss.xml" }, - // // Recent soundbites + { name: "Podcasting 2.0", url: "http://mp3s.nashownotes.com/pc20rss.xml" }, // { name: "Golden Nuggets", url: "https://feeds.buzzsprout.com/1293872.rss" }, - // // Value Block // { name: "PodClock", url: "https://podnews.net/clock-rss" }, - // // Location // { name: "That's all I got", url: "https://kevinbae.com/feed/podcast" }, - // // nested categories - // { name: "livetpajorden", url: "https://rss.acast.com/http-acast.com-acast.com-livetpajorden" }, - // // Example - // { - // name: "Local Example", - // url: `file://${path.resolve(__dirname, "parser/__test__/fixtures/example.xml")}`, - // }, - // { - // name: "Animated No Agenda", - // url: "https://noagendatube.com/feeds/videos.xml?videoChannelId=73&format=podcast", - // }, - // { - // name: "Chad Hartman", - // url: - // "https://www.omnycontent.com/d/playlist/4b5f9d6d-9214-48cb-8455-a73200038129/1c3da022-ec7e-4a7f-91dc-a78e00380a9d/44fb8925-fb04-4712-903e-a78e00380aa1/podcast.rss", - // }, - // { name: "Channel History Hit", url: "http://rss.acast.com/channelhistoryhit" }, - // { name: 
"Chaosradio", url: "https://chaosradio.de/feed/m4a" }, - // { name: "Chaosradio Freiburg", url: "https://rdl.de/podcast/all/35236" }, - // { - // name: "Cheat Sheet Podcast from The Daily Beast", - // url: "https://rss.acast.com/cheat-sheet-podcast", - // }, - // { name: "Chompers", url: "https://feeds.megaphone.fm/chompers" }, - // { - // name: 'Chris Walker - "The Innerwealth Podcast"', - // url: "http://feeds.soundcloud.com/users/soundcloud:users:2321682/sounds.rss", - // }, - // { name: "Christopher Walch – SDWT", url: "https://anchor.fm/s/5720588/podcast/rss" }, - // { name: "Clark Film", url: "https://clarkfilm.libsyn.com/rss" }, - // { - // name: "Cliffo and Gabi - Hit Queensland", - // url: - // "https://www.omnycontent.com/d/playlist/566281f8-200e-4c9f-8378-a4870055423b/dbd79469-2427-4cb3-bd47-a636000b0b05/984b7b50-dc38-4e9b-b153-a636000b985b/podcast.rss", - // }, - // { name: "Coach's Fitness Journey", url: "https://anchor.fm/s/e5ef6d8/podcast/rss" }, - // { name: "CoinDesk Podcast Network", url: "https://rss.art19.com/late-confirmation" }, - // { name: "CommSec", url: "https://commsecpodcasts.podbean.com/feed.xml" }, - // { - // name: "Confidence & Self Esteem Podcast", - // url: "https://www.spreaker.com/show/3128218/episodes/feed", - // }, - // { - // name: "Connect FM Podcasts", - // url: "http://feeds.soundcloud.com/users/soundcloud:users:79112501/sounds.rss", - // }, - // { - // name: "Conservative Business Journal Podcast", - // url: "https://conservativebusinessjournal.libsyn.com/rss", - // }, - // { name: "Cooper And Anthony Show", url: "https://anchor.fm/s/245b3618/podcast/rss" }, - // { - // name: "Costa Rica Pura Vida Lifestyle Podcast", - // url: "https://anchor.fm/s/1347e704/podcast/rss", - // }, - // { - // name: "Crime Stories with Nancy Grace", - // url: "https://rss.art19.com/crime-stories-with-nancy-grace", - // }, - // { name: "Gabfest", url: "https://gabfest.wordpress.com/feed/atom/" }, - // { name: "The Her Freedom Podcast", url: 
"http://herfreedomaudio.blogspot.de/atom.xml" }, - // { name: "Causality", url: "https://engineered.network/causality/feed/index.xml" }, - // { name: "Dudes and Dads", url: "https://feeds.podcastmirror.com/dudesanddadspodcast" }, ]; export async function checkAll(): Promise { for (let i = 0; i < feeds.length; i += 1) { const { name, url } = feeds[i]; logger.info(`Parsing ${name}: ${url}`); - // eslint-disable-next-line no-await-in-loop - await getFeed(url); + await getFeed(url, { save: false }); } } -function save({ - relativePath, - data, - parser, -}: { - relativePath: string; - data: T; - parser: (d: T) => string; -}): Promise { - const filePath = path.resolve(__dirname, relativePath); - logger.info(`Save ${filePath}`); - return new Promise((resolve, reject) => - fs.writeFile(filePath, parser(data), (err) => { - if (err) { - reject(err); - } else { - resolve(undefined); - } - }) - ); +interface GetFeedOptions { + save?: boolean; + verbose?: boolean; } const urlToFile: Array<{ uri: string; uriHash: string; title: string; parsed: string }> = []; -async function getFeed(uri: string): Promise { - const uriHash = crypto.createHash("md5").update(uri).digest("hex"); + +async function getFeed(uri: string, options: GetFeedOptions = {}): Promise { + const { save: shouldSave = false, verbose = true } = options; const xml = await getFeedText(uri); - const xmlSave = save({ - relativePath: `../raw/${uriHash}.txt`, - data: xml, - parser: (x) => x, - }); + + if (verbose) { + console.log(`\n📡 Fetched ${xml.length} bytes from ${uri}\n`); + } const feedObject = parseFeed(xml); if (!feedObject) { - logger.warn(`Failed to parse xml from ${uri}`); + console.error(`❌ Failed to parse xml from ${uri}`); return; } + if (verbose) { + console.log(`✅ Successfully parsed feed: "${feedObject.title}"`); + console.log(` Episodes: ${feedObject.items?.length ?? 
0}`); + console.log(` PC2.0 Support: ${JSON.stringify(feedObject.pc20support)}`); + console.log(""); + console.log(JSON.stringify(feedObject, null, 2)); + } + + if (shouldSave) { + saveFeedData(uri, xml, feedObject); + } +} + +function saveFeedData(uri: string, xml: string, feedObject: FeedObject): void { + const uriHash = crypto.createHash("md5").update(uri).digest("hex"); + + // Ensure directories exist + const rawDir = path.resolve(__dirname, "../raw"); + const resultsDir = path.resolve(__dirname, "../results"); + + if (!fs.existsSync(rawDir)) { + fs.mkdirSync(rawDir, { recursive: true }); + } + if (!fs.existsSync(resultsDir)) { + fs.mkdirSync(resultsDir, { recursive: true }); + } + + // Save raw XML + const xmlPath = path.resolve(rawDir, `${uriHash}.txt`); + fs.writeFileSync(xmlPath, xml); + logger.info(`Saved raw XML to ${xmlPath}`); + + // Save parsed JSON + const parsedFilename = `${feedObject.title + .toLowerCase() + .replace(/'/g, "") + .replace(/\W+/g, "-")}.json`; + const parsedPath = path.resolve(resultsDir, parsedFilename); + fs.writeFileSync(parsedPath, JSON.stringify({ ...feedObject, url: uri }, null, 2)); + logger.info(`Saved parsed feed to ${parsedPath}`); + + // Update list urlToFile.push({ uri, uriHash, title: feedObject.title, - parsed: `${feedObject.title.toLowerCase().replace(/'/g, "").replace(/\W+/g, "-")}.json`, + parsed: parsedFilename, }); - const listSave = save({ - relativePath: `../raw/list.json`, - data: urlToFile, - parser: (list) => JSON.stringify(list, null, 2), - }); - - const parsed = path.resolve( - __dirname, - "..", - `results/${feedObject.title.toLowerCase().replace(/'/g, "").replace(/\W+/g, "-")}.json` - ); - logger.info(`Parsed feed object ${parsed}`); - fs.writeFileSync(parsed, stringify({ ...feedObject, url: uri })); - await Promise.all([xmlSave, listSave]); - - // const corsSupport = await checkFeedByUri(uri); - // logger.info(corsSupport); - - // eslint-disable-next-line no-underscore-dangle, 
@typescript-eslint/no-unsafe-member-access - logger.info(feedObject.pc20support); + const listPath = path.resolve(rawDir, "list.json"); + fs.writeFileSync(listPath, JSON.stringify(urlToFile, null, 2)); } -if (process.argv[2] === "--latest") { - runPromise( - fetch(`https://api.podcastindex.org/api/1.0/recent/feeds?max=10`, { - headers: getHeaders(), - }) - .then((resp) => resp.json()) - // eslint-disable-next-line @typescript-eslint/no-unsafe-return, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access - .then((json) => json.feeds.map((x: { url: string }) => x.url)) - .then((feedUrls: Array) => Promise.all(feedUrls.map((x) => getFeed(x)))) - ); -} else if (process.argv[2]) { - runPromise(getFeed(process.argv[2])); +function printUsage(): void { + console.log(` +Usage: yarn dev [options] [url] + +Options: + --latest Fetch and parse latest feeds from Podcast Index API + (requires PI_API_KEY and PI_API_SECRET env vars) + --save Save raw XML and parsed JSON to disk + --all Parse all feeds in the predefined list + Parse a specific feed URL + +Examples: + yarn dev http://mp3s.nashownotes.com/pc20rss.xml + yarn dev --save http://mp3s.nashownotes.com/pc20rss.xml + yarn dev --all + yarn dev --latest +`); } -function runPromise(prom: Promise): void { - prom - .then( - () => logger.info("done"), - (err) => logger.error(err) - ) - .finally(() => process.exit()); +async function main(): Promise { + const args = process.argv.slice(2); + + if (args.length === 0) { + printUsage(); + return; + } + + const shouldSave = args.includes("--save"); + const filteredArgs = args.filter((arg) => arg !== "--save"); + + if (filteredArgs.includes("--latest")) { + await runLatest(shouldSave); + } else if (filteredArgs.includes("--all")) { + for (const feed of feeds) { + console.log(`\n${"=".repeat(60)}`); + console.log(`Parsing: ${feed.name}`); + console.log(`${"=".repeat(60)}`); + await getFeed(feed.url, { save: shouldSave }); + } + } else if (filteredArgs.length > 0) { 
+ const url = filteredArgs.find((arg) => !arg.startsWith("--")); + if (url) { + await getFeed(url, { save: shouldSave }); + } else { + printUsage(); + } + } else { + printUsage(); + } } -// eslint-disable-next-line @typescript-eslint/explicit-function-return-type -function getHeaders() { +async function runLatest(shouldSave: boolean): Promise { const key = process.env.PI_API_KEY; const secret = process.env.PI_API_SECRET; - invariant(key); - invariant(secret); + + if (!key || !secret) { + console.error( + "❌ PI_API_KEY and PI_API_SECRET environment variables are required for --latest" + ); + console.error(" Set them in your environment or .env file"); + process.exit(1); + } + + const response = await fetch(`https://api.podcastindex.org/api/1.0/recent/feeds?max=10`, { + headers: getHeaders(key, secret), + }); + const json = (await response.json()) as { feeds: Array<{ url: string; title: string }> }; + + for (const feed of json.feeds) { + console.log(`\n${"=".repeat(60)}`); + console.log(`Parsing: ${feed.title}`); + console.log(`${"=".repeat(60)}`); + await getFeed(feed.url, { save: shouldSave }); + } +} + +function getHeaders(key: string, secret: string): Record { const apiHeaderTime = Math.floor(Date.now() / 1000); - const sha1Algorithm = "sha1"; - const sha1Hash = crypto.createHash(sha1Algorithm); - const data4Hash = `${key}${secret}${apiHeaderTime}`; - sha1Hash.update(data4Hash); + const sha1Hash = crypto.createHash("sha1"); + sha1Hash.update(`${key}${secret}${apiHeaderTime}`); const hash4Header = sha1Hash.digest("hex"); return { @@ -204,6 +189,16 @@ function getHeaders() { "X-Auth-Date": `${apiHeaderTime}`, "X-Auth-Key": key, Authorization: hash4Header, - "User-Agent": `custom/1.0.0`, + "User-Agent": `partytime/dev`, }; } + +main() + .then(() => { + console.log("\n✅ Done"); + process.exit(0); + }) + .catch((err) => { + console.error("\n❌ Error:", err); + process.exit(1); + });