From 4c351465b3910948a5f2e27e69dfe8a59d3c25c4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:11:05 +0000
Subject: [PATCH 1/9] Initial plan
From e0477aba2ecb9f398e4e5ba2fba2af86774edd12 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:20:22 +0000
Subject: [PATCH 2/9] Add URL crawling functionality with comprehensive tests
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.integration.test.js | 200 +++++++++++++
src/crawler.js | 209 ++++++++++++++
src/crawler.test.js | 491 ++++++++++++++++++++++++++++++++
src/utils.js | 52 ++++
4 files changed, 952 insertions(+)
create mode 100644 src/crawler.integration.test.js
create mode 100644 src/crawler.js
create mode 100644 src/crawler.test.js
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
new file mode 100644
index 0000000..d6f8797
--- /dev/null
+++ b/src/crawler.integration.test.js
@@ -0,0 +1,200 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+before(async function () {
+ const { expect } = await import("chai");
+ global.expect = expect;
+});
+
+describe("crawler integration", function () {
+ let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub;
+
+ beforeEach(function () {
+ axiosStub = {
+ get: sinon.stub(),
+ };
+
+ fsStub = {
+ statSync: sinon.stub(),
+ readdirSync: sinon.stub(),
+ existsSync: sinon.stub(),
+ mkdirSync: sinon.stub(),
+ writeFileSync: sinon.stub(),
+ };
+
+ logStub = sinon.stub();
+ crawlUrlsStub = sinon.stub();
+ readFileStub = sinon.stub().resolves({});
+
+ // Mock fetchFile behavior
+ axiosStub.get.callsFake(async (url) => {
+ if (url === "https://example.com/page1") {
+ return {
 data: '<a href="https://example.com/page2">Link</a>',
+ };
+ } else if (url === "https://example.com/page2") {
+ return { data: "Content" };
+ }
+ return { data: "" };
+ });
+
+ const utilsModule = proxyquire("./utils", {
+ axios: axiosStub,
+ fs: fsStub,
+ "./crawler": { crawlUrls: crawlUrlsStub },
+ "doc-detective-common": {
+ validate: () => ({ valid: true }),
+ resolvePaths: (x) => x,
+ transformToSchemaKey: (x) => x,
+ readFile: readFileStub,
+ },
+ });
+
+ qualifyFiles = utilsModule.qualifyFiles;
+ });
+
+ afterEach(function () {
+ sinon.restore();
+ });
+
+ it("should enable crawling by default for HTTP URLs", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([
+ "https://example.com/page1",
+ ]);
+ });
+
+ it("should disable crawling when crawl is false", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: false,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.called).to.be.false;
+ });
+
+ it("should enable crawling when crawl is true", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ });
+
+ it("should not crawl file:// URLs by default", async function () {
+ const config = {
+ input: [], // Empty input to avoid processing issues
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ // file:// URLs won't trigger crawling since they don't start with http:// or https://
+ // This test just verifies no crawling happens
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.called).to.be.false;
+ });
+
+ it("should pass origin config to crawler", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ origin: "https://example.com",
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves(["https://example.com/page1"]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal(
+ "https://example.com"
+ );
+ });
+
+ it("should log crawling activity", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ // Capture console output
+ const originalConsoleLog = console.log;
+ const logOutput = [];
+ console.log = (...args) => {
+ logOutput.push(args.join(" "));
+ originalConsoleLog(...args);
+ };
+
+ try {
+ await qualifyFiles({ config });
+
+ // Check that crawling info was logged
+ const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling"));
+ const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));
+
+ expect(hasCrawlingLog).to.be.true;
+ expect(hasDiscoveredLog).to.be.true;
+ } finally {
+ console.log = originalConsoleLog;
+ }
+ });
+});
diff --git a/src/crawler.js b/src/crawler.js
new file mode 100644
index 0000000..62c2ed5
--- /dev/null
+++ b/src/crawler.js
@@ -0,0 +1,209 @@
+const axios = require("axios");
+const { log } = require("./utils");
+
+exports.extractHtmlUrls = extractHtmlUrls;
+exports.extractMarkdownUrls = extractMarkdownUrls;
+exports.isSameOrigin = isSameOrigin;
+exports.resolveRelativeUrl = resolveRelativeUrl;
+exports.crawlUrls = crawlUrls;
+
+/**
+ * Extracts URLs from HTML tags with href attributes.
+ *
+ * @param {string} html - The HTML content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractHtmlUrls(html) {
+ if (typeof html !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match <a> tags with href attributes
+ // This regex handles various formats: href="url", href='url', href=url
+ const anchorRegex = /<a(?:[^>]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi;
+ let match;
+
+ while ((match = anchorRegex.exec(html)) !== null) {
+ const url = match[1];
+ if (url && url !== "#" && !url.startsWith("javascript:")) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
+/**
+ * Extracts URLs from Markdown [text](url) syntax.
+ *
+ * @param {string} markdown - The Markdown content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractMarkdownUrls(markdown) {
+ if (typeof markdown !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match [text](url) syntax, handling escaped brackets
+ // This regex avoids matching image syntax ![alt](url)
+ const linkRegex = /(?<!!)\[([^\]]*)\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g;
+ let match;
+
+ while ((match = linkRegex.exec(markdown)) !== null) {
+ const url = match[2];
+ if (url) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
+/**
+ * Compares two URLs for strict origin matching.
+ *
+ * @param {string} url1 - First URL to compare
+ * @param {string} url2 - Second URL to compare
+ * @returns {boolean} - True if protocol, domain, and port all match
+ */
+function isSameOrigin(url1, url2) {
+ try {
+ const first = new URL(url1);
+ const second = new URL(url2);
+ return first.origin === second.origin;
+ } catch {
+ return false;
+ }
+}
+
+/**
+ * Resolves a relative URL against a base origin.
+ *
+ * @param {string} relativeUrl - The relative URL to resolve
+ * @param {string} baseOrigin - The base origin to resolve against
+ * @returns {string|null} - The resolved absolute URL, or null if resolution fails
+ */
+function resolveRelativeUrl(relativeUrl, baseOrigin) {
+ try {
+ return new URL(relativeUrl, baseOrigin).href;
+ } catch {
+ return null;
+ }
+}
+
+/**
+ * Crawls URLs starting from an initial set, discovering same-origin links.
+ *
+ * @param {Object} options - Crawling options
+ * @param {Object} options.config - Configuration object
+ * @param {string[]} options.initialUrls - Array of initial URLs to crawl
+ * @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
+ */
+async function crawlUrls({ config, initialUrls }) {
+ const visitedUrls = new Set();
+ const discoveredUrls = [];
+ const MAX_URLS = 10000;
+ let urlQueue = [...initialUrls];
+
+ // Process each URL in the queue
+ while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) {
+ const currentUrl = urlQueue.shift();
+
+ // Skip if already visited
+ if (visitedUrls.has(currentUrl)) {
+ continue;
+ }
+
+ visitedUrls.add(currentUrl);
+ discoveredUrls.push(currentUrl);
+
+ log(config, "debug", `Crawling: ${currentUrl}`);
+
+ // Fetch the URL content
+ let content;
+ try {
+ const response = await axios.get(currentUrl, {
+ timeout: 30000,
+ maxRedirects: 5,
+ });
+ content = response.data;
+ } catch (error) {
+ log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
+ continue;
+ }
+
+ // Extract URLs based on content type
+ let extractedUrls = [];
+ if (typeof content === "string") {
+ // Try both HTML and Markdown extraction
+ extractedUrls = [
+ ...extractHtmlUrls(content),
+ ...extractMarkdownUrls(content),
+ ];
+ }
+
+ // Process extracted URLs
+ for (const url of extractedUrls) {
+ let absoluteUrl;
+
+ // Check if URL is relative
+ try {
+ new URL(url);
+ absoluteUrl = url;
+ } catch {
+ // It's relative
+ if (config.origin) {
+ absoluteUrl = resolveRelativeUrl(url, config.origin);
+ if (!absoluteUrl) {
+ continue; // Skip malformed URLs
+ }
+ } else {
+ // No origin configured, skip relative URLs
+ log(
+ config,
+ "debug",
+ `Skipping relative URL (no origin configured): ${url}`
+ );
+ continue;
+ }
+ }
+
+ // Check if same origin as current URL
+ if (isSameOrigin(absoluteUrl, currentUrl)) {
+ if (!visitedUrls.has(absoluteUrl)) {
+ urlQueue.push(absoluteUrl);
+ }
+ }
+ }
+ }
+
+ // Log warning if limit reached
+ if (discoveredUrls.length >= MAX_URLS) {
+ log(
+ config,
+ "warn",
+ `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
+ );
+ }
+
+ return discoveredUrls;
+}
diff --git a/src/crawler.test.js b/src/crawler.test.js
new file mode 100644
index 0000000..0b17ca5
--- /dev/null
+++ b/src/crawler.test.js
@@ -0,0 +1,491 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+before(async function () {
+ const { expect } = await import("chai");
+ global.expect = expect;
+});
+
+describe("crawler", function () {
+ describe("extractHtmlUrls", function () {
+ let extractHtmlUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractHtmlUrls = crawler.extractHtmlUrls;
+ });
+
+ it("should extract single URL from HTML", function () {
+ const html = '<a href="https://example.com/page1">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from HTML", function () {
+ const html = `
+ <a href="https://example.com/page1">Link 1</a>
+ <a href="https://example.com/page2">Link 2</a>
+ <a href="https://example.com/page3">Link 3</a>
+ `;
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should handle single and double quotes", function () {
+ const html = `
+ <a href="https://example.com/page1">Link 1</a>
+ <a href='https://example.com/page2'>Link 2</a>
+ `;
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should ignore anchor links", function () {
+ const html = '<a href="#">Anchor</a><a href="https://example.com">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should ignore javascript: links", function () {
+ const html = '<a href="javascript:void(0)">JS Link</a><a href="https://example.com">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractHtmlUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractHtmlUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should extract relative URLs", function () {
+ const html = '<a href="/page1">Relative</a><a href="https://example.com">Absolute</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["/page1", "https://example.com"]);
+ });
+ });
+
+ describe("extractMarkdownUrls", function () {
+ let extractMarkdownUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractMarkdownUrls = crawler.extractMarkdownUrls;
+ });
+
+ it("should extract single URL from Markdown", function () {
+ const markdown = "[Link](https://example.com/page1)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from Markdown", function () {
+ const markdown = `
+ [Link 1](https://example.com/page1)
+ [Link 2](https://example.com/page2)
+ [Link 3](https://example.com/page3)
+ `;
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should ignore image syntax", function () {
+ const markdown = " [Link](https://example.com/page1)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should handle URLs with title text", function () {
+ const markdown = '[Link](https://example.com/page1 "Title text")';
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractMarkdownUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractMarkdownUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should extract relative URLs", function () {
+ const markdown = "[Relative](/page1) [Absolute](https://example.com)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["/page1", "https://example.com"]);
+ });
+ });
+
+ describe("isSameOrigin", function () {
+ let isSameOrigin;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ isSameOrigin = crawler.isSameOrigin;
+ });
+
+ it("should return true for same protocol, domain, and port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:443/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return true for same origin with default ports", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return false for different protocol", function () {
+ const result = isSameOrigin(
+ "http://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different domain", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://other.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:8080/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for subdomain differences", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://subdomain.example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for malformed URLs", function () {
+ const result = isSameOrigin("not a url", "https://example.com");
+ expect(result).to.be.false;
+ });
+
+ it("should handle query parameters", function () {
+ const result = isSameOrigin(
+ "https://example.com/page?foo=bar",
+ "https://example.com/page?baz=qux"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should handle fragments", function () {
+ const result = isSameOrigin(
+ "https://example.com/page#section1",
+ "https://example.com/page#section2"
+ );
+ expect(result).to.be.true;
+ });
+ });
+
+ describe("resolveRelativeUrl", function () {
+ let resolveRelativeUrl;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ resolveRelativeUrl = crawler.resolveRelativeUrl;
+ });
+
+ it("should resolve relative path against origin", function () {
+ const result = resolveRelativeUrl(
+ "/page1",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page1");
+ });
+
+ it("should resolve relative path with ../ navigation", function () {
+ const result = resolveRelativeUrl(
+ "../page1",
+ "https://example.com/dir/subdir/"
+ );
+ expect(result).to.equal("https://example.com/dir/page1");
+ });
+
+ it("should resolve absolute path starting with /", function () {
+ const result = resolveRelativeUrl(
+ "/absolute/path",
+ "https://example.com/some/dir"
+ );
+ expect(result).to.equal("https://example.com/absolute/path");
+ });
+
+ it("should return null for malformed relative URLs", function () {
+ // Note: URL constructor is quite forgiving, so we need a truly malformed URL
+ // In practice, most strings can be parsed as relative URLs
+ const result = resolveRelativeUrl(
+ "",
+ "not a valid base"
+ );
+ expect(result).to.be.null;
+ });
+
+ it("should return absolute URL unchanged", function () {
+ const result = resolveRelativeUrl(
+ "https://other.com/page",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://other.com/page");
+ });
+
+ it("should handle query parameters in relative URLs", function () {
+ const result = resolveRelativeUrl(
+ "/page?foo=bar",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page?foo=bar");
+ });
+
+ it("should handle fragments in relative URLs", function () {
+ const result = resolveRelativeUrl(
+ "/page#section",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page#section");
+ });
+ });
+
+ describe("crawlUrls", function () {
+ let crawlUrls, axiosStub, logStub;
+
+ beforeEach(function () {
+ axiosStub = {
+ get: sinon.stub(),
+ };
+ logStub = sinon.stub();
+
+ const crawlerModule = proxyquire("./crawler", {
+ axios: axiosStub,
+ "./utils": { log: logStub },
+ });
+ crawlUrls = crawlerModule.crawlUrls;
+ });
+
+ afterEach(function () {
+ sinon.restore();
+ });
+
+ it("should crawl single URL with no links", async function () {
+ const config = { logLevel: "info" };
+ axiosStub.get.resolves({ data: "<html><body>No links</body></html>" });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(axiosStub.get.calledOnce).to.be.true;
+ });
+
+ it("should crawl same-origin links", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(axiosStub.get.calledTwice).to.be.true;
+ });
+
+ it("should not crawl cross-origin links", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({
 data: '<a href="https://other.com/page">External</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(axiosStub.get.calledOnce).to.be.true;
+ });
+
+ it("should deduplicate URLs", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
 data: '<a href="https://example.com/page1">Back</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(axiosStub.get.calledTwice).to.be.true;
+ });
+
+ it("should handle fetch errors gracefully", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .rejects(new Error("404 Not Found"));
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(logStub.calledWith(config, "warn")).to.be.true;
+ });
+
+ it("should resolve relative URLs with origin config", async function () {
+ const config = { logLevel: "info", origin: "https://example.com" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="/page2">Relative Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should skip relative URLs without origin config", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({
 data: '<a href="/page2">Relative Link</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true;
+ });
+
+ it("should extract URLs from Markdown content", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
+ data: "[Link](https://example.com/page2)",
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should handle non-string content", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({ data: { json: "object" } });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+ });
+});
diff --git a/src/utils.js b/src/utils.js
index 8b0a61c..fb6ee29 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,6 +11,7 @@ const {
transformToSchemaKey,
readFile,
} = require("doc-detective-common");
+const { crawlUrls } = require("./crawler");
exports.qualifyFiles = qualifyFiles;
exports.parseTests = parseTests;
@@ -184,6 +185,57 @@ async function qualifyFiles({ config }) {
const cleanup = config.afterAll;
if (cleanup) sequence = sequence.concat(cleanup);
+ // Determine if crawling is enabled
+ let shouldCrawl = false;
+ if (config.crawl !== undefined) {
+ // Explicit config setting takes precedence
+ shouldCrawl = config.crawl === true;
+ }
+
+ // Collect URLs that should be crawled
+ const urlsToCrawl = [];
+ for (const source of sequence) {
+ const isHttpUrl =
+ typeof source === "string" &&
+ (source.startsWith("http://") || source.startsWith("https://"));
+
+ if (isHttpUrl) {
+ // Determine if this specific URL should be crawled
+ let crawlThisUrl = shouldCrawl;
+
+ // If crawl config is not explicitly set, use protocol-based default
+ if (config.crawl === undefined) {
+ crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default
+ }
+
+ if (crawlThisUrl) {
+ urlsToCrawl.push(source);
+ }
+ }
+ }
+
+ // Perform crawling if there are URLs to crawl
+ if (urlsToCrawl.length > 0) {
+ log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`);
+ try {
+ const crawledUrls = await crawlUrls({
+ config,
+ initialUrls: urlsToCrawl,
+ });
+
+ // Add newly discovered URLs to the sequence
+ // Filter out URLs that were already in the initial sequence
+ const newUrls = crawledUrls.filter((url) => !sequence.includes(url));
+ log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`);
+
+ // Add new URLs after the input section but before cleanup
+ const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
+ sequence.splice(cleanupStartIndex, 0, ...newUrls);
+ } catch (error) {
+ log(config, "error", `Crawling failed: ${error.message}`);
+ }
+ }
+
for (let source of sequence) {
log(config, "debug", `source: ${source}`);
// Check if source is a URL
From c529cbf848bb48f25bcf9a47c1d648183f68369c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:23:37 +0000
Subject: [PATCH 3/9] Add XML sitemap support, URL limit test, and config
validation handling
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/config.js | 11 ++++++
src/config.test.js | 40 ++++++++++++++++++++++
src/crawler.js | 30 +++++++++++++++-
src/crawler.test.js | 83 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 163 insertions(+), 1 deletion(-)
diff --git a/src/config.js b/src/config.js
index 72eb211..2ca814f 100644
--- a/src/config.js
+++ b/src/config.js
@@ -478,6 +478,12 @@ async function setConfig({ config }) {
}
}
+ // Extract non-schema fields before validation
+ const nonSchemaFields = {
+ crawl: config.crawl,
+ };
+ delete config.crawl;
+
// Validate inbound `config`.
const validityCheck = validate({ schemaKey: "config_v3", object: config });
if (!validityCheck.valid) {
@@ -491,6 +497,11 @@ async function setConfig({ config }) {
}
config = validityCheck.object;
+ // Restore non-schema fields after validation
+ if (nonSchemaFields.crawl !== undefined) {
+ config.crawl = nonSchemaFields.crawl;
+ }
+
// Replace fileType strings with objects
config.fileTypes = config.fileTypes.map((fileType) => {
if (typeof fileType === "object") return fileType;
diff --git a/src/config.test.js b/src/config.test.js
index 3880c5b..610df4b 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
expect(result.concurrentRunners).to.equal(4);
});
});
+
+describe("crawl config field", function () {
+ it("should preserve crawl field through validation", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.equal(true);
+ });
+
+ it("should handle crawl field set to false", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ crawl: false,
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.equal(false);
+ });
+
+ it("should handle missing crawl field", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.be.undefined;
+ });
+});
diff --git a/src/crawler.js b/src/crawler.js
index 62c2ed5..4b442dc 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -3,6 +3,7 @@ const { log } = require("./utils");
exports.extractHtmlUrls = extractHtmlUrls;
exports.extractMarkdownUrls = extractMarkdownUrls;
+exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
exports.resolveRelativeUrl = resolveRelativeUrl;
exports.crawlUrls = crawlUrls;
@@ -63,6 +64,32 @@ function extractMarkdownUrls(markdown) {
return urls;
}
+/**
+ * Extracts URLs from XML sitemap.
+ *
+ * @param {string} xml - The XML sitemap content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractXmlSitemapUrls(xml) {
+ if (typeof xml !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match <loc> tags in XML sitemaps
+ const locRegex = /<loc>([^<]+)<\/loc>/gi;
+ let match;
+
+ while ((match = locRegex.exec(xml)) !== null) {
+ const url = match[1].trim();
+ if (url) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
/**
* Compares two URLs for strict origin matching.
*
@@ -154,10 +181,11 @@ async function crawlUrls({ config, initialUrls }) {
// Extract URLs based on content type
let extractedUrls = [];
if (typeof content === "string") {
- // Try both HTML and Markdown extraction
+ // Try HTML, Markdown, and XML sitemap extraction
extractedUrls = [
...extractHtmlUrls(content),
...extractMarkdownUrls(content),
+ ...extractXmlSitemapUrls(content),
];
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 0b17ca5..74432d5 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -134,6 +134,63 @@ describe("crawler", function () {
});
});
+ describe("extractXmlSitemapUrls", function () {
+ let extractXmlSitemapUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractXmlSitemapUrls = crawler.extractXmlSitemapUrls;
+ });
+
+ it("should extract single URL from XML sitemap", function () {
+ const xml = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url>
+ <loc>https://example.com/page1</loc>
+ </url>
+ </urlset>`;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from XML sitemap", function () {
+ const xml = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url>
+ <loc>https://example.com/page1</loc>
+ </url>
+ <url>
+ <loc>https://example.com/page2</loc>
+ </url>
+ <url>
+ <loc>https://example.com/page3</loc>
+ </url>
+ </urlset>`;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractXmlSitemapUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractXmlSitemapUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle XML without loc tags", function () {
+ const xml = "<urlset><url>- test</url></urlset>";
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([]);
+ });
+ });
+
describe("isSameOrigin", function () {
let isSameOrigin;
@@ -487,5 +544,31 @@ describe("crawler", function () {
expect(urls).to.deep.equal(["https://example.com/page1"]);
});
+
+ it("should enforce 10,000 URL limit", async function () {
+ const config = { logLevel: "info" };
+
+ // Create a mock that generates many URLs
+ let callCount = 0;
+ axiosStub.get.callsFake(async (url) => {
+ callCount++;
+ if (callCount <= 10001) {
+ // Generate unique URLs
+ return {
 data: `<a href="https://example.com/page${callCount}">Link</a>`,
+ };
+ }
+ return { data: "No more links" };
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page0"],
+ });
+
+ // Should stop at 10,000 URLs
+ expect(urls.length).to.equal(10000);
+ expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true;
+ });
});
});
From 7a6ec961befd4e8180e926989fc0ac6177755878 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:27:25 +0000
Subject: [PATCH 4/9] Fix circular dependency by removing log import from
crawler
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 15 +++++++++------
src/crawler.test.js | 11 ++++++++++-
src/utils.js | 1 +
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/src/crawler.js b/src/crawler.js
index 4b442dc..0a5f5f7 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -1,5 +1,4 @@
const axios = require("axios");
-const { log } = require("./utils");
exports.extractHtmlUrls = extractHtmlUrls;
exports.extractMarkdownUrls = extractMarkdownUrls;
@@ -143,9 +142,13 @@ function resolveRelativeUrl(relativeUrl, baseOrigin) {
* @param {Object} options - Crawling options
* @param {Object} options.config - Configuration object
* @param {string[]} options.initialUrls - Array of initial URLs to crawl
+ * @param {Function} options.log - Logging function (optional)
* @returns {Promise} - Promise resolving to array of all discovered URLs
*/
-async function crawlUrls({ config, initialUrls }) {
+async function crawlUrls({ config, initialUrls, log }) {
+ // Default no-op logger if not provided
+ const logger = log || (() => {});
+
const visitedUrls = new Set();
const discoveredUrls = [];
const MAX_URLS = 10000;
@@ -163,7 +166,7 @@ async function crawlUrls({ config, initialUrls }) {
visitedUrls.add(currentUrl);
discoveredUrls.push(currentUrl);
- log(config, "debug", `Crawling: ${currentUrl}`);
+ logger(config, "debug", `Crawling: ${currentUrl}`);
// Fetch the URL content
let content;
@@ -174,7 +177,7 @@ async function crawlUrls({ config, initialUrls }) {
});
content = response.data;
} catch (error) {
- log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
+ logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
continue;
}
@@ -206,7 +209,7 @@ async function crawlUrls({ config, initialUrls }) {
}
} else {
// No origin configured, skip relative URLs
- log(
+ logger(
config,
"debug",
`Skipping relative URL (no origin configured): ${url}`
@@ -226,7 +229,7 @@ async function crawlUrls({ config, initialUrls }) {
// Log warning if limit reached
if (discoveredUrls.length >= MAX_URLS) {
- log(
+ logger(
config,
"warn",
`Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 74432d5..6674f9c 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -347,7 +347,6 @@ describe("crawler", function () {
const crawlerModule = proxyquire("./crawler", {
axios: axiosStub,
- "./utils": { log: logStub },
});
crawlUrls = crawlerModule.crawlUrls;
});
@@ -363,6 +362,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -387,6 +387,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -406,6 +407,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -430,6 +432,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -455,6 +458,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -482,6 +486,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -500,6 +505,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -524,6 +530,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -540,6 +547,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -564,6 +572,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page0"],
+ log: logStub,
});
// Should stop at 10,000 URLs
diff --git a/src/utils.js b/src/utils.js
index fb6ee29..36912b7 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -221,6 +221,7 @@ async function qualifyFiles({ config }) {
const crawledUrls = await crawlUrls({
config,
initialUrls: urlsToCrawl,
+ log,
});
// Add newly discovered URLs to the sequence
From e4cb209a23fe82cfd215b957e4d6a3f4b51cf203 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:30:42 +0000
Subject: [PATCH 5/9] Add security filtering for unsafe URL schemes (vbscript,
data)
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 10 +++++++++-
src/crawler.test.js | 12 ++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/crawler.js b/src/crawler.js
index 0a5f5f7..77b78cf 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -26,7 +26,15 @@ function extractHtmlUrls(html) {
while ((match = anchorRegex.exec(html)) !== null) {
const url = match[1];
- if (url && url !== "#" && !url.startsWith("javascript:")) {
+ // Filter out unsafe URL schemes
+ const lowerUrl = url.toLowerCase();
+ if (
+ url &&
+ url !== "#" &&
+ !lowerUrl.startsWith("javascript:") &&
+ !lowerUrl.startsWith("vbscript:") &&
+ !lowerUrl.startsWith("data:")
+ ) {
urls.push(url);
}
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 6674f9c..63e51da 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -60,6 +60,18 @@ describe("crawler", function () {
expect(urls).to.deep.equal(["https://example.com"]);
});
+ it("should ignore vbscript: links", function () {
+ const html = 'VB LinkLink';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should ignore data: links", function () {
+ const html = 'Data LinkLink';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
it("should handle empty string", function () {
const urls = extractHtmlUrls("");
expect(urls).to.deep.equal([]);
From 6c5378f6472c129f52478abb0893f967e2c6273f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 31 Oct 2025 00:16:51 +0000
Subject: [PATCH 6/9] Reduce scope to only support sitemap.xml URLs
- Remove HTML and Markdown URL extraction
- Only process URLs ending with sitemap.xml
- Simplify crawler to crawlSitemap function
- Update tests to reflect reduced scope
- 66 tests passing (down from 96)
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.integration.test.js | 95 ++++---
src/crawler.js | 200 +++------------
src/crawler.test.js | 421 +++-----------------------------
src/utils.js | 42 ++--
4 files changed, 128 insertions(+), 630 deletions(-)
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
index d6f8797..7d4ab9e 100644
--- a/src/crawler.integration.test.js
+++ b/src/crawler.integration.test.js
@@ -8,7 +8,7 @@ before(async function () {
});
describe("crawler integration", function () {
- let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub;
+ let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;
beforeEach(function () {
axiosStub = {
@@ -23,18 +23,19 @@ describe("crawler integration", function () {
writeFileSync: sinon.stub(),
};
- logStub = sinon.stub();
- crawlUrlsStub = sinon.stub();
+ crawlSitemapStub = sinon.stub();
readFileStub = sinon.stub().resolves({});
// Mock fetchFile behavior
axiosStub.get.callsFake(async (url) => {
- if (url === "https://example.com/page1") {
+ if (url.endsWith("sitemap.xml")) {
return {
- data: 'Link',
+ data: `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://example.com/page2</loc></url>
+ </urlset>`,
};
- } else if (url === "https://example.com/page2") {
- return { data: "Content" };
}
return { data: "" };
});
@@ -42,7 +43,7 @@ describe("crawler integration", function () {
const utilsModule = proxyquire("./utils", {
axios: axiosStub,
fs: fsStub,
- "./crawler": { crawlUrls: crawlUrlsStub },
+ "./crawler": { crawlSitemap: crawlSitemapStub },
"doc-detective-common": {
validate: () => ({ valid: true }),
resolvePaths: (x) => x,
@@ -58,14 +59,14 @@ describe("crawler integration", function () {
sinon.restore();
});
- it("should enable crawling by default for HTTP URLs", async function () {
+ it("should process sitemap.xml URLs by default", async function () {
const config = {
- input: ["https://example.com/page1"],
+ input: ["https://example.com/sitemap.xml"],
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
+ crawlSitemapStub.resolves([
"https://example.com/page1",
"https://example.com/page2",
]);
@@ -76,16 +77,13 @@ describe("crawler integration", function () {
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
- expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([
- "https://example.com/page1",
- ]);
+ expect(crawlSitemapStub.calledOnce).to.be.true;
+ expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
});
- it("should disable crawling when crawl is false", async function () {
+ it("should not process non-sitemap URLs", async function () {
const config = {
- input: ["https://example.com/page1"],
- crawl: false,
+ input: ["https://example.com/page.html"],
logLevel: "info",
fileTypes: [],
};
@@ -96,78 +94,69 @@ describe("crawler integration", function () {
await qualifyFiles({ config });
- expect(crawlUrlsStub.called).to.be.false;
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should enable crawling when crawl is true", async function () {
+ it("should disable processing when crawl is false", async function () {
const config = {
- input: ["https://example.com/page1"],
- crawl: true,
+ input: ["https://example.com/sitemap.xml"],
+ crawl: false,
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
-
// Mock file system calls for fetched files
fsStub.existsSync.returns(true);
fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should not crawl file:// URLs by default", async function () {
+ it("should enable processing when crawl is true", async function () {
const config = {
- input: [], // Empty input to avoid processing issues
+ input: ["https://example.com/sitemap.xml"],
+ crawl: true,
logLevel: "info",
fileTypes: [],
};
- // file:// URLs won't trigger crawling since they don't start with http:// or https://
- // This test just verifies no crawling happens
+ crawlSitemapStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
await qualifyFiles({ config });
- expect(crawlUrlsStub.called).to.be.false;
+ expect(crawlSitemapStub.calledOnce).to.be.true;
});
- it("should pass origin config to crawler", async function () {
+ it("should not process file:// URLs", async function () {
const config = {
- input: ["https://example.com/page1"],
- origin: "https://example.com",
- crawl: true,
+ input: [],
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves(["https://example.com/page1"]);
-
- // Mock file system calls for fetched files
- fsStub.existsSync.returns(true);
- fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
-
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
- expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal(
- "https://example.com"
- );
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should log crawling activity", async function () {
+ it("should log sitemap processing activity", async function () {
const config = {
- input: ["https://example.com/page1"],
+ input: ["https://example.com/sitemap.xml"],
crawl: true,
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
+ crawlSitemapStub.resolves([
"https://example.com/page1",
"https://example.com/page2",
]);
@@ -187,11 +176,11 @@ describe("crawler integration", function () {
try {
await qualifyFiles({ config });
- // Check that crawling info was logged
- const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling"));
+ // Check that processing info was logged
+ const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));
- expect(hasCrawlingLog).to.be.true;
+ expect(hasProcessingLog).to.be.true;
expect(hasDiscoveredLog).to.be.true;
} finally {
console.log = originalConsoleLog;
diff --git a/src/crawler.js b/src/crawler.js
index 77b78cf..4549b77 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -1,75 +1,8 @@
const axios = require("axios");
-exports.extractHtmlUrls = extractHtmlUrls;
-exports.extractMarkdownUrls = extractMarkdownUrls;
exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
-exports.resolveRelativeUrl = resolveRelativeUrl;
-exports.crawlUrls = crawlUrls;
-
-/**
- * Extracts URLs from HTML tags with href attributes.
- *
- * @param {string} html - The HTML content to parse
- * @returns {string[]} - Array of extracted URLs
- */
-function extractHtmlUrls(html) {
- if (typeof html !== "string") {
- return [];
- }
-
- const urls = [];
- // Match <a> tags with href attributes
- // This regex handles various formats: href="url", href='url', href=url
- const anchorRegex = /<a(?:\s[^>]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi;
- let match;
-
- while ((match = anchorRegex.exec(html)) !== null) {
- const url = match[1];
- // Filter out unsafe URL schemes
- const lowerUrl = url.toLowerCase();
- if (
- url &&
- url !== "#" &&
- !lowerUrl.startsWith("javascript:") &&
- !lowerUrl.startsWith("vbscript:") &&
- !lowerUrl.startsWith("data:")
- ) {
- urls.push(url);
- }
- }
-
- return urls;
-}
-
-/**
- * Extracts URLs from Markdown [text](url) syntax.
- *
- * @param {string} markdown - The Markdown content to parse
- * @returns {string[]} - Array of extracted URLs
- */
-function extractMarkdownUrls(markdown) {
- if (typeof markdown !== "string") {
- return [];
- }
-
- const urls = [];
- // Match [text](url) syntax, handling escaped brackets
- // This regex avoids matching image syntax 
- const linkRegex = /(?} - Promise resolving to array of all discovered URLs
*/
-async function crawlUrls({ config, initialUrls, log }) {
+async function crawlSitemap({ config, sitemapUrl, log }) {
// Default no-op logger if not provided
const logger = log || (() => {});
- const visitedUrls = new Set();
const discoveredUrls = [];
- const MAX_URLS = 10000;
- let urlQueue = [...initialUrls];
- // Process each URL in the queue
- while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) {
- const currentUrl = urlQueue.shift();
-
- // Skip if already visited
- if (visitedUrls.has(currentUrl)) {
- continue;
- }
-
- visitedUrls.add(currentUrl);
- discoveredUrls.push(currentUrl);
-
- logger(config, "debug", `Crawling: ${currentUrl}`);
-
- // Fetch the URL content
- let content;
- try {
- const response = await axios.get(currentUrl, {
- timeout: 30000,
- maxRedirects: 5,
- });
- content = response.data;
- } catch (error) {
- logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
- continue;
- }
-
- // Extract URLs based on content type
- let extractedUrls = [];
- if (typeof content === "string") {
- // Try HTML, Markdown, and XML sitemap extraction
- extractedUrls = [
- ...extractHtmlUrls(content),
- ...extractMarkdownUrls(content),
- ...extractXmlSitemapUrls(content),
- ];
- }
+ logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);
+
+ // Fetch the sitemap content
+ let content;
+ try {
+ const response = await axios.get(sitemapUrl, {
+ timeout: 30000,
+ maxRedirects: 5,
+ });
+ content = response.data;
+ } catch (error) {
+ logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
+ return discoveredUrls;
+ }
+
+ // Extract URLs from sitemap
+ if (typeof content === "string") {
+ const extractedUrls = extractXmlSitemapUrls(content);
- // Process extracted URLs
+ // Filter URLs to only include same-origin URLs
for (const url of extractedUrls) {
- let absoluteUrl;
-
- // Check if URL is relative
- try {
- new URL(url);
- absoluteUrl = url;
- } catch {
- // It's relative
- if (config.origin) {
- absoluteUrl = resolveRelativeUrl(url, config.origin);
- if (!absoluteUrl) {
- continue; // Skip malformed URLs
- }
- } else {
- // No origin configured, skip relative URLs
- logger(
- config,
- "debug",
- `Skipping relative URL (no origin configured): ${url}`
- );
- continue;
- }
- }
-
- // Check if same origin as current URL
- if (isSameOrigin(absoluteUrl, currentUrl)) {
- if (!visitedUrls.has(absoluteUrl)) {
- urlQueue.push(absoluteUrl);
- }
+ if (isSameOrigin(url, sitemapUrl)) {
+ discoveredUrls.push(url);
+ } else {
+ logger(config, "debug", `Skipping cross-origin URL: ${url}`);
}
}
}
- // Log warning if limit reached
- if (discoveredUrls.length >= MAX_URLS) {
- logger(
- config,
- "warn",
- `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
- );
- }
+ logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`);
return discoveredUrls;
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 63e51da..54a07e8 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -8,144 +8,6 @@ before(async function () {
});
describe("crawler", function () {
- describe("extractHtmlUrls", function () {
- let extractHtmlUrls;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- extractHtmlUrls = crawler.extractHtmlUrls;
- });
-
- it("should extract single URL from HTML", function () {
- const html = 'Link';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should extract multiple URLs from HTML", function () {
- const html = `
- Link 1
- Link 2
- Link 3
- `;
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- "https://example.com/page3",
- ]);
- });
-
- it("should handle single and double quotes", function () {
- const html = `
- Link 1
- Link 2
- `;
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
- it("should ignore anchor links", function () {
- const html = 'AnchorLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore javascript: links", function () {
- const html = 'JS LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore vbscript: links", function () {
- const html = 'VB LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore data: links", function () {
- const html = 'Data LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should handle empty string", function () {
- const urls = extractHtmlUrls("");
- expect(urls).to.deep.equal([]);
- });
-
- it("should handle non-string input", function () {
- const urls = extractHtmlUrls(null);
- expect(urls).to.deep.equal([]);
- });
-
- it("should extract relative URLs", function () {
- const html = 'RelativeAbsolute';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["/page1", "https://example.com"]);
- });
- });
-
- describe("extractMarkdownUrls", function () {
- let extractMarkdownUrls;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- extractMarkdownUrls = crawler.extractMarkdownUrls;
- });
-
- it("should extract single URL from Markdown", function () {
- const markdown = "[Link](https://example.com/page1)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should extract multiple URLs from Markdown", function () {
- const markdown = `
- [Link 1](https://example.com/page1)
- [Link 2](https://example.com/page2)
- [Link 3](https://example.com/page3)
- `;
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- "https://example.com/page3",
- ]);
- });
-
- it("should ignore image syntax", function () {
- const markdown = " [Link](https://example.com/page1)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should handle URLs with title text", function () {
- const markdown = '[Link](https://example.com/page1 "Title text")';
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should handle empty string", function () {
- const urls = extractMarkdownUrls("");
- expect(urls).to.deep.equal([]);
- });
-
- it("should handle non-string input", function () {
- const urls = extractMarkdownUrls(null);
- expect(urls).to.deep.equal([]);
- });
-
- it("should extract relative URLs", function () {
- const markdown = "[Relative](/page1) [Absolute](https://example.com)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["/page1", "https://example.com"]);
- });
- });
-
describe("extractXmlSitemapUrls", function () {
let extractXmlSitemapUrls;
@@ -281,75 +143,8 @@ describe("crawler", function () {
});
});
- describe("resolveRelativeUrl", function () {
- let resolveRelativeUrl;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- resolveRelativeUrl = crawler.resolveRelativeUrl;
- });
-
- it("should resolve relative path against origin", function () {
- const result = resolveRelativeUrl(
- "/page1",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page1");
- });
-
- it("should resolve relative path with ../ navigation", function () {
- const result = resolveRelativeUrl(
- "../page1",
- "https://example.com/dir/subdir/"
- );
- expect(result).to.equal("https://example.com/dir/page1");
- });
-
- it("should resolve absolute path starting with /", function () {
- const result = resolveRelativeUrl(
- "/absolute/path",
- "https://example.com/some/dir"
- );
- expect(result).to.equal("https://example.com/absolute/path");
- });
-
- it("should return null for malformed relative URLs", function () {
- // Note: URL constructor is quite forgiving, so we need a truly malformed URL
- // In practice, most strings can be parsed as relative URLs
- const result = resolveRelativeUrl(
- "",
- "not a valid base"
- );
- expect(result).to.be.null;
- });
-
- it("should return absolute URL unchanged", function () {
- const result = resolveRelativeUrl(
- "https://other.com/page",
- "https://example.com"
- );
- expect(result).to.equal("https://other.com/page");
- });
-
- it("should handle query parameters in relative URLs", function () {
- const result = resolveRelativeUrl(
- "/page?foo=bar",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page?foo=bar");
- });
-
- it("should handle fragments in relative URLs", function () {
- const result = resolveRelativeUrl(
- "/page#section",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page#section");
- });
- });
-
- describe("crawlUrls", function () {
- let crawlUrls, axiosStub, logStub;
+ describe("crawlSitemap", function () {
+ let crawlSitemap, axiosStub, logStub;
beforeEach(function () {
axiosStub = {
@@ -360,45 +155,27 @@ describe("crawler", function () {
const crawlerModule = proxyquire("./crawler", {
axios: axiosStub,
});
- crawlUrls = crawlerModule.crawlUrls;
+ crawlSitemap = crawlerModule.crawlSitemap;
});
afterEach(function () {
sinon.restore();
});
- it("should crawl single URL with no links", async function () {
- const config = { logLevel: "info" };
- axiosStub.get.resolves({ data: "No links" });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- expect(axiosStub.get.calledOnce).to.be.true;
- });
-
- it("should crawl same-origin links", async function () {
+ it("should process sitemap and extract same-origin URLs", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://example.com/page2</loc></url>
+ </urlset>`;
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
+ axiosStub.get.resolves({ data: sitemapContent });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
@@ -406,190 +183,58 @@ describe("crawler", function () {
"https://example.com/page1",
"https://example.com/page2",
]);
- expect(axiosStub.get.calledTwice).to.be.true;
- });
-
- it("should not crawl cross-origin links", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get.resolves({
- data: 'External',
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
expect(axiosStub.get.calledOnce).to.be.true;
});
- it("should deduplicate URLs", async function () {
+ it("should filter out cross-origin URLs", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://other.com/page2</loc></url>
+ </urlset>`;
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: 'Back',
- });
+ axiosStub.get.resolves({ data: sitemapContent });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- expect(axiosStub.get.calledTwice).to.be.true;
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
});
it("should handle fetch errors gracefully", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .rejects(new Error("404 Not Found"));
+ axiosStub.get.rejects(new Error("404 Not Found"));
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
+ expect(urls).to.deep.equal([]);
expect(logStub.calledWith(config, "warn")).to.be.true;
});
- it("should resolve relative URLs with origin config", async function () {
- const config = { logLevel: "info", origin: "https://example.com" };
-
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Relative Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
- it("should skip relative URLs without origin config", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get.resolves({
- data: 'Relative Link',
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true;
- });
-
- it("should extract URLs from Markdown content", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: "[Link](https://example.com/page2)",
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
it("should handle non-string content", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
axiosStub.get.resolves({ data: { json: "object" } });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should enforce 10,000 URL limit", async function () {
- const config = { logLevel: "info" };
-
- // Create a mock that generates many URLs
- let callCount = 0;
- axiosStub.get.callsFake(async (url) => {
- callCount++;
- if (callCount <= 10001) {
- // Generate unique URLs
- return {
- data: `<a href="https://example.com/page${callCount}">Link</a>`,
- };
- }
- return { data: "No more links" };
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page0"],
- log: logStub,
- });
-
- // Should stop at 10,000 URLs
- expect(urls.length).to.equal(10000);
- expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true;
+ expect(urls).to.deep.equal([]);
});
});
});
diff --git a/src/utils.js b/src/utils.js
index 36912b7..ab4fa47 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,7 +11,7 @@ const {
transformToSchemaKey,
readFile,
} = require("doc-detective-common");
-const { crawlUrls } = require("./crawler");
+const { crawlSitemap } = require("./crawler");
exports.qualifyFiles = qualifyFiles;
exports.parseTests = parseTests;
@@ -192,48 +192,56 @@ async function qualifyFiles({ config }) {
shouldCrawl = config.crawl === true;
}
- // Collect URLs that should be crawled
- const urlsToCrawl = [];
+ // Collect sitemap.xml URLs that should be crawled
+ const sitemapsToProcess = [];
for (const source of sequence) {
const isHttpUrl =
typeof source === "string" &&
(source.startsWith("http://") || source.startsWith("https://"));
+
+ const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
- if (isHttpUrl) {
+ if (isHttpUrl && isSitemapUrl) {
// Determine if this specific URL should be crawled
let crawlThisUrl = shouldCrawl;
// If crawl config is not explicitly set, use protocol-based default
if (config.crawl === undefined) {
- crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default
+ crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default
}
if (crawlThisUrl) {
- urlsToCrawl.push(source);
+ sitemapsToProcess.push(source);
}
}
}
- // Perform crawling if there are URLs to crawl
- if (urlsToCrawl.length > 0) {
- log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`);
+ // Process sitemaps if there are any to crawl
+ if (sitemapsToProcess.length > 0) {
+ log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`);
try {
- const crawledUrls = await crawlUrls({
- config,
- initialUrls: urlsToCrawl,
- log,
- });
+ const allDiscoveredUrls = [];
+
+ // Process each sitemap
+ for (const sitemapUrl of sitemapsToProcess) {
+ const discoveredUrls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log,
+ });
+ allDiscoveredUrls.push(...discoveredUrls);
+ }
// Add newly discovered URLs to the sequence
// Filter out URLs that were already in the initial sequence
- const newUrls = crawledUrls.filter((url) => !sequence.includes(url));
- log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`);
+ const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url));
+ log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`);
// Add new URLs after the input section but before cleanup
const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
sequence.splice(cleanupStartIndex, 0, ...newUrls);
} catch (error) {
- log(config, "error", `Crawling failed: ${error.message}`);
+ log(config, "error", `Sitemap processing failed: ${error.message}`);
}
}
From 3f247a1dcb1676cdfe4005d017a903af03de77b3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 31 Oct 2025 00:38:07 +0000
Subject: [PATCH 7/9] Pin to doc-detective-common@3.4.0-dev.3 and remove crawl
field workaround
- Update package.json to use doc-detective-common@3.4.0-dev.3
- Remove code that strips/restores crawl field during validation (now in schema)
- Simplify utils.js logic since config.crawl always defined (defaults to false)
- Update tests to reflect new default behavior (crawl defaults to false)
- All 66 tests passing
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
package-lock.json | 8 ++++----
package.json | 2 +-
src/config.js | 11 -----------
src/config.test.js | 4 ++--
src/crawler.integration.test.js | 3 ++-
src/utils.js | 18 ++----------------
6 files changed, 11 insertions(+), 35 deletions(-)
diff --git a/package-lock.json b/package-lock.json
index 357fc7b..ee9c42c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,7 +12,7 @@
"@apidevtools/json-schema-ref-parser": "^14.2.1",
"ajv": "^8.17.1",
"axios": "^1.13.1",
- "doc-detective-common": "3.4.0-dev.2",
+ "doc-detective-common": "3.4.0-dev.3",
"dotenv": "^17.2.3",
"json-schema-faker": "^0.5.9",
"posthog-node": "^5.10.4"
@@ -700,9 +700,9 @@
}
},
"node_modules/doc-detective-common": {
- "version": "3.4.0-dev.2",
- "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.2.tgz",
- "integrity": "sha512-COI9K9jBTwMJmlDLVUksF2WTCKnNlIJhr7pgUUR5VAX6Wt0t6DLfAkbHOUROVFWeBAPW1yVwuLRTQkmNxoR9Dw==",
+ "version": "3.4.0-dev.3",
+ "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.3.tgz",
+ "integrity": "sha512-ZZohuQ7qTxwjXkaehyEJA/5hM5oKQ2J1B6Ue6zTbRl0MJcKtH+iF9X5Vl9aqyEplwPmUabxtzO6m0H4PZKko0g==",
"license": "AGPL-3.0-only",
"dependencies": {
"@apidevtools/json-schema-ref-parser": "^14.2.1",
diff --git a/package.json b/package.json
index 0cf5f87..c5ccb0b 100644
--- a/package.json
+++ b/package.json
@@ -27,7 +27,7 @@
"@apidevtools/json-schema-ref-parser": "^14.2.1",
"ajv": "^8.17.1",
"axios": "^1.13.1",
- "doc-detective-common": "3.4.0-dev.2",
+ "doc-detective-common": "3.4.0-dev.3",
"dotenv": "^17.2.3",
"json-schema-faker": "^0.5.9",
"posthog-node": "^5.10.4"
diff --git a/src/config.js b/src/config.js
index 2ca814f..72eb211 100644
--- a/src/config.js
+++ b/src/config.js
@@ -478,12 +478,6 @@ async function setConfig({ config }) {
}
}
- // Extract non-schema fields before validation
- const nonSchemaFields = {
- crawl: config.crawl,
- };
- delete config.crawl;
-
// Validate inbound `config`.
const validityCheck = validate({ schemaKey: "config_v3", object: config });
if (!validityCheck.valid) {
@@ -497,11 +491,6 @@ async function setConfig({ config }) {
}
config = validityCheck.object;
- // Restore non-schema fields after validation
- if (nonSchemaFields.crawl !== undefined) {
- config.crawl = nonSchemaFields.crawl;
- }
-
// Replace fileType strings with objects
config.fileTypes = config.fileTypes.map((fileType) => {
if (typeof fileType === "object") return fileType;
diff --git a/src/config.test.js b/src/config.test.js
index 610df4b..dc8e283 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -530,7 +530,7 @@ describe("crawl config field", function () {
expect(result.crawl).to.equal(false);
});
- it("should handle missing crawl field", async function () {
+ it("should default crawl field to false when not specified", async function () {
const inputConfig = {
input: ["https://example.com"],
logLevel: "info",
@@ -539,6 +539,6 @@ describe("crawl config field", function () {
const result = await setConfig({ config: inputConfig });
- expect(result.crawl).to.be.undefined;
+ expect(result.crawl).to.equal(false);
});
});
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
index 7d4ab9e..ad885c3 100644
--- a/src/crawler.integration.test.js
+++ b/src/crawler.integration.test.js
@@ -59,9 +59,10 @@ describe("crawler integration", function () {
sinon.restore();
});
- it("should process sitemap.xml URLs by default", async function () {
+ it("should process sitemap.xml URLs when crawl is true", async function () {
const config = {
input: ["https://example.com/sitemap.xml"],
+ crawl: true,
logLevel: "info",
fileTypes: [],
};
diff --git a/src/utils.js b/src/utils.js
index ab4fa47..9253cdf 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -185,13 +185,6 @@ async function qualifyFiles({ config }) {
const cleanup = config.afterAll;
if (cleanup) sequence = sequence.concat(cleanup);
- // Determine if crawling is enabled
- let shouldCrawl = false;
- if (config.crawl !== undefined) {
- // Explicit config setting takes precedence
- shouldCrawl = config.crawl === true;
- }
-
// Collect sitemap.xml URLs that should be crawled
const sitemapsToProcess = [];
for (const source of sequence) {
@@ -202,15 +195,8 @@ async function qualifyFiles({ config }) {
const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
if (isHttpUrl && isSitemapUrl) {
- // Determine if this specific URL should be crawled
- let crawlThisUrl = shouldCrawl;
-
- // If crawl config is not explicitly set, use protocol-based default
- if (config.crawl === undefined) {
- crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default
- }
-
- if (crawlThisUrl) {
+ // Check if crawling is enabled (defaults to false in config)
+ if (config.crawl === true) {
sitemapsToProcess.push(source);
}
}
From b11bd5085ab0667ea91083a69e510041306738f3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:47:59 +0000
Subject: [PATCH 8/9] Fix origin matching to use final URL after redirects
- Update crawlSitemap to track final URL after axios redirects
- Use responseUrl from axios response for origin comparison
- Fixes issue where www.doc-detective.com redirects to doc-detective.com
- Now correctly discovers 125 URLs from doc-detective.com sitemap
- All 66 tests passing
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/crawler.js b/src/crawler.js
index 4549b77..b97fd95 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -73,12 +73,19 @@ async function crawlSitemap({ config, sitemapUrl, log }) {
// Fetch the sitemap content
let content;
+ let finalUrl = sitemapUrl;
try {
const response = await axios.get(sitemapUrl, {
timeout: 30000,
maxRedirects: 5,
});
content = response.data;
+
+ // Use the final URL after redirects for origin comparison
+ if (response.request && response.request.res && response.request.res.responseUrl) {
+ finalUrl = response.request.res.responseUrl;
+ logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
+ }
} catch (error) {
logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
return discoveredUrls;
@@ -88,9 +95,9 @@ async function crawlSitemap({ config, sitemapUrl, log }) {
if (typeof content === "string") {
const extractedUrls = extractXmlSitemapUrls(content);
- // Filter URLs to only include same-origin URLs
+ // Filter URLs to only include same-origin URLs (using final URL after redirects)
for (const url of extractedUrls) {
- if (isSameOrigin(url, sitemapUrl)) {
+ if (isSameOrigin(url, finalUrl)) {
discoveredUrls.push(url);
} else {
logger(config, "debug", `Skipping cross-origin URL: ${url}`);
From 50dac4eb9e8eda86c40af2817affe271e94ae23f Mon Sep 17 00:00:00 2001
From: hawkeyexl
Date: Tue, 4 Nov 2025 13:34:31 -0500
Subject: [PATCH 9/9] Save as HTML
---
dev/index.js | 3 ++-
src/utils.js | 6 +++++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/dev/index.js b/dev/index.js
index 847624b..03afb33 100644
--- a/dev/index.js
+++ b/dev/index.js
@@ -12,7 +12,8 @@ main();
*/
async function main() {
const json = {
- input: "dev/doc-content.dita",
+ input: "https://www.doc-detective.com/sitemap.xml",
+ crawl: true,
logLevel: "debug",
runOn: [
{
diff --git a/src/utils.js b/src/utils.js
index 9253cdf..39bcd78 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -154,7 +154,11 @@ async function fetchFile(fileURL) {
} else {
response.data = response.data.toString();
}
- const fileName = fileURL.split("/").pop();
+ let fileName = fileURL.split("/").pop();
+ // If fileName doesn't have an extension, add ".html"
+ if (!path.extname(fileName)) {
+ fileName += ".html";
+ }
const hash = crypto.createHash("md5").update(response.data).digest("hex");
const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`;
// If doc-detective temp directory doesn't exist, create it