From 8c077113005d8da3e26bb82177416a3216910ce5 Mon Sep 17 00:00:00 2001
From: Jake Peterson
Date: Fri, 7 Nov 2025 07:56:29 -0800
Subject: [PATCH 1/2] minor: add websocket error code handlers

---
 firecrawl.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/firecrawl.go b/firecrawl.go
index 695dc40..8ff3349 100644
--- a/firecrawl.go
+++ b/firecrawl.go
@@ -698,7 +698,7 @@ func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, he
 	}
 	defer resp.Body.Close()
 
-	if resp.StatusCode != 502 {
+	if resp.StatusCode != 502 && resp.StatusCode != 503 {
 		break
 	}
 
@@ -821,12 +821,18 @@ func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string)
 	var message string
 
 	switch statusCode {
+	case 401:
+		message = fmt.Sprintf("Unauthorized: Failed to %s. %s", action, errorMessage)
 	case 402:
 		message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
+	case 403:
+		message = fmt.Sprintf("Forbidden: Failed to %s. %s", action, errorMessage)
 	case 408:
 		message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
 	case 409:
 		message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
+	case 429:
+		message = fmt.Sprintf("Too Many Requests: Failed to %s. %s", action, errorMessage)
 	case 500:
 		message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
 	default:

From 583b577ff365f8c591ae0b7eca0269209932b5b6 Mon Sep 17 00:00:00 2001
From: Jake Peterson
Date: Fri, 7 Nov 2025 07:56:39 -0800
Subject: [PATCH 2/2] Add batch scrape

---
 firecrawl.go      | 198 ++++++++++++++++++++++++++++++++++++++++++++++
 firecrawl_test.go | 135 +++++++++++++++++++++++++++++++
 2 files changed, 333 insertions(+)

diff --git a/firecrawl.go b/firecrawl.go
index 8ff3349..b51d480 100644
--- a/firecrawl.go
+++ b/firecrawl.go
@@ -144,6 +144,96 @@ type CancelCrawlJobResponse struct {
 	Status string `json:"status"`
 }
 
+// WebhookSpec represents a webhook specification
+type WebhookSpec struct {
+	URL      string            `json:"url"`
+	Headers  map[string]string `json:"headers,omitempty"`
+	Metadata map[string]any    `json:"metadata,omitempty"`
+	Events   []string          `json:"events,omitempty"`
+}
+
+// FormatSpec represents a format specification
+type FormatSpec struct {
+	Type     string         `json:"type"`
+	FullPage *bool          `json:"fullPage,omitempty"`
+	Quality  *int           `json:"quality,omitempty"`
+	Viewport *Viewport      `json:"viewport,omitempty"`
+	Schema   map[string]any `json:"schema,omitempty"`
+	Prompt   *string        `json:"prompt,omitempty"`
+	Modes    []string       `json:"modes,omitempty"`
+	Tag      *string        `json:"tag,omitempty"`
+}
+
+// Viewport represents viewport dimensions
+type Viewport struct {
+	Width  int `json:"width"`
+	Height int `json:"height"`
+}
+
+// ParserSpec represents a parser specification
+type ParserSpec struct {
+	Type     string `json:"type"`
+	MaxPages *int   `json:"maxPages,omitempty"`
+}
+
+// ActionSpec represents an action specification
+type ActionSpec struct {
+	Type         string    `json:"type"`
+	Milliseconds *int      `json:"milliseconds,omitempty"`
+	Selector     *string   `json:"selector,omitempty"`
+	FullPage     *bool     `json:"fullPage,omitempty"`
+	Quality      *int      `json:"quality,omitempty"`
+	Viewport     *Viewport `json:"viewport,omitempty"`
+	All          *bool     `json:"all,omitempty"`
+	Text         *string   `json:"text,omitempty"`
+	Key          *string   `json:"key,omitempty"`
+	Direction    *string   `json:"direction,omitempty"`
+	Script       *string   `json:"script,omitempty"`
+	Format       *string   `json:"format,omitempty"`
+	Landscape    *bool     `json:"landscape,omitempty"`
+	Scale        *float64  `json:"scale,omitempty"`
+}
+
+// LocationSpec represents location settings
+type LocationSpec struct {
+	Country   *string  `json:"country,omitempty"`
+	Languages []string `json:"languages,omitempty"`
+}
+
+// BatchScrapeParams represents the parameters for a batch scrape request
+type BatchScrapeParams struct {
+	URLs                []string          `json:"urls"`
+	Webhook             *WebhookSpec      `json:"webhook,omitempty"`
+	MaxConcurrency      *int              `json:"maxConcurrency,omitempty"`
+	IgnoreInvalidURLs   *bool             `json:"ignoreInvalidURLs,omitempty"`
+	Formats             []interface{}     `json:"formats,omitempty"`
+	OnlyMainContent     *bool             `json:"onlyMainContent,omitempty"`
+	IncludeTags         []string          `json:"includeTags,omitempty"`
+	ExcludeTags         []string          `json:"excludeTags,omitempty"`
+	MaxAge              *int              `json:"maxAge,omitempty"`
+	Headers             map[string]string `json:"headers,omitempty"`
+	WaitFor             *int              `json:"waitFor,omitempty"`
+	Mobile              *bool             `json:"mobile,omitempty"`
+	SkipTlsVerification *bool             `json:"skipTlsVerification,omitempty"`
+	Timeout             *int              `json:"timeout,omitempty"`
+	Parsers             []interface{}     `json:"parsers,omitempty"`
+	Actions             []ActionSpec      `json:"actions,omitempty"`
+	Location            *LocationSpec     `json:"location,omitempty"`
+	RemoveBase64Images  *bool             `json:"removeBase64Images,omitempty"`
+	BlockAds            *bool             `json:"blockAds,omitempty"`
+	Proxy               *string           `json:"proxy,omitempty"`
+	StoreInCache        *bool             `json:"storeInCache,omitempty"`
+	ZeroDataRetention   *bool             `json:"zeroDataRetention,omitempty"`
+}
+
+// BatchScrapeResponse represents the response for batch scrape operations
+type BatchScrapeResponse struct {
+	Success     bool     `json:"success"`
+	ID          string   `json:"id"`
+	URL         string   `json:"url"`
+	InvalidURLs []string `json:"invalidURLs,omitempty"`
+}
+
 // MapParams represents the parameters for a map request.
 type MapParams struct {
 	IncludeSubdomains *bool `json:"includeSubdomains,omitempty"`
@@ -626,6 +716,114 @@ func (app *FirecrawlApp) MapURL(url string, params *MapParams) (*MapResponse, er
 	}
 }
 
+// BatchScrape starts a batch scrape job for the specified URLs using the Firecrawl API.
+//
+// Parameters:
+//   - params: Parameters for the batch scrape request, including URLs and optional configuration.
+//
+// Returns:
+//   - *BatchScrapeResponse: The batch scrape response with job ID and URL.
+//   - error: An error if the batch scrape request fails.
+func (app *FirecrawlApp) BatchScrape(params *BatchScrapeParams) (*BatchScrapeResponse, error) {
+	if params == nil || len(params.URLs) == 0 {
+		return nil, fmt.Errorf("urls are required")
+	}
+
+	headers := app.prepareHeaders(nil)
+	batchBody := map[string]any{
+		"urls": params.URLs,
+	}
+
+	if params.Webhook != nil {
+		batchBody["webhook"] = params.Webhook
+	}
+	if params.MaxConcurrency != nil {
+		batchBody["maxConcurrency"] = params.MaxConcurrency
+	}
+	if params.IgnoreInvalidURLs != nil {
+		batchBody["ignoreInvalidURLs"] = params.IgnoreInvalidURLs
+	}
+	if params.Formats != nil {
+		batchBody["formats"] = params.Formats
+	}
+	if params.OnlyMainContent != nil {
+		batchBody["onlyMainContent"] = params.OnlyMainContent
+	}
+	if params.IncludeTags != nil {
+		batchBody["includeTags"] = params.IncludeTags
+	}
+	if params.ExcludeTags != nil {
+		batchBody["excludeTags"] = params.ExcludeTags
+	}
+	if params.MaxAge != nil {
+		batchBody["maxAge"] = params.MaxAge
+	}
+	if params.Headers != nil {
+		batchBody["headers"] = params.Headers
+	}
+	if params.WaitFor != nil {
+		batchBody["waitFor"] = params.WaitFor
+	}
+	if params.Mobile != nil {
+		batchBody["mobile"] = params.Mobile
+	}
+	if params.SkipTlsVerification != nil {
+		batchBody["skipTlsVerification"] = params.SkipTlsVerification
+	}
+	if params.Timeout != nil {
+		batchBody["timeout"] = params.Timeout
+	}
+	if params.Parsers != nil {
+		batchBody["parsers"] = params.Parsers
+	}
+	if params.Actions != nil {
+		batchBody["actions"] = params.Actions
+	}
+	if params.Location != nil {
+		batchBody["location"] = params.Location
+	}
+	if params.RemoveBase64Images != nil {
+		batchBody["removeBase64Images"] = params.RemoveBase64Images
+	}
+	if params.BlockAds != nil {
+		batchBody["blockAds"] = params.BlockAds
+	}
+	if params.Proxy != nil {
+		batchBody["proxy"] = params.Proxy
+	}
+	if params.StoreInCache != nil {
+		batchBody["storeInCache"] = params.StoreInCache
+	}
+	if params.ZeroDataRetention != nil {
+		batchBody["zeroDataRetention"] = params.ZeroDataRetention
+	}
+
+	resp, err := app.makeRequest(
+		http.MethodPost,
+		fmt.Sprintf("%s/v2/batch/scrape", app.APIURL),
+		batchBody,
+		headers,
+		"start batch scrape",
+		withRetries(3),
+		withBackoff(500),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	var batchResponse BatchScrapeResponse
+	err = json.Unmarshal(resp, &batchResponse)
+	if err != nil {
+		return nil, err
+	}
+
+	if !batchResponse.Success {
+		return nil, fmt.Errorf("failed to start batch scrape")
+	}
+
+	return &batchResponse, nil
+}
+
 // SearchURL searches for a URL using the Firecrawl API.
 //
 // Parameters:
diff --git a/firecrawl_test.go b/firecrawl_test.go
index d012bf8..5aff76d 100644
--- a/firecrawl_test.go
+++ b/firecrawl_test.go
@@ -3,6 +3,7 @@ package firecrawl
 import (
 	"log"
 	"os"
+	"strings"
 	"testing"
 	"time"
 
@@ -544,3 +545,137 @@ func TestScrapeURLWithJSONOptions(t *testing.T) {
 	// Check that the extracted data contains the expected fields
 	assert.Contains(t, response.JSON, "mission")
 }
+
+func TestBatchScrapeInvalidAPIKey(t *testing.T) {
+	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{"https://www.scrapethissite.com"},
+	}
+	_, err = app.BatchScrape(params)
+	assert.Error(t, err)
+	assert.True(t,
+		strings.Contains(err.Error(), "Unauthorized") ||
+			strings.Contains(err.Error(), "Internal Server Error") ||
+			strings.Contains(err.Error(), "Status code 401") ||
+			strings.Contains(err.Error(), "Status code 500"),
+		"Expected error to contain 'Unauthorized' or server error, got: %s", err.Error())
+}
+
+func TestBatchScrapeE2E(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{
+			"https://www.scrapethissite.com",
+			"https://roastmywebsite.ai",
+		},
+		Formats:         []interface{}{"markdown"},
+		OnlyMainContent: ptr(true),
+	}
+
+	response, err := app.BatchScrape(params)
+	require.NoError(t, err)
+	assert.NotNil(t, response)
+	assert.True(t, response.Success)
+	assert.NotEmpty(t, response.ID)
+	assert.NotEmpty(t, response.URL)
+}
+
+func TestBatchScrapeWithOptionsE2E(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{
+			"https://www.scrapethissite.com",
+		},
+		Formats:            []interface{}{"markdown", "html"},
+		OnlyMainContent:    ptr(true),
+		MaxConcurrency:     ptr(2),
+		IgnoreInvalidURLs:  ptr(true),
+		MaxAge:             ptr(3600000),
+		WaitFor:            ptr(1000),
+		Mobile:             ptr(false),
+		BlockAds:           ptr(true),
+		RemoveBase64Images: ptr(true),
+		StoreInCache:       ptr(true),
+	}
+
+	response, err := app.BatchScrape(params)
+	require.NoError(t, err)
+	assert.NotNil(t, response)
+	assert.True(t, response.Success)
+	assert.NotEmpty(t, response.ID)
+	assert.NotEmpty(t, response.URL)
+}
+
+func TestBatchScrapeWithWebhookE2E(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{
+			"https://www.scrapethissite.com",
+		},
+		Formats: []interface{}{"markdown"},
+		Webhook: &WebhookSpec{
+			URL:    "https://example.com/webhook",
+			Events: []string{"completed", "page"},
+		},
+	}
+
+	response, err := app.BatchScrape(params)
+	require.NoError(t, err)
+	assert.NotNil(t, response)
+	assert.True(t, response.Success)
+	assert.NotEmpty(t, response.ID)
+}
+
+func TestBatchScrapeWithInvalidURLs(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{
+			"https://www.scrapethissite.com",
+			"not-a-valid-url",
+			"https://roastmywebsite.ai",
+		},
+		Formats:           []interface{}{"markdown"},
+		IgnoreInvalidURLs: ptr(true),
+	}
+
+	response, err := app.BatchScrape(params)
+	require.NoError(t, err)
+	assert.NotNil(t, response)
+	assert.True(t, response.Success)
+	assert.NotEmpty(t, response.ID)
+	if response.InvalidURLs != nil {
+		assert.Contains(t, response.InvalidURLs, "not-a-valid-url")
+	}
+}
+
+func TestBatchScrapeEmptyURLs(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	params := &BatchScrapeParams{
+		URLs: []string{},
+	}
+
+	_, err = app.BatchScrape(params)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "urls are required")
+}
+
+func TestBatchScrapeNilParams(t *testing.T) {
+	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
+	require.NoError(t, err)
+
+	_, err = app.BatchScrape(nil)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "urls are required")
+}
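
For reviewers, a minimal usage sketch of the new BatchScrape entry point (not part of the diff above). It assumes the module path github.com/mendableai/firecrawl-go with package name firecrawl, an API key in the FIRECRAWL_API_KEY environment variable, and https://api.firecrawl.dev as the base URL; all of these are illustrative and should be adjusted to your setup.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/mendableai/firecrawl-go"
)

func main() {
	// NewFirecrawlApp is the existing SDK constructor; key and base URL here are placeholders.
	app, err := firecrawl.NewFirecrawlApp(os.Getenv("FIRECRAWL_API_KEY"), "https://api.firecrawl.dev")
	if err != nil {
		log.Fatal(err)
	}

	onlyMain := true
	params := &firecrawl.BatchScrapeParams{
		URLs:            []string{"https://firecrawl.dev", "https://example.com"},
		Formats:         []interface{}{"markdown"},
		OnlyMainContent: &onlyMain,
	}

	// BatchScrape only starts the asynchronous job; the response carries the
	// job ID and a status URL that can be polled for the scraped documents.
	resp, err := app.BatchScrape(params)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("batch job started:", resp.ID, resp.URL)
}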