Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 205 additions & 1 deletion firecrawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,96 @@ type CancelCrawlJobResponse struct {
Status string `json:"status"`
}

// WebhookSpec represents a webhook specification: the endpoint Firecrawl
// notifies about batch/crawl job events, with optional delivery headers,
// caller-supplied metadata, and an event filter.
type WebhookSpec struct {
	URL      string            `json:"url"`                // destination endpoint for webhook deliveries
	Headers  map[string]string `json:"headers,omitempty"`  // extra HTTP headers sent with each delivery
	Metadata map[string]any    `json:"metadata,omitempty"` // arbitrary caller-supplied metadata forwarded with the webhook
	Events   []string          `json:"events,omitempty"`   // event names to subscribe to, e.g. "completed", "page"
}

// FormatSpec represents a format specification for scrape output.
// Type selects the output format; the remaining optional fields configure
// specific format types (which fields apply to which format is defined by
// the Firecrawl API — confirm against the API docs). Unset fields are
// omitted from the JSON payload.
type FormatSpec struct {
	Type     string         `json:"type"`               // format name, e.g. "markdown", "html", "screenshot"
	FullPage *bool          `json:"fullPage,omitempty"` // capture full page rather than just the viewport (screenshot-style formats)
	Quality  *int           `json:"quality,omitempty"`  // image quality setting — presumably for screenshot output; verify
	Viewport *Viewport      `json:"viewport,omitempty"` // viewport dimensions to render with
	Schema   map[string]any `json:"schema,omitempty"`   // extraction schema for structured-output formats
	Prompt   *string        `json:"prompt,omitempty"`   // extraction prompt for structured-output formats
	Modes    []string       `json:"modes,omitempty"`    // format-specific mode switches — semantics defined by the API
	Tag      *string        `json:"tag,omitempty"`      // format-specific tag — semantics defined by the API
}

// Viewport represents viewport dimensions in pixels.
type Viewport struct {
	Width  int `json:"width"`  // viewport width
	Height int `json:"height"` // viewport height
}

// ParserSpec represents a parser specification applied to fetched content.
type ParserSpec struct {
	Type     string `json:"type"`               // parser name — valid values defined by the Firecrawl API
	MaxPages *int   `json:"maxPages,omitempty"` // cap on pages the parser processes; nil leaves the API default
}

// ActionSpec represents an action specification performed on the page before
// scraping. Type selects the action; the optional fields below are
// per-action parameters (which field belongs to which action type is defined
// by the Firecrawl API — confirm there). Unset fields are omitted from the
// JSON payload.
type ActionSpec struct {
	Type         string    `json:"type"`                   // action kind, e.g. wait/click/screenshot — see API docs
	Milliseconds *int      `json:"milliseconds,omitempty"` // duration, presumably for wait-style actions
	Selector     *string   `json:"selector,omitempty"`     // CSS selector the action targets
	FullPage     *bool     `json:"fullPage,omitempty"`     // full-page capture toggle (screenshot-style actions)
	Quality      *int      `json:"quality,omitempty"`      // image quality (screenshot-style actions)
	Viewport     *Viewport `json:"viewport,omitempty"`     // viewport dimensions for the action
	All          *bool     `json:"all,omitempty"`          // apply to all matches of Selector — verify semantics with API
	Text         *string   `json:"text,omitempty"`         // text payload, e.g. for typing actions
	Key          *string   `json:"key,omitempty"`          // key identifier, e.g. for key-press actions
	Direction    *string   `json:"direction,omitempty"`    // direction, e.g. for scroll actions
	Script       *string   `json:"script,omitempty"`       // script source, e.g. for execute-JS actions
	Format       *string   `json:"format,omitempty"`       // output format of the action result
	Landscape    *bool     `json:"landscape,omitempty"`    // landscape orientation toggle — presumably PDF-style output
	Scale        *float64  `json:"scale,omitempty"`        // scale factor — presumably PDF/screenshot output
}

// LocationSpec represents location settings used when fetching pages
// (geo/language emulation).
type LocationSpec struct {
	Country   *string  `json:"country,omitempty"`   // country code to emulate; nil leaves the API default
	Languages []string `json:"languages,omitempty"` // preferred languages, presumably Accept-Language style — verify
}

// BatchScrapeParams represents the parameters for a batch scrape request.
// URLs is required; every other field is optional and omitted from the
// request body when unset (nil). Pointer fields distinguish "not set" from
// an explicit zero value.
type BatchScrapeParams struct {
	URLs                []string          `json:"urls"`                          // pages to scrape (required)
	Webhook             *WebhookSpec      `json:"webhook,omitempty"`             // webhook notified about job events
	MaxConcurrency      *int              `json:"maxConcurrency,omitempty"`      // cap on concurrent scrapes
	IgnoreInvalidURLs   *bool             `json:"ignoreInvalidURLs,omitempty"`   // skip bad URLs instead of failing the job
	Formats             []any             `json:"formats,omitempty"`             // output formats: strings (e.g. "markdown") or FormatSpec values
	OnlyMainContent     *bool             `json:"onlyMainContent,omitempty"`     // strip boilerplate, keep main content
	IncludeTags         []string          `json:"includeTags,omitempty"`         // HTML tags/selectors to keep
	ExcludeTags         []string          `json:"excludeTags,omitempty"`         // HTML tags/selectors to drop
	MaxAge              *int              `json:"maxAge,omitempty"`              // max cache age — presumably milliseconds; verify with API docs
	Headers             map[string]string `json:"headers,omitempty"`             // extra request headers sent to target sites
	WaitFor             *int              `json:"waitFor,omitempty"`             // wait before scraping — presumably milliseconds; verify
	Mobile              *bool             `json:"mobile,omitempty"`              // emulate a mobile device
	SkipTlsVerification *bool             `json:"skipTlsVerification,omitempty"` // disable TLS certificate checks
	Timeout             *int              `json:"timeout,omitempty"`             // per-request timeout — presumably milliseconds; verify
	Parsers             []any             `json:"parsers,omitempty"`             // parsers: strings or ParserSpec values
	Actions             []ActionSpec      `json:"actions,omitempty"`             // page actions performed before scraping
	Location            *LocationSpec     `json:"location,omitempty"`            // geo/language emulation settings
	RemoveBase64Images  *bool             `json:"removeBase64Images,omitempty"`  // drop inline base64 images from output
	BlockAds            *bool             `json:"blockAds,omitempty"`            // block ads while rendering
	Proxy               *string           `json:"proxy,omitempty"`               // proxy selection — valid values defined by the API
	StoreInCache        *bool             `json:"storeInCache,omitempty"`        // store results in Firecrawl's cache
	ZeroDataRetention   *bool             `json:"zeroDataRetention,omitempty"`   // request zero data retention for this job
}

// BatchScrapeResponse represents the response for batch scrape operations
// (the job-creation acknowledgement, not the scraped documents themselves).
type BatchScrapeResponse struct {
	Success     bool     `json:"success"`               // true when the API accepted the job
	ID          string   `json:"id"`                    // identifier of the created batch job
	URL         string   `json:"url"`                   // job URL returned by the API — presumably for polling status; verify
	InvalidURLs []string `json:"invalidURLs,omitempty"` // inputs the API rejected, when reported
}

// MapParams represents the parameters for a map request.
type MapParams struct {
IncludeSubdomains *bool `json:"includeSubdomains,omitempty"`
Expand Down Expand Up @@ -626,6 +716,114 @@ func (app *FirecrawlApp) MapURL(url string, params *MapParams) (*MapResponse, er
}
}

// BatchScrape starts a batch scrape job for the specified URLs using the Firecrawl API.
//
// Parameters:
//   - params: Parameters for the batch scrape request, including URLs (required)
//     and optional configuration; nil/unset optional fields are omitted from
//     the request body.
//
// Returns:
//   - *BatchScrapeResponse: The batch scrape response with job ID and URL.
//   - error: An error if params is nil, no URLs were given, the HTTP request
//     fails, the response cannot be decoded, or the API reports failure.
func (app *FirecrawlApp) BatchScrape(params *BatchScrapeParams) (*BatchScrapeResponse, error) {
	if params == nil || len(params.URLs) == 0 {
		return nil, fmt.Errorf("urls are required")
	}

	headers := app.prepareHeaders(nil)
	batchBody := map[string]any{
		"urls": params.URLs,
	}

	// Optional pointer-valued settings: each is forwarded only when the
	// caller explicitly set it (non-nil pointer).
	putIfSet(batchBody, "webhook", params.Webhook)
	putIfSet(batchBody, "maxConcurrency", params.MaxConcurrency)
	putIfSet(batchBody, "ignoreInvalidURLs", params.IgnoreInvalidURLs)
	putIfSet(batchBody, "onlyMainContent", params.OnlyMainContent)
	putIfSet(batchBody, "maxAge", params.MaxAge)
	putIfSet(batchBody, "waitFor", params.WaitFor)
	putIfSet(batchBody, "mobile", params.Mobile)
	putIfSet(batchBody, "skipTlsVerification", params.SkipTlsVerification)
	putIfSet(batchBody, "timeout", params.Timeout)
	putIfSet(batchBody, "location", params.Location)
	putIfSet(batchBody, "removeBase64Images", params.RemoveBase64Images)
	putIfSet(batchBody, "blockAds", params.BlockAds)
	putIfSet(batchBody, "proxy", params.Proxy)
	putIfSet(batchBody, "storeInCache", params.StoreInCache)
	putIfSet(batchBody, "zeroDataRetention", params.ZeroDataRetention)

	// Slice- and map-valued settings: any non-nil value (even an empty one)
	// is forwarded, preserving the caller's explicit choice.
	if params.Formats != nil {
		batchBody["formats"] = params.Formats
	}
	if params.IncludeTags != nil {
		batchBody["includeTags"] = params.IncludeTags
	}
	if params.ExcludeTags != nil {
		batchBody["excludeTags"] = params.ExcludeTags
	}
	if params.Headers != nil {
		batchBody["headers"] = params.Headers
	}
	if params.Parsers != nil {
		batchBody["parsers"] = params.Parsers
	}
	if params.Actions != nil {
		batchBody["actions"] = params.Actions
	}

	resp, err := app.makeRequest(
		http.MethodPost,
		fmt.Sprintf("%s/v2/batch/scrape", app.APIURL),
		batchBody,
		headers,
		"start batch scrape",
		withRetries(3),
		withBackoff(500),
	)
	if err != nil {
		return nil, err
	}

	var batchResponse BatchScrapeResponse
	if err := json.Unmarshal(resp, &batchResponse); err != nil {
		return nil, err
	}

	// The API can return 200 with success=false; surface that as an error.
	if !batchResponse.Success {
		return nil, fmt.Errorf("failed to start batch scrape")
	}

	return &batchResponse, nil
}

// putIfSet stores key -> v in body only when the optional pointer v is
// non-nil, mirroring the repeated nil-check-then-assign pattern used when
// building request bodies.
func putIfSet[T any](body map[string]any, key string, v *T) {
	if v != nil {
		body[key] = v
	}
}

// SearchURL searches for a URL using the Firecrawl API.
//
// Parameters:
Expand Down Expand Up @@ -698,7 +896,7 @@ func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, he
}
defer resp.Body.Close()

if resp.StatusCode != 502 {
if resp.StatusCode != 502 && resp.StatusCode != 503 {
break
}

Expand Down Expand Up @@ -821,12 +1019,18 @@ func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string)

var message string
switch statusCode {
case 401:
message = fmt.Sprintf("Unauthorized: Failed to %s. %s", action, errorMessage)
case 402:
message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
case 403:
message = fmt.Sprintf("Forbidden: Failed to %s. %s", action, errorMessage)
case 408:
message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
case 409:
message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
case 429:
message = fmt.Sprintf("Too Many Requests: Failed to %s. %s", action, errorMessage)
case 500:
message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
default:
Expand Down
135 changes: 135 additions & 0 deletions firecrawl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package firecrawl
import (
"log"
"os"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -544,3 +545,137 @@ func TestScrapeURLWithJSONOptions(t *testing.T) {
// Check that the extracted data contains the expected fields
assert.Contains(t, response.JSON, "mission")
}

// TestBatchScrapeInvalidAPIKey verifies that starting a batch scrape with a
// bogus API key surfaces an authorization (or server-side) error.
func TestBatchScrapeInvalidAPIKey(t *testing.T) {
	client, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = client.BatchScrape(&BatchScrapeParams{
		URLs: []string{"https://www.scrapethissite.com"},
	})
	assert.Error(t, err)

	msg := err.Error()
	authFailure := strings.Contains(msg, "Unauthorized") ||
		strings.Contains(msg, "Internal Server Error") ||
		strings.Contains(msg, "Status code 401") ||
		strings.Contains(msg, "Status code 500")
	assert.True(t, authFailure,
		"Expected error to contain 'Unauthorized' or server error, got: %s", msg)
}

// TestBatchScrapeE2E starts a real batch scrape over two sites and checks
// that the API acknowledges the job with an ID and a URL.
func TestBatchScrapeE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	req := &BatchScrapeParams{
		URLs: []string{
			"https://www.scrapethissite.com",
			"https://roastmywebsite.ai",
		},
		Formats:         []interface{}{"markdown"},
		OnlyMainContent: ptr(true),
	}

	resp, err := client.BatchScrape(req)
	require.NoError(t, err)
	assert.NotNil(t, resp)
	assert.True(t, resp.Success)
	assert.NotEmpty(t, resp.ID)
	assert.NotEmpty(t, resp.URL)
}

// TestBatchScrapeWithOptionsE2E exercises a batch scrape with a wide set of
// optional parameters and checks the job-creation acknowledgement.
func TestBatchScrapeWithOptionsE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	opts := &BatchScrapeParams{
		URLs:               []string{"https://www.scrapethissite.com"},
		Formats:            []interface{}{"markdown", "html"},
		OnlyMainContent:    ptr(true),
		MaxConcurrency:     ptr(2),
		IgnoreInvalidURLs:  ptr(true),
		MaxAge:             ptr(3600000),
		WaitFor:            ptr(1000),
		Mobile:             ptr(false),
		BlockAds:           ptr(true),
		RemoveBase64Images: ptr(true),
		StoreInCache:       ptr(true),
	}

	resp, err := client.BatchScrape(opts)
	require.NoError(t, err)
	assert.NotNil(t, resp)
	assert.True(t, resp.Success)
	assert.NotEmpty(t, resp.ID)
	assert.NotEmpty(t, resp.URL)
}

// TestBatchScrapeWithWebhookE2E starts a batch scrape that registers a
// webhook and checks that the job is accepted.
func TestBatchScrapeWithWebhookE2E(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	hook := &WebhookSpec{
		URL:    "https://example.com/webhook",
		Events: []string{"completed", "page"},
	}
	resp, err := client.BatchScrape(&BatchScrapeParams{
		URLs:    []string{"https://www.scrapethissite.com"},
		Formats: []interface{}{"markdown"},
		Webhook: hook,
	})
	require.NoError(t, err)
	assert.NotNil(t, resp)
	assert.True(t, resp.Success)
	assert.NotEmpty(t, resp.ID)
}

// TestBatchScrapeWithInvalidURLs submits a mix of valid and invalid URLs
// with IgnoreInvalidURLs set, and checks that the job succeeds and (when
// reported) the bad URL appears in InvalidURLs.
func TestBatchScrapeWithInvalidURLs(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	resp, err := client.BatchScrape(&BatchScrapeParams{
		URLs: []string{
			"https://www.scrapethissite.com",
			"not-a-valid-url",
			"https://roastmywebsite.ai",
		},
		Formats:           []interface{}{"markdown"},
		IgnoreInvalidURLs: ptr(true),
	})
	require.NoError(t, err)
	assert.NotNil(t, resp)
	assert.True(t, resp.Success)
	assert.NotEmpty(t, resp.ID)
	if resp.InvalidURLs != nil {
		assert.Contains(t, resp.InvalidURLs, "not-a-valid-url")
	}
}

// TestBatchScrapeEmptyURLs checks that an empty URL list is rejected
// client-side before any request is made.
func TestBatchScrapeEmptyURLs(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	_, err = client.BatchScrape(&BatchScrapeParams{URLs: []string{}})
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "urls are required")
}

// TestBatchScrapeNilParams checks that a nil params struct is rejected
// client-side with the same validation error as an empty URL list.
func TestBatchScrapeNilParams(t *testing.T) {
	client, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	_, err = client.BatchScrape(nil)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "urls are required")
}