diff --git a/create-url-list/.gitignore b/create-url-list/.gitignore
new file mode 100644
index 0000000..15c097f
--- /dev/null
+++ b/create-url-list/.gitignore
@@ -0,0 +1,4 @@
output/
test-output/
create-url-list
config.yml
diff --git a/create-url-list/README.md b/create-url-list/README.md
new file mode 100644
index 0000000..6287e9d
--- /dev/null
+++ b/create-url-list/README.md
@@ -0,0 +1,109 @@
# create-url-list

A Go CLI tool that extracts and ranks URLs by pageviews from CSV data containing page analytics.

## Build

```bash
go build
```

## Usage

```bash
./create-url-list [--quiet] <csv-file-path> [range] [output-path]
```

### Arguments

1. **--quiet** (optional): Suppress all informational output (warnings, info messages, and success messages). Only errors will be displayed. Useful when using this tool in pipelines.
2. **csv-file-path** (required): Path to the input CSV file
3. **range** (optional): Rank range in format `min-max` (e.g., `1-50`). Default: `1-250`
   - Specifies which ranked entries to include in the output
   - `1-50` means "get the top 50 pages by pageviews"
   - `51-100` means "get pages ranked 51-100 by pageviews"
4. **output-path** (optional): Custom output file path.
Default: `output/YYYY-MM-DD_HH-MM-SS_range.csv` + +### Examples + +```bash +# Get top 250 pages by pageviews (default) +./create-url-list data.csv + +# Get top 50 pages by pageviews +./create-url-list data.csv 1-50 + +# Get pages ranked 101-200 by pageviews +./create-url-list data.csv 101-200 + +# Specify custom output path +./create-url-list data.csv 1-100 results/top-100.csv + +# Use in a pipeline with quiet mode (no informational output) +./create-url-list --quiet data.csv 1-50 output.csv +``` + +## Input Requirements + +The input CSV file must contain the following columns: +- `Page`: URL of the page (must start with `www.`) +- `Measure Names`: Type of metric +- `Measure Values`: Integer value of the metric + +The tool will: +- Collect all rows where `Measure Names` equals `Pageviews` +- Rank them by `Measure Values` (highest to lowest) +- Extract entries within the specified rank range +- Validate that URLs start with `www.` (to ensure consistent format without `https://`) + +## Output + +The output CSV file contains two columns (no headers): +1. Rank (integer) - Position in the ranking (1 = highest pageviews) +2. URL (string) - Page URL + +Rows are sorted by rank in ascending order (rank 1 first). 
+ +## Configuration (Optional) + +You can create a `config.yml` file in the same directory as the executable to configure URL filtering and output format: + +```yaml +# List of URLs to ignore from the output +ignore_urls: + - www.example.com/page-to-ignore + - www.example.com/another-page-to-ignore + +# Whether to show pageviews as a third column in the output +show_pageviews: true + +# Whether to include headers in the output CSV +show_headers: true +``` + +### Configuration Options + +**`ignore_urls`** (optional) +- URLs listed here will be completely removed from the ranking (not just hidden) +- Excluded before ranking is calculated, so remaining URLs move up without gaps +- For example, if you ignore rank #2, the former rank #3 becomes the new rank #2 + +**`show_pageviews`** (optional, default: `false`) +- When `false`: Output contains 2 columns (rank, URL) +- When `true`: Output contains 3 columns (rank, URL, pageviews) + +**`show_headers`** (optional, default: `false`) +- When `false`: No headers in output (just data rows) +- When `true`: Adds header row with column names + - Without pageviews: `Rank,Page` + - With pageviews: `Rank,Page,Number of Page Views` + +The config file is optional. If it doesn't exist or can't be loaded, the tool will display a warning and continue with default settings. 
+ +## Error Handling + +The tool exits with code 1 and displays an error message if: +- Input file path is invalid or file doesn't exist +- Required columns are missing from the CSV +- URL structure doesn't match expected format (must start with `www.`) +- Range format is invalid diff --git a/create-url-list/config.yml.example b/create-url-list/config.yml.example new file mode 100644 index 0000000..88078e0 --- /dev/null +++ b/create-url-list/config.yml.example @@ -0,0 +1,20 @@ +# Configuration file for create-url-list +# Copy this file to config.yml and customize as needed + +# List of URLs to ignore from the output +# These URLs will be filtered out before ranking, so they won't create gaps +# in the ranking numbers +ignore_urls: + - www.example.com/page-to-ignore + - www.example.com/another-page-to-ignore + +# Whether to show pageviews as a third column in the output +# Default: false (output only rank and URL) +# When true: output rank, URL, and pageviews +show_pageviews: false + +# Whether to include headers in the output CSV +# Default: false (no headers) +# When true: adds "Rank", "Page", and optionally "Number of Page Views" as headers +show_headers: false + diff --git a/create-url-list/go.mod b/create-url-list/go.mod new file mode 100644 index 0000000..d9d28dc --- /dev/null +++ b/create-url-list/go.mod @@ -0,0 +1,5 @@ +module create-url-list + +go 1.25.4 + +require gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/create-url-list/go.sum b/create-url-list/go.sum new file mode 100644 index 0000000..4bc0337 --- /dev/null +++ b/create-url-list/go.sum @@ -0,0 +1,3 @@ +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/create-url-list/main.go b/create-url-list/main.go new file mode 100644 index 0000000..ebb3565 --- /dev/null +++ 
b/create-url-list/main.go @@ -0,0 +1,341 @@ +package main + +import ( + "encoding/csv" + "fmt" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "gopkg.in/yaml.v3" +) + +type Record struct { + Page string + MeasureValues int +} + +type Config struct { + IgnoreURLs []string `yaml:"ignore_urls"` + ShowPageviews bool `yaml:"show_pageviews"` + ShowHeaders bool `yaml:"show_headers"` +} + +func main() { + if err := run(); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func run() error { + // Parse command-line arguments + if len(os.Args) < 2 { + return fmt.Errorf("usage: %s [--quiet] [range] [output-path]", os.Args[0]) + } + + // Check for --quiet flag + quiet := false + args := os.Args[1:] + if len(args) > 0 && args[0] == "--quiet" { + quiet = true + args = args[1:] // Remove --quiet from args + } + + if len(args) < 1 { + return fmt.Errorf("usage: %s [--quiet] [range] [output-path]", os.Args[0]) + } + + inputPath := args[0] + rangeStr := "1-250" // default range + outputPath := "" + + if len(args) >= 2 { + rangeStr = args[1] + } + if len(args) >= 3 { + outputPath = args[2] + } + + // Parse range + minVal, maxVal, err := parseRange(rangeStr) + if err != nil { + return fmt.Errorf("invalid range format: %v", err) + } + + // Validate input file exists + if _, err := os.Stat(inputPath); os.IsNotExist(err) { + return fmt.Errorf("input file does not exist: %s", inputPath) + } + + // Load config (optional) + config, err := loadConfig("config.yml") + if err != nil { + // Config is optional, so we just log a warning if it fails + if !quiet { + fmt.Fprintf(os.Stderr, "Warning: Could not load config.yml: %v\n", err) + } + config = &Config{} // Use empty config + } + + // Read and process CSV + records, err := processCSV(inputPath, config.IgnoreURLs, quiet) + if err != nil { + return err + } + + // Generate output + outputFilePath, err := writeOutput(records, outputPath, rangeStr, minVal, maxVal, config.ShowPageviews, 
config.ShowHeaders) + if err != nil { + return err + } + + // Print success message + if !quiet { + fmt.Printf("Successfully parsed input file `%s` and created output file at `%s`\n", inputPath, outputFilePath) + } + + return nil +} + +func parseRange(rangeStr string) (int, int, error) { + parts := strings.Split(rangeStr, "-") + if len(parts) != 2 { + return 0, 0, fmt.Errorf("range must be in format 'min-max'") + } + + min, err := strconv.Atoi(strings.TrimSpace(parts[0])) + if err != nil { + return 0, 0, fmt.Errorf("invalid minimum value: %v", err) + } + + max, err := strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + return 0, 0, fmt.Errorf("invalid maximum value: %v", err) + } + + if min > max { + return 0, 0, fmt.Errorf("minimum value cannot be greater than maximum value") + } + + return min, max, nil +} + +func loadConfig(configPath string) (*Config, error) { + // Check if config file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + return nil, fmt.Errorf("config file does not exist: %s", configPath) + } + + // Read config file + data, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %v", err) + } + + // Parse YAML + var config Config + if err := yaml.Unmarshal(data, &config); err != nil { + return nil, fmt.Errorf("failed to parse config file: %v", err) + } + + return &config, nil +} + +func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, error) { + file, err := os.Open(inputPath) + if err != nil { + return nil, fmt.Errorf("failed to open file: %v", err) + } + defer file.Close() + + reader := csv.NewReader(file) + + // Read header + header, err := reader.Read() + if err != nil { + return nil, fmt.Errorf("failed to read header: %v", err) + } + + // Find column indices + pageIdx, measureNamesIdx, measureValuesIdx := -1, -1, -1 + for i, col := range header { + switch col { + case "Page": + pageIdx = i + case "Measure Names": + measureNamesIdx = i + case 
"Measure Values": + measureValuesIdx = i + } + } + + // Validate required columns exist + if pageIdx == -1 || measureNamesIdx == -1 || measureValuesIdx == -1 { + return nil, fmt.Errorf("missing required columns (Page, Measure Names, Measure Values)") + } + + // Create a map for fast lookup of ignored URLs + ignoreMap := make(map[string]bool) + for _, url := range ignoreURLs { + ignoreMap[url] = true + } + + // Read and collect all Pageviews records + var records []Record + var skippedURLs []string + var ignoredURLs []string + for { + row, err := reader.Read() + if err != nil { + break // EOF or error + } + + // Skip if not enough columns + if len(row) <= pageIdx || len(row) <= measureNamesIdx || len(row) <= measureValuesIdx { + continue + } + + // Filter by Measure Names = "Pageviews" + if row[measureNamesIdx] != "Pageviews" { + continue + } + + // Validate URL structure + page := row[pageIdx] + if !strings.HasPrefix(page, "www.") { + skippedURLs = append(skippedURLs, page) + continue + } + + // Check if URL should be ignored + if ignoreMap[page] { + ignoredURLs = append(ignoredURLs, page) + continue + } + + // Parse Measure Values + measureValue, err := strconv.Atoi(row[measureValuesIdx]) + if err != nil { + continue // Skip non-integer values + } + + records = append(records, Record{ + Page: page, + MeasureValues: measureValue, + }) + } + + // Report skipped URLs + if !quiet && len(skippedURLs) > 0 { + fmt.Fprintf(os.Stderr, "Warning: Skipped %d URL(s) that do not match expected structure (www.*):\n", len(skippedURLs)) + for _, url := range skippedURLs { + fmt.Fprintf(os.Stderr, " - %s\n", url) + } + } + + // Report ignored URLs + if !quiet && len(ignoredURLs) > 0 { + fmt.Fprintf(os.Stderr, "Info: Ignored %d URL(s) from config:\n", len(ignoredURLs)) + for _, url := range ignoredURLs { + fmt.Fprintf(os.Stderr, " - %s\n", url) + } + } + + return records, nil +} + +func writeOutput(records []Record, outputPath, rangeStr string, minRank, maxRank int, showPageviews, 
showHeaders bool) (string, error) { + // Sort by Measure Values (highest to lowest) to establish ranking + sort.Slice(records, func(i, j int) bool { + return records[i].MeasureValues > records[j].MeasureValues + }) + + // Slice to get only the entries within the specified rank range + // minRank and maxRank are 1-based, so we need to convert to 0-based indices + startIdx := minRank - 1 + endIdx := maxRank + + // Ensure we don't go out of bounds + if startIdx < 0 { + startIdx = 0 + } + if endIdx > len(records) { + endIdx = len(records) + } + if startIdx >= len(records) { + // No records in this range + records = []Record{} + } else { + records = records[startIdx:endIdx] + } + + // Determine output directory and filename + var outputDir, filename string + if outputPath != "" { + outputDir = filepath.Dir(outputPath) + filename = filepath.Base(outputPath) + } else { + outputDir = "output" + // Generate filename: YYYY-MM-DD_HH-MM-SS_range.csv + now := time.Now() + filename = fmt.Sprintf("%s_%s.csv", + now.Format("2006-01-02_15-04-05"), + rangeStr) + } + + // Create output directory if it doesn't exist + if err := os.MkdirAll(outputDir, 0755); err != nil { + return "", fmt.Errorf("failed to create output directory: %v", err) + } + + // Create output file + outputFilePath := filepath.Join(outputDir, filename) + file, err := os.Create(outputFilePath) + if err != nil { + return "", fmt.Errorf("failed to create output file: %v", err) + } + defer file.Close() + + writer := csv.NewWriter(file) + defer writer.Flush() + + // Write headers if enabled + if showHeaders { + var headers []string + if showPageviews { + headers = []string{"Rank", "Page", "Number of Page Views"} + } else { + headers = []string{"Rank", "Page"} + } + if err := writer.Write(headers); err != nil { + return "", fmt.Errorf("failed to write headers: %v", err) + } + } + + // Write records with rank number, URL, and optionally pageviews + for i, record := range records { + rank := startIdx + i + 1 // Calculate 
the actual rank + var row []string + if showPageviews { + row = []string{ + strconv.Itoa(rank), + record.Page, + strconv.Itoa(record.MeasureValues), + } + } else { + row = []string{ + strconv.Itoa(rank), + record.Page, + } + } + if err := writer.Write(row); err != nil { + return "", fmt.Errorf("failed to write record: %v", err) + } + } + + return outputFilePath, nil +} diff --git a/create-url-list/main_test.go b/create-url-list/main_test.go new file mode 100644 index 0000000..358d10e --- /dev/null +++ b/create-url-list/main_test.go @@ -0,0 +1,651 @@ +package main + +import ( + "os" + "path/filepath" + "strconv" + "strings" + "testing" +) + +// TestParseRange tests the parseRange function with various inputs +func TestParseRange(t *testing.T) { + tests := []struct { + name string + input string + wantMin int + wantMax int + wantError bool + }{ + {"valid range", "1-250", 1, 250, false}, + {"valid range with spaces", "10 - 100", 10, 100, false}, + {"single digit range", "5-9", 5, 9, false}, + {"large range", "1-1000000", 1, 1000000, false}, + {"invalid format - no dash", "100", 0, 0, true}, + {"invalid format - multiple dashes", "1-2-3", 0, 0, true}, + {"invalid min - not a number", "abc-100", 0, 0, true}, + {"invalid max - not a number", "1-xyz", 0, 0, true}, + {"min greater than max", "100-50", 0, 0, true}, + {"empty string", "", 0, 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + min, max, err := parseRange(tt.input) + if tt.wantError { + if err == nil { + t.Errorf("parseRange(%q) expected error, got nil", tt.input) + } + } else { + if err != nil { + t.Errorf("parseRange(%q) unexpected error: %v", tt.input, err) + } + if min != tt.wantMin { + t.Errorf("parseRange(%q) min = %d, want %d", tt.input, min, tt.wantMin) + } + if max != tt.wantMax { + t.Errorf("parseRange(%q) max = %d, want %d", tt.input, max, tt.wantMax) + } + } + }) + } +} + +// TestProcessCSV_MissingColumns tests that processCSV returns an error when required columns 
are missing +func TestProcessCSV_MissingColumns(t *testing.T) { + _, err := processCSV("testdata/missing-columns.csv", nil, false) + if err == nil { + t.Error("processCSV() expected error for missing columns, got nil") + } + expectedMsg := "missing required columns" + if err != nil && !contains(err.Error(), expectedMsg) { + t.Errorf("processCSV() error = %v, want error containing %q", err, expectedMsg) + } +} + +// TestProcessCSV_InvalidURL tests that processCSV skips URLs that don't start with www. +func TestProcessCSV_InvalidURL(t *testing.T) { + records, err := processCSV("testdata/invalid-url.csv", nil, false) + if err != nil { + t.Errorf("processCSV() unexpected error: %v", err) + } + // Should return 0 records since the only URL doesn't start with www. + if len(records) != 0 { + t.Errorf("processCSV() got %d records, want 0 (invalid URL should be skipped)", len(records)) + } +} + +// TestProcessCSV_ValidFiltering tests that processCSV correctly collects all Pageviews records +func TestProcessCSV_ValidFiltering(t *testing.T) { + tests := []struct { + name string + file string + expectedCount int + }{ + { + name: "valid-with-filtering.csv collects all Pageviews", + file: "testdata/valid-with-filtering.csv", + expectedCount: 6, // 50, 200, 300, 100, 1, 250 (excludes Sessions row) + }, + { + name: "simple.csv with one Pageview", + file: "testdata/simple.csv", + expectedCount: 1, // One Pageviews row in simple.csv + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + records, err := processCSV(tt.file, nil, false) + if err != nil { + t.Fatalf("processCSV() unexpected error: %v", err) + } + if len(records) != tt.expectedCount { + t.Errorf("processCSV() got %d records, want %d", len(records), tt.expectedCount) + } + // Verify all records have Pageviews (MeasureValues should be integers) + for _, record := range records { + if record.MeasureValues < 0 { + t.Errorf("Record has invalid pageview value: %d", record.MeasureValues) + } + } + }) + 
} +} + +// TestProcessCSV_EmptyFile tests that processCSV handles empty CSV files +func TestProcessCSV_EmptyFile(t *testing.T) { + records, err := processCSV("testdata/empty.csv", nil, false) + if err != nil { + t.Fatalf("processCSV() unexpected error: %v", err) + } + if len(records) != 0 { + t.Errorf("processCSV() got %d records, want 0", len(records)) + } +} + +// TestProcessCSV_FileNotFound tests that processCSV returns an error for non-existent files +func TestProcessCSV_FileNotFound(t *testing.T) { + _, err := processCSV("testdata/nonexistent.csv", nil, false) + if err == nil { + t.Error("processCSV() expected error for non-existent file, got nil") + } +} + +// TestWriteOutput tests the writeOutput function +func TestWriteOutput(t *testing.T) { + tests := []struct { + name string + records []Record + outputPath string + rangeStr string + minRank int + maxRank int + wantErr bool + }{ + { + name: "write top 3 records", + records: []Record{ + {Page: "www.example.com/page1", MeasureValues: 100}, + {Page: "www.example.com/page2", MeasureValues: 50}, + {Page: "www.example.com/page3", MeasureValues: 200}, + }, + outputPath: "", + rangeStr: "1-3", + minRank: 1, + maxRank: 3, + wantErr: false, + }, + { + name: "write single record", + records: []Record{ + {Page: "www.example.com/page1", MeasureValues: 100}, + }, + outputPath: "test-output/custom.csv", + rangeStr: "1-1", + minRank: 1, + maxRank: 1, + wantErr: false, + }, + { + name: "write empty records", + records: []Record{}, + outputPath: "test-output/empty.csv", + rangeStr: "1-100", + minRank: 1, + maxRank: 100, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := writeOutput(tt.records, tt.outputPath, tt.rangeStr, tt.minRank, tt.maxRank, false, false) + if (err != nil) != tt.wantErr { + t.Errorf("writeOutput() error = %v, wantErr %v", err, tt.wantErr) + return + } + + // Verify the file was created + if tt.outputPath == "" { + // For default output, check that 
the output directory exists + if _, err := os.Stat("output"); os.IsNotExist(err) { + t.Error("writeOutput() did not create output directory") + } + } + + // Clean up test output + if tt.outputPath != "" { + os.RemoveAll(filepath.Dir(tt.outputPath)) + } + }) + } + + // Clean up default output directory + os.RemoveAll("output") +} + +// TestWriteOutput_Sorting tests that records are sorted correctly by rank (highest pageviews first) +func TestWriteOutput_Sorting(t *testing.T) { + tmpDir := t.TempDir() + outputPath := filepath.Join(tmpDir, "sorted.csv") + + records := []Record{ + {Page: "www.example.com/page3", MeasureValues: 300}, + {Page: "www.example.com/page1", MeasureValues: 100}, + {Page: "www.example.com/page2", MeasureValues: 200}, + {Page: "www.example.com/page4", MeasureValues: 50}, + } + + _, err := writeOutput(records, outputPath, "1-4", 1, 4, false, false) + if err != nil { + t.Fatalf("writeOutput() unexpected error: %v", err) + } + + // Read the output file and verify sorting + content, err := os.ReadFile(outputPath) + if err != nil { + t.Fatalf("Failed to read output file: %v", err) + } + + lines := splitLines(string(content)) + // Should have 4 lines (no header) + if len(lines) < 4 { + t.Fatalf("Expected at least 4 lines, got %d", len(lines)) + } + + // Verify first line has rank 1 (highest pageviews = 300) + if !contains(lines[0], "1,www.example.com/page3") { + t.Errorf("First line should contain rank 1 and page3 URL, got: %s", lines[0]) + } + + // Verify last data line has rank 4 (lowest pageviews = 50) + if !contains(lines[3], "4,www.example.com/page4") { + t.Errorf("Last line should contain rank 4 and page4 URL, got: %s", lines[3]) + } +} + +// TestWriteOutput_NoHeaders tests that output CSV has no headers +func TestWriteOutput_NoHeaders(t *testing.T) { + tmpDir := t.TempDir() + outputPath := filepath.Join(tmpDir, "no-headers.csv") + + records := []Record{ + {Page: "www.example.com/page1", MeasureValues: 100}, + } + + _, err := writeOutput(records, 
outputPath, "1-1", 1, 1, false, false) + if err != nil { + t.Fatalf("writeOutput() unexpected error: %v", err) + } + + content, err := os.ReadFile(outputPath) + if err != nil { + t.Fatalf("Failed to read output file: %v", err) + } + + lines := splitLines(string(content)) + // First line should be data, not headers + if contains(lines[0], "Rank") || contains(lines[0], "Page") || contains(lines[0], "URL") { + t.Error("Output file should not contain headers") + } + + // First line should contain the actual data (rank and URL) + if !contains(lines[0], "1,www.example.com/page1") { + t.Errorf("First line should contain rank and URL, got: %s", lines[0]) + } +} + +// Helper function to check if a string contains a substring +func contains(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(substr) == 0 || + (len(s) > 0 && len(substr) > 0 && indexOf(s, substr) >= 0)) +} + +func indexOf(s, substr string) int { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return i + } + } + return -1 +} + +// Helper function to split content into lines +func splitLines(s string) []string { + var lines []string + start := 0 + for i := 0; i < len(s); i++ { + if s[i] == '\n' { + if i > start { + lines = append(lines, s[start:i]) + } + start = i + 1 + } + } + if start < len(s) { + lines = append(lines, s[start:]) + } + return lines +} + +// TestProcessCSV_OnlyPageviewsFiltered tests that only Pageviews rows are included +func TestProcessCSV_OnlyPageviewsFiltered(t *testing.T) { + records, err := processCSV("testdata/more-data.csv", nil, false) + if err != nil { + t.Fatalf("processCSV() unexpected error: %v", err) + } + + // more-data.csv has many rows with different Measure Names, but only one Pageviews row + if len(records) != 1 { + t.Errorf("processCSV() got %d records, want 1 (one Pageviews row)", len(records)) + } + + // Verify it's the correct Pageviews entry + if len(records) > 0 && records[0].MeasureValues != 311105 { + 
t.Errorf("Expected pageviews of 311105, got %d", records[0].MeasureValues) + } +} + +// TestProcessCSV_URLValidation tests various URL formats +func TestProcessCSV_URLValidation(t *testing.T) { + tests := []struct { + name string + url string + expectSkipped bool + }{ + {"valid www URL", "www.example.com/page", false}, + {"valid www with subdomain", "www.subdomain.example.com/page", false}, + {"invalid https URL", "https://example.com/page", true}, + {"invalid http URL", "http://example.com/page", true}, + {"invalid no www", "example.com/page", true}, + {"invalid relative path", "/page", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a temporary CSV file with the test URL + tmpDir := t.TempDir() + tmpFile := filepath.Join(tmpDir, "test.csv") + content := "Page,Measure Names,Measure Values\n" + tt.url + ",Pageviews,100\n" + if err := os.WriteFile(tmpFile, []byte(content), 0644); err != nil { + t.Fatalf("Failed to create test file: %v", err) + } + + records, err := processCSV(tmpFile, nil, false) + if err != nil { + t.Errorf("processCSV() unexpected error for URL %q: %v", tt.url, err) + } + if tt.expectSkipped && len(records) != 0 { + t.Errorf("processCSV() expected URL %q to be skipped, but got %d records", tt.url, len(records)) + } + if !tt.expectSkipped && len(records) != 1 { + t.Errorf("processCSV() expected URL %q to be included, but got %d records", tt.url, len(records)) + } + }) + } +} + +// TestWriteOutput_ColumnOrder tests that output has correct column order (rank, pageviews, URL) +func TestWriteOutput_ColumnOrder(t *testing.T) { + tmpDir := t.TempDir() + outputPath := filepath.Join(tmpDir, "column-order.csv") + + records := []Record{ + {Page: "www.example.com/page1", MeasureValues: 100}, + } + + _, err := writeOutput(records, outputPath, "1-1", 1, 1, false, false) + if err != nil { + t.Fatalf("writeOutput() unexpected error: %v", err) + } + + content, err := os.ReadFile(outputPath) + if err != nil { + 
t.Fatalf("Failed to read output file: %v", err) + } + + lines := splitLines(string(content)) + if len(lines) < 1 { + t.Fatal("Output file is empty") + } + + // First column should be rank (1), second should be URL + // CSV format: "1,www.example.com/page1" + if !contains(lines[0], "1,www.example.com/page1") { + t.Errorf("Expected format '1,www.example.com/page1', got: %s", lines[0]) + } +} + +// TestIntegration_EndToEnd tests the complete workflow with ranking +func TestIntegration_EndToEnd(t *testing.T) { + tmpDir := t.TempDir() + outputPath := filepath.Join(tmpDir, "result.csv") + + // Process the valid-with-filtering.csv file + records, err := processCSV("testdata/valid-with-filtering.csv", nil, false) + if err != nil { + t.Fatalf("processCSV() unexpected error: %v", err) + } + + // Should get 6 Pageviews records: 300, 250, 200, 100, 50, 1 (excludes Sessions row) + if len(records) != 6 { + t.Fatalf("Expected 6 records, got %d", len(records)) + } + + // Write output for ranks 2-4 (should get 250, 200, 100) + _, err = writeOutput(records, outputPath, "2-4", 2, 4, false, false) + if err != nil { + t.Fatalf("writeOutput() unexpected error: %v", err) + } + + // Verify output file + content, err := os.ReadFile(outputPath) + if err != nil { + t.Fatalf("Failed to read output file: %v", err) + } + + lines := splitLines(string(content)) + if len(lines) != 3 { + t.Fatalf("Expected 3 lines in output, got %d", len(lines)) + } + + // Verify ranking (should be rank 2=page7, rank 3=page2, rank 4=page4) + if !contains(lines[0], "2,www.example.com/page7") { + t.Errorf("First line should be rank 2 (250 pageviews), got: %s", lines[0]) + } + if !contains(lines[1], "3,www.example.com/page2") { + t.Errorf("Second line should be rank 3 (200 pageviews), got: %s", lines[1]) + } + if !contains(lines[2], "4,www.example.com/page4") { + t.Errorf("Third line should be rank 4 (100 pageviews), got: %s", lines[2]) + } +} + +// TestProcessCSV_IgnoreURLs tests that URLs in the ignore list are 
filtered out +func TestProcessCSV_IgnoreURLs(t *testing.T) { + tmpDir := t.TempDir() + tmpFile := filepath.Join(tmpDir, "test.csv") + content := `Page,Measure Names,Measure Values +www.example.com/page1,Pageviews,100 +www.example.com/page2,Pageviews,200 +www.example.com/page3,Pageviews,300 +www.example.com/page4,Pageviews,400 +` + if err := os.WriteFile(tmpFile, []byte(content), 0644); err != nil { + t.Fatalf("Failed to create test file: %v", err) + } + + // Test with ignore list + ignoreURLs := []string{"www.example.com/page2", "www.example.com/page4"} + records, err := processCSV(tmpFile, ignoreURLs, false) + if err != nil { + t.Fatalf("processCSV() unexpected error: %v", err) + } + + // Should only get page1 and page3 (page2 and page4 are ignored) + if len(records) != 2 { + t.Errorf("processCSV() got %d records, want 2 (2 URLs ignored)", len(records)) + } + + // Verify the correct URLs are included + foundPage1 := false + foundPage3 := false + for _, record := range records { + if record.Page == "www.example.com/page1" { + foundPage1 = true + } + if record.Page == "www.example.com/page3" { + foundPage3 = true + } + // Make sure ignored URLs are not present + if record.Page == "www.example.com/page2" || record.Page == "www.example.com/page4" { + t.Errorf("Found ignored URL in results: %s", record.Page) + } + } + + if !foundPage1 { + t.Error("Expected to find www.example.com/page1 in results") + } + if !foundPage3 { + t.Error("Expected to find www.example.com/page3 in results") + } +} + +// TestWriteOutput_ShowPageviews tests that pageviews column is added when enabled +func TestWriteOutput_ShowPageviews(t *testing.T) { + tmpDir := t.TempDir() + + tests := []struct { + name string + showPageviews bool + expectedCols int + }{ + { + name: "without pageviews", + showPageviews: false, + expectedCols: 2, // rank, URL + }, + { + name: "with pageviews", + showPageviews: true, + expectedCols: 3, // rank, URL, pageviews + }, + } + + records := []Record{ + {Page: 
"www.example.com/page1", MeasureValues: 300},
+		{Page: "www.example.com/page2", MeasureValues: 200},
+		{Page: "www.example.com/page3", MeasureValues: 100},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			outputPath := filepath.Join(tmpDir, tt.name+".csv")
+			_, err := writeOutput(records, outputPath, "1-3", 1, 3, tt.showPageviews, false)
+			if err != nil {
+				t.Fatalf("writeOutput() unexpected error: %v", err)
+			}
+
+			// Read and verify the output
+			content, err := os.ReadFile(outputPath)
+			if err != nil {
+				t.Fatalf("Failed to read output file: %v", err)
+			}
+
+			lines := strings.Split(strings.TrimSpace(string(content)), "\n")
+			if len(lines) != 3 {
+				t.Fatalf("Expected 3 lines, got %d", len(lines))
+			}
+
+			// Check each line has the correct number of columns
+			for i, line := range lines {
+				cols := strings.Split(line, ",")
+				if len(cols) != tt.expectedCols {
+					t.Errorf("Line %d: expected %d columns, got %d: %s", i+1, tt.expectedCols, len(cols), line)
+				}
+
+				// If showing pageviews, verify the third column is a number
+				if tt.showPageviews && len(cols) == 3 {
+					if _, err := strconv.Atoi(cols[2]); err != nil {
+						t.Errorf("Line %d: third column should be a number, got %s", i+1, cols[2])
+					}
+				}
+			}
+
+			// Verify specific content when showing pageviews
+			if tt.showPageviews {
+				expectedLines := []string{
+					"1,www.example.com/page1,300",
+					"2,www.example.com/page2,200",
+					"3,www.example.com/page3,100",
+				}
+				for i, expected := range expectedLines {
+					if lines[i] != expected {
+						t.Errorf("Line %d: expected %q, got %q", i+1, expected, lines[i])
+					}
+				}
+			}
+		})
+	}
+}
+
+// TestWriteOutput_ShowHeaders tests that headers are added when enabled
+func TestWriteOutput_ShowHeaders(t *testing.T) {
+	tmpDir := t.TempDir()
+
+	records := []Record{
+		{Page: "www.example.com/page1", MeasureValues: 300},
+		{Page: "www.example.com/page2", MeasureValues: 200},
+	}
+
+	tests := []struct {
+		name            string
+		showPageviews   bool
+		showHeaders     bool
+		expectedHeaders string
+		expectedLines   int // total lines including headers
+	}{
+		{
+			name:            "no headers, no pageviews",
+			showPageviews:   false,
+			showHeaders:     false,
+			expectedHeaders: "",
+			expectedLines:   2, // just data rows
+		},
+		{
+			name:            "with headers, no pageviews",
+			showPageviews:   false,
+			showHeaders:     true,
+			expectedHeaders: "Rank,Page",
+			expectedLines:   3, // header + 2 data rows
+		},
+		{
+			name:            "with headers and pageviews",
+			showPageviews:   true,
+			showHeaders:     true,
+			expectedHeaders: "Rank,Page,Number of Page Views",
+			expectedLines:   3, // header + 2 data rows
+		},
+		{
+			name:            "no headers, with pageviews",
+			showPageviews:   true,
+			showHeaders:     false,
+			expectedHeaders: "",
+			expectedLines:   2, // just data rows
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			outputPath := filepath.Join(tmpDir, tt.name+".csv")
+			_, err := writeOutput(records, outputPath, "1-2", 1, 2, tt.showPageviews, tt.showHeaders)
+			if err != nil {
+				t.Fatalf("writeOutput() unexpected error: %v", err)
+			}
+
+			// Read and verify the output
+			content, err := os.ReadFile(outputPath)
+			if err != nil {
+				t.Fatalf("Failed to read output file: %v", err)
+			}
+
+			lines := strings.Split(strings.TrimSpace(string(content)), "\n")
+			if len(lines) != tt.expectedLines {
+				t.Fatalf("Expected %d lines, got %d", tt.expectedLines, len(lines))
+			}
+
+			// Check headers if they should be present
+			if tt.showHeaders {
+				if lines[0] != tt.expectedHeaders {
+					t.Errorf("Expected headers %q, got %q", tt.expectedHeaders, lines[0])
+				}
+			} else {
+				// First line should be data, not headers
+				if !strings.HasPrefix(lines[0], "1,") {
+					t.Errorf("Expected first line to start with rank '1,', got %q", lines[0])
+				}
+			}
+		})
+	}
+}
diff --git a/create-url-list/testdata/empty.csv b/create-url-list/testdata/empty.csv
new file mode 100644
index 0000000..5d84f59
--- /dev/null
+++ b/create-url-list/testdata/empty.csv
@@ -0,0 +1,2 @@
+Page,Measure Names,Measure Values
+
diff --git a/create-url-list/testdata/invalid-url.csv b/create-url-list/testdata/invalid-url.csv
new file mode 100644
index 0000000..89253bf
--- /dev/null
+++ b/create-url-list/testdata/invalid-url.csv
@@ -0,0 +1,3 @@
+Page,Measure Names,Measure Values
+https://example.com/test,Pageviews,100
+
diff --git a/create-url-list/testdata/missing-columns.csv b/create-url-list/testdata/missing-columns.csv
new file mode 100644
index 0000000..9aef082
--- /dev/null
+++ b/create-url-list/testdata/missing-columns.csv
@@ -0,0 +1,3 @@
+Page,Wrong Column,Measure Values
+www.example.com/test,Something,100
+
diff --git a/create-url-list/testdata/more-data.csv b/create-url-list/testdata/more-data.csv
new file mode 100644
index 0000000..4ad8a82
--- /dev/null
+++ b/create-url-list/testdata/more-data.csv
@@ -0,0 +1,15 @@
+Page,Page Subsite,Measure Names,Measure Values,Min. Aux
+www.example.com/docs/databases/installation/,Docs,PT Within 7 Days,14,1
+www.example.com/docs/databases/installation/,Docs,FT Within 1 Day,2145,1
+www.example.com/docs/databases/installation/,Docs,Reg Within 24 Hours,1916,1
+www.example.com/docs/databases/installation/,Docs,Inquiry Within 24 Hours Became SQL,0,1
+www.example.com/docs/databases/installation/,Docs,Inquiry Within 24 Hours Became MQL,4,1
+www.example.com/docs/databases/installation/,Docs,Inquiry Within 24 Hours,4996,1
+www.example.com/docs/databases/installation/,Docs,Exits,7111,1
+www.example.com/docs/databases/installation/,Docs,New Users,20192,1
+www.example.com/docs/databases/installation/,Docs,Sessions,41975,1
+www.example.com/docs/databases/installation/,Docs,Users,37504,1
+www.example.com/docs/databases/installation/,Docs,Bounces,3898,1
+www.example.com/docs/databases/installation/,Docs,Organic Entrances,15760,1
+www.example.com/docs/databases/installation/,Docs,Entrances,20195,1
+www.example.com/docs/control-plane/sdk/deprecation/,Docs,Pageviews,311105,1
diff --git a/create-url-list/testdata/simple.csv b/create-url-list/testdata/simple.csv
new file mode 100644
index 0000000..2b2d4df
--- /dev/null
+++ b/create-url-list/testdata/simple.csv
@@ -0,0 +1,3 @@
+Page,Page Subsite,Measure Names,Measure Values,Min. Aux
+www.example.com/docs/get-started/create-project/,Docs,Pageviews,14,1
+www.example.com/docs/get-started/install/,Docs,FT Within 1 Day,2145,1
\ No newline at end of file
diff --git a/create-url-list/testdata/test-ignore.csv b/create-url-list/testdata/test-ignore.csv
new file mode 100644
index 0000000..cd5fb1c
--- /dev/null
+++ b/create-url-list/testdata/test-ignore.csv
@@ -0,0 +1,7 @@
+Page,Measure Names,Measure Values
+www.example.com/page1,Pageviews,500
+www.example.com/page2,Pageviews,400
+www.example.com/page3,Pageviews,300
+www.example.com/page4,Pageviews,200
+www.example.com/page5,Pageviews,100
+
diff --git a/create-url-list/testdata/valid-with-filtering.csv b/create-url-list/testdata/valid-with-filtering.csv
new file mode 100644
index 0000000..c27f6cc
--- /dev/null
+++ b/create-url-list/testdata/valid-with-filtering.csv
@@ -0,0 +1,9 @@
+Page,Measure Names,Measure Values
+www.example.com/page1,Pageviews,50
+www.example.com/page2,Pageviews,200
+www.example.com/page3,Pageviews,300
+www.example.com/page4,Pageviews,100
+www.example.com/page5,Sessions,150
+www.example.com/page6,Pageviews,1
+www.example.com/page7,Pageviews,250
+