diff --git a/audit/gdcd/scripts/README.md b/audit/gdcd/scripts/README.md index 716d2cf..021a800 100644 --- a/audit/gdcd/scripts/README.md +++ b/audit/gdcd/scripts/README.md @@ -1,15 +1,18 @@ # Log Parser Scripts -This directory contains a script to parse GDCD log files and analyze page changes, specifically identifying moved pages vs truly new/removed pages and tracking applied usage examples. +This directory contains scripts to parse GDCD log files and analyze page changes, specifically identifying moved pages vs truly new/removed pages and tracking applied usage examples. ## Files -- `parse-log.go` - Main Go script that performs the log parsing and analysis +- `parse-log.go` - Go script that performs log parsing and analysis for page changes +- `compare-page-counts.go` - Go script that compares page counts from log files with audit-cli output - `README.md` - This documentation file ## Purpose -The script analyzes log files to distinguish between: +### parse-log.go + +The parse-log.go script analyzes log files to distinguish between: 1. **Moved Pages**: Pages that appear to be removed and created but are actually the same page moved to a new location within the same project 2. **Maybe New Pages**: Pages that may be genuinely new additions @@ -18,9 +21,43 @@ The script analyzes log files to distinguish between: All results are reported with **project context** to clearly show which project each page belongs to. +### compare-page-counts.go + +The compare-page-counts.go script compares page counts between: + +1. **Log File**: Page counts extracted from GDCD log files (lines like "Found 78 docs pages for project csharp") +2. **audit-cli**: Current page counts from running `audit-cli count pages --current-only --count-by-project` + +This helps identify discrepancies between what was processed during a GDCD run and the current state of the documentation repository. 
Differences can indicate: +- Pages added or removed since the log was generated +- Project name mismatches between systems +- Data inconsistencies that need investigation + +The script automatically: +1. Runs audit-cli once to identify projects that exist only in audit-cli (not in the log) +2. Re-runs audit-cli with those projects excluded using the `--exclude-dirs` flag +3. Compares the filtered results for a cleaner comparison + +The script includes built-in project name mappings to handle known differences between log file project names and audit-cli project names: +- `scala` → `scala-driver` +- `cloud-docs` → `atlas` +- `c` → `c-driver` +- `cloudgov` → `atlas-government` +- `django` → `django-mongodb` +- `docs` → `manual` +- `docs-relational-migrator` → `relational-migrator` +- `laravel` → `laravel-mongodb` +- `pymongo` → `pymongo-driver` +- `pymongo-arrow` → `pymongo-arrow-driver` +- `mck` → `kubernetes` + +The script also excludes deprecated projects from comparison: +- `docs-k8s-operator` (deprecated) + ## Dependencies - Go +- `audit-cli` command (required for compare-page-counts.go) - must be available in your PATH ## How It Works @@ -59,7 +96,9 @@ moved, we must manually adjust the count of new applied usage examples to omit t ## Usage -**Important**: You must be in the scripts directory to run the Go script directly: +**Important**: You must be in the scripts directory to run the Go scripts directly: + +### parse-log.go ```bash # Navigate to the scripts directory first @@ -70,9 +109,22 @@ go run parse-log.go ../logs/2025-09-24-18-01-30-app.log go run parse-log.go /absolute/path/to/your/log/file.log ``` +### compare-page-counts.go + +```bash +# Navigate to the scripts directory first +cd /Your/Local/Filepath/tooling/audit/gdcd/scripts + +# Then run the Go script with log file and docs repo path +go run compare-page-counts.go ../logs/2025-12-10-17-58-47-app.log /path/to/docs-mongodb-internal +go run compare-page-counts.go /absolute/path/to/log/file.log 
/absolute/path/to/docs/repo +``` + ## Output Format -The script produces four sections: +### parse-log.go + +The parse-log.go script produces four sections: ### 1. MOVED PAGES ``` @@ -108,6 +160,72 @@ APPLIED USAGE [pymongo]: data-formats|custom-types|type-codecs (1 applied usage Total new applied usage examples: 17 ``` +### compare-page-counts.go + +The compare-page-counts.go script compares page counts from the log file with the current state from audit-cli and produces output like: + +``` +=== INITIAL COMPARISON === +Found 6 projects only in audit-cli: [app-services guides mongodb-analyzer mongodb-intellij mongodb-vscode realm] + +Re-running audit-cli with exclusions... + +=== PAGE COUNT COMPARISON === + +Projects with differences: +-------------------------------------------------- +atlas Log: 777 Audit: 703 (diff: -74) +atlas-architecture Log: 124 Audit: 121 (diff: -3) +atlas-cli Log: 1276 Audit: 930 (diff: -346) +atlas-operator Log: 58 Audit: 57 (diff: -1) +c-driver Log: 86 Audit: 56 (diff: -30) +cloud-manager Log: 490 Audit: 482 (diff: -8) +compass Log: 117 Audit: 115 (diff: -2) +cpp-driver Log: 56 Audit: 52 (diff: -4) +csharp Log: 78 Audit: 77 (diff: -1) +database-tools Log: 61 Audit: 53 (diff: -8) +django-mongodb Log: 30 Audit: 27 (diff: -3) +drivers Log: 21 Audit: 20 (diff: -1) +entity-framework Log: 13 Audit: 14 (diff: +1) +golang Log: 143 Audit: 68 (diff: -75) +java Log: 90 Audit: 89 (diff: -1) +java-rs Log: 56 Audit: 55 (diff: -1) +kotlin Log: 88 Audit: 87 (diff: -1) +kotlin-sync Log: 95 Audit: 66 (diff: -29) +landing Log: 27 Audit: 23 (diff: -4) +laravel-mongodb Log: 58 Audit: 57 (diff: -1) +manual Log: 1668 Audit: 1596 (diff: -72) +mongocli Log: 403 Audit: 17 (diff: -386) +mongoid Log: 60 Audit: 59 (diff: -1) +mongosync Log: 73 Audit: 88 (diff: +15) +node Log: 77 Audit: 76 (diff: -1) +ops-manager Log: 632 Audit: 628 (diff: -4) +php-library Log: 259 Audit: 258 (diff: -1) +pymongo-arrow-driver Log: 8 Audit: 9 (diff: +1) +pymongo-driver Log: 67 Audit: 
66 (diff: -1) +relational-migrator Log: 135 Audit: 109 (diff: -26) +ruby-driver Log: 91 Audit: 62 (diff: -29) +rust Log: 76 Audit: 74 (diff: -2) +scala-driver Log: 44 Audit: 43 (diff: -1) +spark-connector Log: 16 Audit: 17 (diff: +1) +voyage Log: 0 Audit: 1 (diff: +1) + +=== SUMMARY === +Total projects: 43 +Matching counts: 8 +Different counts: 35 + +Total pages in log: 7869 +Total pages in audit-cli: 6771 +Difference: -1098 +``` + +This helps identify: +- **Matching counts**: Projects where log and audit-cli agree +- **Different counts**: Projects where counts differ (with the difference shown) +- **Only in log**: Projects found in the log but not in audit-cli output (may indicate project name mismatches) +- **Total pages**: Sum of all page counts from each source, excluding deprecated projects and projects only in audit-cli + ## Log Format Requirements The scripts expect log lines in the following formats: diff --git a/audit/gdcd/scripts/compare-page-counts.go b/audit/gdcd/scripts/compare-page-counts.go new file mode 100644 index 0000000..92fe81e --- /dev/null +++ b/audit/gdcd/scripts/compare-page-counts.go @@ -0,0 +1,291 @@ +package main + +import ( + "bufio" + "fmt" + "log" + "os" + "os/exec" + "regexp" + "strconv" + "strings" +) + +// ProjectPageCount represents the page count for a project +type ProjectPageCount struct { + ProjectName string + Count int +} + +// projectNameMapping maps log file project names to their audit-cli equivalents. +// This handles cases where the same project has different names in the GDCD logs +// versus the audit-cli output. Add new mappings here as needed. 
// projectNameMapping translates project names as they appear in GDCD log files
// into the names audit-cli reports for the same project. A name with no entry
// is used unchanged (see normalizeProjectName). Add new mappings here as needed.
var projectNameMapping = map[string]string{
	"scala":                    "scala-driver",
	"cloud-docs":               "atlas",
	"c":                        "c-driver",
	"cloudgov":                 "atlas-government",
	"django":                   "django-mongodb",
	"docs":                     "manual",
	"docs-relational-migrator": "relational-migrator",
	"laravel":                  "laravel-mongodb",
	"pymongo":                  "pymongo-driver",
	"pymongo-arrow":            "pymongo-arrow-driver",
	"mck":                      "kubernetes",
}

// deprecatedProjects lists projects that are dropped while parsing the log so
// they never take part in the comparison. Lookups are done with the normalized
// (post-mapping) project name.
var deprecatedProjects = map[string]bool{
	"docs-k8s-operator": true,
}

// logPageCountRe matches log lines like "Found 78 docs pages for project csharp".
var logPageCountRe = regexp.MustCompile(`Found (\d+) docs pages for project (.+)`)

// auditLineRe matches audit-cli output lines like "  csharp        77".
var auditLineRe = regexp.MustCompile(`^\s+([a-z0-9-]+)\s+(\d+)$`)

// maxLogLineBytes is the longest log line parseLogFile accepts. bufio.Scanner
// stops with an error at 64 KiB by default, which would abort the whole parse
// because of a single long line.
const maxLogLineBytes = 1024 * 1024

// normalizeProjectName converts a log project name to its audit-cli equivalent.
// If no mapping exists, the original name is returned unchanged.
func normalizeProjectName(name string) string {
	if normalized, ok := projectNameMapping[name]; ok {
		return normalized
	}
	return name
}

// main compares the page counts recorded in a GDCD log file (os.Args[1]) with
// the counts audit-cli reports for a docs repository (os.Args[2]).
//
// audit-cli is run once without exclusions; if it reports projects the log does
// not know about, it is run a second time with those projects passed to
// --exclude-dirs, and the second result is what gets compared.
func main() {
	if len(os.Args) != 3 {
		fmt.Println("Usage: go run compare-page-counts.go <log-file> <docs-repo-path>")
		fmt.Println("Example: go run compare-page-counts.go ../logs/2025-12-10-17-58-47-app.log /path/to/docs-mongodb-internal")
		os.Exit(1)
	}

	logFile := os.Args[1]
	docsRepoPath := os.Args[2]

	// Parse the log file to extract page counts.
	logCounts, err := parseLogFile(logFile)
	if err != nil {
		log.Fatalf("Error parsing log file: %v", err)
	}

	// Fail early with a clear message when audit-cli is not installed.
	if _, err := exec.LookPath("audit-cli"); err != nil {
		log.Fatalf("audit-cli is not available: %v", err)
	}

	// First pass: no exclusions, so we can see which projects audit-cli knows
	// about that the log does not.
	auditCounts, err := runAuditCli(docsRepoPath, nil)
	if err != nil {
		log.Fatalf("Error running audit-cli: %v", err)
	}

	excludeDirs := findProjectsOnlyInAudit(logCounts, auditCounts)

	// Second pass: only needed when there is something to exclude.
	if len(excludeDirs) > 0 {
		fmt.Println("=== INITIAL COMPARISON ===")
		fmt.Printf("Found %d projects only in audit-cli: %v\n", len(excludeDirs), excludeDirs)
		fmt.Println("\nRe-running audit-cli with exclusions...")
		fmt.Println()

		auditCounts, err = runAuditCli(docsRepoPath, excludeDirs)
		if err != nil {
			log.Fatalf("Error running audit-cli with exclusions: %v", err)
		}
	}

	compareAndReport(logCounts, auditCounts)
}

// parseLogFile extracts per-project page counts from a GDCD log file.
//
// Every line matching logPageCountRe contributes one entry, keyed by the
// normalized project name; a later line for the same project overwrites an
// earlier one. Deprecated projects are skipped. It returns an error when the
// file cannot be opened or read, or when a matched count is not a valid int.
func parseLogFile(logFile string) (map[string]int, error) {
	file, err := os.Open(logFile)
	if err != nil {
		return nil, fmt.Errorf("error opening file: %w", err)
	}
	defer file.Close()

	counts := make(map[string]int)

	scanner := bufio.NewScanner(file)
	scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)
	for scanner.Scan() {
		matches := logPageCountRe.FindStringSubmatch(scanner.Text())
		if matches == nil {
			continue
		}

		projectName := normalizeProjectName(strings.TrimSpace(matches[2]))
		if deprecatedProjects[projectName] {
			continue
		}

		// \d+ can still overflow int, so the conversion error is not ignorable.
		count, err := strconv.Atoi(matches[1])
		if err != nil {
			return nil, fmt.Errorf("invalid page count %q for project %q: %w", matches[1], projectName, err)
		}
		counts[projectName] = count
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading file: %w", err)
	}

	return counts, nil
}

// buildAuditCliArgs returns the audit-cli argument list for counting pages per
// project in docsRepoPath. When excludeDirs is non-empty, an --exclude-dirs flag
// with the comma-separated names is appended.
func buildAuditCliArgs(docsRepoPath string, excludeDirs []string) []string {
	args := []string{"count", "pages", docsRepoPath, "--current-only", "--count-by-project"}
	if len(excludeDirs) > 0 {
		args = append(args, "--exclude-dirs", strings.Join(excludeDirs, ","))
	}
	return args
}

// runAuditCli executes audit-cli through bash (after sourcing ~/.bashrc) and
// parses its combined stdout/stderr into per-project page counts.
// If excludeDirs is provided, the --exclude-dirs flag is added.
//
// The arguments are handed to bash as positional parameters and expanded with
// "$@" instead of being interpolated into the script text, so a repository path
// containing spaces or shell metacharacters reaches audit-cli as one literal
// argument.
func runAuditCli(docsRepoPath string, excludeDirs []string) (map[string]int, error) {
	const script = `source ~/.bashrc && audit-cli "$@"`

	// The word after the script becomes $0; the audit-cli arguments follow as $1...
	bashArgs := append([]string{"-c", script, "bash"}, buildAuditCliArgs(docsRepoPath, excludeDirs)...)

	output, err := exec.Command("bash", bashArgs...).CombinedOutput()
	if err != nil {
		return nil, fmt.Errorf("error running audit-cli: %w\nOutput: %s", err, string(output))
	}

	return parseAuditCliOutput(string(output))
}

// parseAuditCliOutput extracts per-project page counts from audit-cli output.
//
// Only lines matching auditLineRe (leading whitespace, a lowercase project
// name, whitespace, a number) are used; every other line is ignored. It returns
// an error when a matched count is not a valid int.
func parseAuditCliOutput(output string) (map[string]int, error) {
	counts := make(map[string]int)

	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		matches := auditLineRe.FindStringSubmatch(scanner.Text())
		if matches == nil {
			continue
		}

		projectName := strings.TrimSpace(matches[1])
		count, err := strconv.Atoi(matches[2])
		if err != nil {
			return nil, fmt.Errorf("invalid page count %q for project %q: %w", matches[2], projectName, err)
		}
		counts[projectName] = count
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error parsing audit-cli output: %w", err)
	}

	return counts, nil
}

// sortStrings sorts values in place in ascending order. It is a small insertion
// sort so the program needs nothing beyond the packages it already imports; the
// inputs are short lists of project names.
func sortStrings(values []string) {
	for i := 1; i < len(values); i++ {
		for j := i; j > 0 && values[j] < values[j-1]; j-- {
			values[j], values[j-1] = values[j-1], values[j]
		}
	}
}

// findProjectsOnlyInAudit returns, in ascending order, the projects present in
// auditCounts but absent from logCounts. The result is nil when there are none.
func findProjectsOnlyInAudit(logCounts, auditCounts map[string]int) []string {
	var onlyInAudit []string

	for project := range auditCounts {
		if _, inLog := logCounts[project]; !inLog {
			onlyInAudit = append(onlyInAudit, project)
		}
	}

	// Map iteration order is random; sort for consistent output.
	sortStrings(onlyInAudit)

	return onlyInAudit
}

// compareAndReport prints, to stdout, every project whose log and audit-cli
// counts disagree (or that appears on one side only), followed by a summary
// with per-category project counts and page totals for each source.
func compareAndReport(logCounts, auditCounts map[string]int) {
	// Union of both key sets, alphabetically ordered.
	allProjects := make([]string, 0, len(logCounts)+len(auditCounts))
	for project := range logCounts {
		allProjects = append(allProjects, project)
	}
	for project := range auditCounts {
		if _, inLog := logCounts[project]; !inLog {
			allProjects = append(allProjects, project)
		}
	}
	sortStrings(allProjects)

	// Statistics gathered while comparing.
	matching := 0
	different := 0
	onlyInLog := 0
	onlyInAudit := 0
	totalLogPages := 0
	totalAuditPages := 0
	var differences []string

	for _, project := range allProjects {
		logCount, inLog := logCounts[project]
		auditCount, inAudit := auditCounts[project]

		switch {
		case !inLog:
			differences = append(differences, fmt.Sprintf("%-30s Log: N/A Audit: %4d (only in audit-cli)", project, auditCount))
			onlyInAudit++
		case !inAudit:
			differences = append(differences, fmt.Sprintf("%-30s Log: %4d Audit: N/A (only in log)", project, logCount))
			onlyInLog++
		case logCount != auditCount:
			differences = append(differences, fmt.Sprintf("%-30s Log: %4d Audit: %4d (diff: %+d)", project, logCount, auditCount, auditCount-logCount))
			different++
		default:
			matching++
		}

		// A project missing from one side reads as 0 there, so adding both
		// values unconditionally counts each page exactly once per source.
		totalLogPages += logCount
		totalAuditPages += auditCount
	}

	// Print results.
	fmt.Println("=== PAGE COUNT COMPARISON ===")
	fmt.Println()

	if len(differences) > 0 {
		fmt.Println("Projects with differences:")
		fmt.Println("--------------------------------------------------")
		for _, line := range differences {
			fmt.Println(line)
		}
	} else {
		fmt.Println("🎉 All projects have matching counts!")
	}

	// Print summary.
	fmt.Println("\n=== SUMMARY ===")
	fmt.Printf("Total projects: %d\n", len(allProjects))
	fmt.Printf("Matching counts: %d\n", matching)
	fmt.Printf("Different counts: %d\n", different)
	if onlyInLog > 0 {
		fmt.Printf("Only in log: %d\n", onlyInLog)
	}
	if onlyInAudit > 0 {
		fmt.Printf("Only in audit-cli: %d\n", onlyInAudit)
	}
	fmt.Println()
	fmt.Printf("Total pages in log: %d\n", totalLogPages)
	fmt.Printf("Total pages in audit-cli: %d\n", totalAuditPages)
	if totalLogPages != totalAuditPages {
		fmt.Printf("Difference: %+d\n", totalAuditPages-totalLogPages)
	}
}

// NOTE(review): the patch text this program was recovered from continued with
// one more hunk, for a different file. It is not part of this program and is
// reproduced here only so that the change is not lost:
//
//	diff --git a/audit/gdcd/snooty/GetProjects.go b/audit/gdcd/snooty/GetProjects.go
//	index 1d6e42b..554e16e 100644
//	--- a/audit/gdcd/snooty/GetProjects.go
//	+++ b/audit/gdcd/snooty/GetProjects.go
//	@@ -83,7 +83,8 @@ func GetProjects(client *http.Client) []types.ProjectDetails {
//	 "guides",
//	 "atlas-app-services",
//	 "mongoid-railsmdb",
//	-"cluster-sync", // The Snooty Data API currently lists `cluster-sync` and `mongosync` as independent projects. We don't want to process twice, so ignore the `cluster-sync` entry.
//	+"cluster-sync", // The Snooty Data API currently lists `cluster-sync` and `mongosync` as independent projects. We don't want to process twice, so ignore the `cluster-sync` entry.
//	+"docs-k8s-operator", // The docs k8s operator is deprecated, so we should no longer process it
//	 }
//	 var collectionsToParse []types.ProjectDetails