1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
c-build/*
bazel-*
17 changes: 16 additions & 1 deletion README.md
@@ -23,7 +23,8 @@ The library is released open-source to help developers build tools that better
reflect Google's robots.txt parsing and matching.

For webmasters, we included a small binary in the project that allows testing a
single URL and user-agent against a robots.txt.
single URL and user-agent against a robots.txt, or a pair of user-agents in a
hierarchy, much as `googlebot-image` and other sub-Googlebots fall back to the
generic `googlebot` rules.

## Building the library

@@ -74,6 +75,20 @@ bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https:
user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
```

##### Usage

Basic usage (testing a URL against a single user-agent) takes the user-agent
name as a single argument:

`bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url`

To test against a sub-Googlebot such as `googlebot-image`, which obeys rulesets
that **specifically** target it and otherwise falls back to the rules for a
second user-agent (`googlebot`), pass a comma-separated pair with the more
specific user-agent first:

`bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt googlebot-image,googlebot https://example.com/url`
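
If the robots.txt contains no rule blocking the URL for either user-agent, the
tool prints a verdict line like the following (a sketch based on the
`robots_main.cc` output format in this change; the exact verdict depends on the
robots.txt contents):

```
user-agent 'googlebot-image,googlebot' with URI 'https://example.com/url': ALLOWED
```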

#### Building with CMake

[CMake](https://cmake.org) is the community-supported build system for the
Binary file added robots-OSX-x86_64
20 changes: 19 additions & 1 deletion robots.cc
@@ -21,7 +21,7 @@
// with Google-specific optimizations detailed at
// https://developers.google.com/search/reference/robots_txt

#include "robots.h"
#include "./robots.h"

#include <stdlib.h>

@@ -495,6 +495,24 @@ bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
return !disallow();
}

bool RobotsMatcher::AllowedByRobotsTuple(absl::string_view robots_body,
const std::vector<std::string>* user_agents,
const std::string& url) {
std::vector<std::string> first_ua;
first_ua.push_back(user_agents->at(0));

bool allowed = AllowedByRobots(robots_body, &first_ua, url);
bool with_specific = ever_seen_specific_agent();
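// AllowedByRobots records whether any group in the robots.txt specifically
// targeted the first (most specific) user-agent; if none did, re-run the
// match with the fallback user-agent.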

if (!with_specific) {
std::vector<std::string> second_ua;
second_ua.push_back(user_agents->at(1));
allowed = AllowedByRobots(robots_body, &second_ua, url);
}

return allowed;
}

bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
const std::string& user_agent,
const std::string& url) {
9 changes: 9 additions & 0 deletions robots.h
@@ -117,6 +117,15 @@ class RobotsMatcher : protected RobotsParseHandler {
const std::vector<std::string>* user_agents,
const std::string& url);

// Returns true iff 'url' is allowed to be fetched by the first member of
// the 'user_agents' vector according to a ruleset that specifically targets
// it, or, if no such specific ruleset exists, iff 'url' is allowed to be
// fetched by the second member under any applicable ruleset. 'url' must be
// %-encoded according to RFC3986.
bool AllowedByRobotsTuple(absl::string_view robots_body,
const std::vector<std::string>* user_agents,
const std::string& url);

// Do robots check for 'url' when there is only one user agent. 'url' must
// be %-encoded according to RFC3986.
bool OneAgentAllowedByRobots(absl::string_view robots_txt,
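For illustration, a minimal sketch of calling the new method from C++ (the
robots.txt body, URL, and include path below are hypothetical and not part of
this change):

```
#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  // Hypothetical robots.txt: a group for googlebot, none for googlebot-image.
  const std::string robots_body =
      "user-agent: googlebot\n"
      "disallow: /private/\n";

  // Most specific user-agent first, fallback second.
  std::vector<std::string> user_agents = {"googlebot-image", "googlebot"};

  googlebot::RobotsMatcher matcher;
  const bool allowed = matcher.AllowedByRobotsTuple(
      robots_body, &user_agents, "https://example.com/private/page.html");

  // No group specifically targets googlebot-image, so the googlebot rules
  // apply and the URL is reported as disallowed.
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
}
```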
47 changes: 41 additions & 6 deletions robots_main.cc
@@ -19,7 +19,7 @@
// Simple binary to assess whether a URL is accessible to a user-agent according
// to records found in a local robots.txt file, based on Google's robots.txt
// parsing and matching algorithms.
// Usage:
// Usage (single user-agent):
// robots_main <local_path_to_robotstxt> <user_agent> <url>
// Arguments:
// local_path_to_robotstxt: local path to a file containing robots.txt records.
@@ -32,10 +32,27 @@
// Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
// to access 'url' based on records in 'local_path_to_robotstxt'.
//
// Usage (pair of user-agents):
// robots_main <local_path_to_robotstxt> <comma_sep_user_agent_pair> <url>
// Arguments:
// local_path_to_robotstxt: local path to a file containing robots.txt records.
// For example: /home/users/username/robots.txt
// comma_sep_user_agent_pair: pair of user-agents (most specific first)
// For example: googlebot-image,googlebot
// url: a url to be matched against records in the robots.txt. The URL must be
// %-encoded according to RFC3986.
// For example: https://example.com/accessible/url.html
// Returns: Prints a sentence with verdict about whether the first user-agent
// is allowed or disallowed to access 'url' based on *specific* records
// targeting that user-agent in 'local_path_to_robotstxt'. If there are no
// specific rules targeting that user-agent, it falls back to the second
// user-agent.
//
#include <fstream>
#include <iostream>
#include <sstream>

#include "robots.h"
#include "./robots.h"

bool LoadFile(const std::string& filename, std::string* result) {
std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -86,13 +103,31 @@ int main(int argc, char** argv) {
return 1;
}

std::string user_agent = argv[2];
std::vector<std::string> user_agents(1, user_agent);
std::string input_useragents = argv[2];
std::vector<std::string> useragents;
std::string ua;
std::istringstream ss(input_useragents);

// If given a comma-separated list of user-agents, split it into a vector.
while (std::getline(ss, ua, ',')) {
useragents.push_back(ua);
}
googlebot::RobotsMatcher matcher;
std::string url = argv[3];
bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);

std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
bool allowed;

if (useragents.size() == 2) {
// If two user-agents are given, the first is the most specific one.
std::vector<std::string> ua_tuple = {useragents[0], useragents[1]};

allowed = matcher.AllowedByRobotsTuple(robots_content, &ua_tuple, url);
} else {
allowed = matcher.AllowedByRobots(robots_content, &useragents, url);
}

std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
<< "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
if (robots_content.empty()) {
std::cout << "notice: robots file is empty so all user-agents are allowed"
50 changes: 49 additions & 1 deletion robots_test.cc
@@ -15,9 +15,10 @@
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep
#include "robots.h"
#include "./robots.h"

#include <string>
#include <vector>

#include "gtest/gtest.h"
#include "absl/strings/str_cat.h"
@@ -33,6 +34,13 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
}

bool AllowedByRobotsTuple(const absl::string_view robotstxt,
const std::vector<std::string>* useragent,
const std::string& url) {
RobotsMatcher matcher;
return matcher.AllowedByRobotsTuple(robotstxt, useragent, url);
}

// Google-specific: system test.
TEST(RobotsUnittest, GoogleOnly_SystemTest) {
const absl::string_view robotstxt =
@@ -123,6 +131,46 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// Test based on the documentation at
// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
// "Only one group is valid for a particular crawler"
// "The group followed is group 1. Only the most specific group is followed,
// all others are ignored"
TEST(RobotsUnittest, ID_Multiple_Useragents) {
const absl::string_view robotstxt =
"user-agent: googlebot-news\n"
"Disallow: /bar/\n"
"\n"
"user-agent: *\n"
"Disallow: /baz/\n"
"\n\n"
"user-agent: googlebot\n"
"Disallow: /foo/\n";

const std::string url_foo = "http://foo.bar/foo/";
const std::string url_bar = "http://foo.bar/bar/";
const std::string url_baz = "http://foo.bar/baz/";
const std::string url_qux = "http://foo.bar/qux/";

std::vector<std::string> ua_tuple;
ua_tuple.push_back("googlebot-news");
ua_tuple.push_back("googlebot");

EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_foo));
EXPECT_FALSE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_bar));
EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_baz));
EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_qux));

std::vector<std::string> ua_tuple_no_specific;
ua_tuple_no_specific.push_back("googlebot-image");
ua_tuple_no_specific.push_back("googlebot");

EXPECT_FALSE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_foo));
EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_bar));
EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_baz));
EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_qux));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {