diff --git a/.gitignore b/.gitignore
index 42fb7d4..63bddfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 c-build/*
+bazel-*
\ No newline at end of file
diff --git a/README.md b/README.md
index 6b8d052..354d3ed 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,8 @@ The library is released open-source to help developers build tools that better
 reflect Google's robots.txt parsing and matching.
 
 For webmasters, we included a small binary in the project that allows testing a
-single URL and user-agent against a robots.txt.
+single URL and user-agent against a robots.txt, or a pair of user-agents in a
+hierarchy, much as `googlebot-image` and the other sub-Googlebots work.
 
 ## Building the library
 
@@ -74,6 +75,20 @@ bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url
 user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
 ```
 
+##### Usage
+
+Basic usage (testing a URL with a single user-agent) takes the user-agent as a
+single argument:
+
+`bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url`
+
+To test against a sub-Googlebot such as `googlebot-image`, which obeys rulesets
+that **specifically** target it but falls back to a second user-agent
+(`googlebot`) when no **specific** ruleset targets it, pass a comma-separated
+pair with the more specific user-agent first:
+
+`bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt googlebot-image,googlebot https://example.com/url`
+
 #### Building with CMake
 
 [CMake](https://cmake.org) is the community-supported build system for the
diff --git a/robots-OSX-x86_64 b/robots-OSX-x86_64
new file mode 100755
index 0000000..707e87b
Binary files /dev/null and b/robots-OSX-x86_64 differ
diff --git a/robots.cc b/robots.cc
index bdbccea..9176544 100644
--- a/robots.cc
+++ b/robots.cc
@@ -21,7 +21,7 @@
 // with Google-specific optimizations detailed at
 // https://developers.google.com/search/reference/robots_txt
 
-#include "robots.h"
+#include "./robots.h"
 
 #include <stdlib.h>
 
@@ -495,6 +495,24 @@ bool RobotsMatcher::AllowedByRobots(absl::string_view robots_body,
   return !disallow();
 }
 
+bool RobotsMatcher::AllowedByRobotsTuple(absl::string_view robots_body,
+                                         const std::vector<std::string>* user_agents,
+                                         const std::string& url) {
+  std::vector<std::string> first_ua;
+  first_ua.push_back(user_agents->at(0));
+
+  bool allowed = AllowedByRobots(robots_body, &first_ua, url);
+  bool with_specific = ever_seen_specific_agent();
+
+  if (!with_specific) {
+    std::vector<std::string> second_ua;
+    second_ua.push_back(user_agents->at(1));
+    allowed = AllowedByRobots(robots_body, &second_ua, url);
+  }
+
+  return allowed;
+}
+
 bool RobotsMatcher::OneAgentAllowedByRobots(absl::string_view robots_txt,
                                             const std::string& user_agent,
                                             const std::string& url) {
diff --git a/robots.h b/robots.h
index adccef5..04bc5bc 100644
--- a/robots.h
+++ b/robots.h
@@ -117,6 +117,15 @@ class RobotsMatcher : protected RobotsParseHandler {
                        const std::vector<std::string>* user_agents,
                        const std::string& url);
 
+  // Returns true iff 'url' is allowed to be fetched by the first member
+  // of the "user_agents" vector according to a specific ruleset, or, if there
+  // is no specific ruleset for that user agent, iff 'url' is allowed to be
+  // fetched by the second member under any ruleset. 'url' must be %-encoded
+  // according to RFC3986.
+  bool AllowedByRobotsTuple(absl::string_view robots_body,
+                            const std::vector<std::string>* user_agents,
+                            const std::string& url);
+
   // Do robots check for 'url' when there is only one user agent. 'url' must
   // be %-encoded according to RFC3986.
   bool OneAgentAllowedByRobots(absl::string_view robots_txt,
                                const std::string& user_agent,
                                const std::string& url);
diff --git a/robots_main.cc b/robots_main.cc
index 0e60f02..7ab81f6 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -19,7 +19,7 @@
 // Simple binary to assess whether a URL is accessible to a user-agent according
 // to records found in a local robots.txt file, based on Google's robots.txt
 // parsing and matching algorithms.
-// Usage:
+// Usage (single user-agent):
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
 // Arguments:
 // local_path_to_robotstxt: local path to a file containing robots.txt records.
@@ -32,10 +32,27 @@
 //   For example: /home/users/username/robots.txt
 // url: a url to be matched against records in the robots.txt. The URL must be
 // %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
 // to access 'url' based on records in 'local_path_to_robotstxt'.
 //
+// Usage (pair of user-agents):
+//     robots_main <local_path_to_robotstxt> <comma_sep_user_agent_pair> <url>
+// Arguments:
+// local_path_to_robotstxt: local path to a file containing robots.txt records.
+//   For example: /home/users/username/robots.txt
+// comma_sep_user_agent_pair: comma-separated pair of user-agents (most specific first)
+//   For example: googlebot-image,googlebot
+// url: a url to be matched against records in the robots.txt. The URL must be
+// %-encoded according to RFC3986.
+//   For example: https://example.com/accessible/url.html
+// Returns: Prints a sentence with verdict about whether the first user-agent
+// is allowed or disallowed to access 'url' based on *specific* records
+// targeting that user-agent in 'local_path_to_robotstxt'. If there are no
+// specific rules targeting that user-agent, it falls back to the second
+// user-agent.
+//
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
-#include "robots.h"
+#include "./robots.h"
 
 bool LoadFile(const std::string& filename, std::string* result) {
   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -86,13 +103,31 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  std::string user_agent = argv[2];
-  std::vector<std::string> user_agents(1, user_agent);
+  std::string input_useragents = argv[2];
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+
+  // If we are given a comma-separated list of user-agents, split it into a
+  // vector.
+  while (std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
   googlebot::RobotsMatcher matcher;
   std::string url = argv[3];
-  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
-  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+  bool allowed;
+
+  if (useragents.size() == 2) {
+    // With a pair of user-agents, the first is the most specific.
+    std::vector<std::string> ua_tuple = {useragents[0], useragents[1]};
+
+    allowed = matcher.AllowedByRobotsTuple(robots_content, &ua_tuple, url);
+  } else {
+    allowed = matcher.AllowedByRobots(robots_content, &useragents, url);
+  }
+
+  std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
   if (robots_content.empty()) {
     std::cout << "notice: robots file is empty so all user-agents are allowed"
diff --git a/robots_test.cc b/robots_test.cc
index 369efcd..46a68fc 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -15,9 +15,10 @@
 // This file tests the robots.txt parsing and matching code found in robots.cc
 // against the current Robots Exclusion Protocol (REP) internet draft (I-D).
 //   https://tools.ietf.org/html/draft-koster-rep
 
-#include "robots.h"
+#include "./robots.h"
 
 #include <string>
+#include <vector>
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_cat.h"
@@ -33,6 +34,13 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
   return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
 }
 
+bool AllowedByRobotsTuple(const absl::string_view robotstxt,
+                          const std::vector<std::string>* useragent,
+                          const std::string& url) {
+  RobotsMatcher matcher;
+  return matcher.AllowedByRobotsTuple(robotstxt, useragent, url);
+}
+
 // Google-specific: system test.
 TEST(RobotsUnittest, GoogleOnly_SystemTest) {
   const absl::string_view robotstxt =
@@ -123,6 +131,46 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
+// Test based on the documentation at
+// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
+// "Only one group is valid for a particular crawler"
+// "The group followed is group 1. Only the most specific group is followed,
+// all others are ignored"
+TEST(RobotsUnittest, ID_Multiple_Useragents) {
+  const absl::string_view robotstxt =
+      "user-agent: googlebot-news\n"
+      "Disallow: /bar/\n"
+      "\n"
+      "user-agent: *\n"
+      "Disallow: /baz/\n"
+      "\n\n"
+      "user-agent: googlebot\n"
+      "Disallow: /foo/\n";
+
+  const std::string url_foo = "http://foo.bar/foo/";
+  const std::string url_bar = "http://foo.bar/bar/";
+  const std::string url_baz = "http://foo.bar/baz/";
+  const std::string url_qux = "http://foo.bar/qux/";
+
+  std::vector<std::string> ua_tuple;
+  ua_tuple.push_back("googlebot-news");
+  ua_tuple.push_back("googlebot");
+
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_foo));
+  EXPECT_FALSE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_bar));
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_baz));
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple, url_qux));
+
+  std::vector<std::string> ua_tuple_no_specific;
+  ua_tuple_no_specific.push_back("googlebot-image");
+  ua_tuple_no_specific.push_back("googlebot");
+
+  EXPECT_FALSE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_foo));
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_bar));
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_baz));
+  EXPECT_TRUE(AllowedByRobotsTuple(robotstxt, &ua_tuple_no_specific, url_qux));
+}
+
 // REP lines are case insensitive. See REP I-D section "Protocol Definition".
 // https://tools.ietf.org/html/draft-koster-rep#section-2.1
 TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
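
For reference, a minimal caller-side sketch of the AllowedByRobotsTuple() API added by this patch, mirroring the ID_Multiple_Useragents test above. The robots.txt body and URL are illustrative only, and it assumes the patched robots.h and its Abseil dependencies are on the include path.

#include <iostream>
#include <string>
#include <vector>

#include "robots.h"

int main() {
  // Illustrative robots.txt: only googlebot-news and googlebot have groups.
  const std::string robotstxt =
      "user-agent: googlebot-news\n"
      "Disallow: /bar/\n"
      "\n"
      "user-agent: googlebot\n"
      "Disallow: /foo/\n";

  // Most specific user-agent first, fallback user-agent second.
  std::vector<std::string> ua_tuple = {"googlebot-image", "googlebot"};

  googlebot::RobotsMatcher matcher;
  bool allowed = matcher.AllowedByRobotsTuple(
      robotstxt, &ua_tuple, "http://example.com/foo/page.html");

  // googlebot-image has no group of its own, so the googlebot rules apply
  // and /foo/ is disallowed: this prints DISALLOWED.
  std::cout << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  return 0;
}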