diff --git a/.gitignore b/.gitignore
index c06106cf..e94f7b3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ tools/packages
*buildstamp
src/spidermonkey/js
src/pactester
+src/pacparse
# OS specific files
.DS_Store
diff --git a/docs/html/pacparse.1.html b/docs/html/pacparse.1.html
new file mode 100644
index 00000000..a2773467
--- /dev/null
+++ b/docs/html/pacparse.1.html
@@ -0,0 +1,120 @@
+
+
+
+
+
+"pacparse"("1") manual page
+
+
+Table of Contents
+
+
+pacparse - tool to parse Proxy Auto-Config (PAC) files
+
+pacparse
+-p pacfile -u url [-h host] [-c clientip] [-U pacurl] [-46edv]
+
+pacparse
+ is a tool to parse Proxy Auto-Config (PAC) files. It returns the proxy config
+string for the given URL and PAC file. pacparse uses the pacparser C library
+for most of its functionality.
+
+
+
+
+- -p pacfile
+- PAC file to parse. Specify
+"-" to read from standard input.
+
+- -u url
+- URL to pass as a parameter to the
+PAC file’s FindProxyForURL function.
+
+- -h host
+- Host part of the URL. If not specified,
+it is determined from the URL.
+
+- -c clientip
+- Client’s IP address, as returned
+by the function myIpAddress() in PAC files. If not specified, it defaults
+to the IP address associated with the hostname of the machine on which
+the tool is running, or 127.0.0.1 if that can’t be found.
+
+- -U pacurl
+- URL that
+the PAC file came from, used to identify the client IP address in a more
+reliable way. The tool parses the host name from the URL, attempts to connect
+to each address associated with that host name with a UDP socket until
+one is successful, and then uses the IP address associated with the client
+side of that socket.
+
+- -4
+- Use only IPv4 addresses for -U.
+
+- -6
+- Use only IPv6 addresses
+for -U.
+
+- -e
+- Enable Microsoft PAC extensions (dnsResolveEx, myIpAddressEx, isResolvableEx).
+
+
+- -d
+- Enable debugging messages.
+
+- -v
+- Print version and exit.
+
+
+
+
+To find out
+the proxy config string for the PAC file "wpad.dat" and the URL "http://www.google.com
+":
+
+$ pacparse -p wpad.dat -u http://www.google.com
+
+
+For a client with IP address
+10.0.12.123:
+$ pacparse -p wpad.dat -c 10.0.12.123 -u http://www.google.com
+
+
+For a
+PAC file hosted at http://wpad/wpad.dat:
+
+$ curl -s http://wpad/wpad.dat
+ | \
+ pacparse -p - -u http://google.com
+ -U http://wpad/wpad.dat
+
+
+pacwget(1)
+, pacparser_init(3)
+
+
+If you have come across a bug
+in pacparse, please submit a bug report at https://github.com/pacparser/pacparser/issues.
+
+
+
+Written by Manu Garg (http://www.manugarg.com) and Dave Dykstra.
+
+Homepage:
+https://github.com/pacparser/pacparser.
+
+
+
+
+Table of Contents
+
+
+
diff --git a/docs/html/pacwget.1.html b/docs/html/pacwget.1.html
new file mode 100644
index 00000000..3b6ce207
--- /dev/null
+++ b/docs/html/pacwget.1.html
@@ -0,0 +1,200 @@
+
+
+
+
+
+"pacwget"("1") manual page
+
+
+Table of Contents
+
+
+pacwget - robustly get http
+ URLs using multiple proxies and servers
+
+
+pacwget [--only-proxies] [GNU_WGET_OPTIONS]
+
+pacwget is
+a tool that uses GNU wget in such a way that target http
+ URLs are retrieved
+even if some of multiple proxies and/or target servers are not functioning.
+ The "pac" part of the name comes from its support of Proxy Auto-Config
+(PAC) files for configuring proxies.
+The configuration of proxies (including
+via PAC URLs) comes from the environment (see below), but multiple servers
+are recognized through round-robin DNS names of the host part specified
+in the target http
+ URLs. With each proxy (unless the option --only-proxies
+is given), pacwget first tries the target URL and if that fails and the
+host part of the URL is a round-robin DNS name, it tries replacing the host
+part of the URL with each IP address from the round-robin while using the
+same proxy.
+
+pacwget uses the following environment variables:
+
+
+
+- HTTP_PROXIES
+- A semicolon-separated list of URLs to try as http
+ proxies in
+order. The last one in the list may be "DIRECT" which means to use no proxy
+and connect directly to the host server in target http
+ URLs being retrieved.
+
+
+- http_proxy
+
+- If HTTP_PROXIES is not set, but http_proxy
+ is, then it is used
+as a single http
+ proxy URL. Note that it may identify a round-robin of more
+than one proxy, but a direct connection to the target server is not an option.
+
+
+- PAC_URLS
+- If neither HTTP_PROXIES nor http_proxy
+ is set, then PAC_URLS is
+used as a semicolon-separated list of URLs to try to read Proxy Auto-Config
+files to parse for a list of http
+ proxies. The word "auto" is converted
+to "http://wpad/wpad.dat
+" as is commonly used for Web Proxy Auto Discovery.
+ The last one in the list may be "DIRECT" which means to directly connect
+to the target server if no PAC file can be read. Otherwise the PAC URLs
+may begin with http://
+ or file://, although if file:// is used then the
+myIpAddress() function available inside the PAC file uses a less reliable
+method to determine the client’s IP address (using a DNS lookup on the hostname
+instead of using the client IP from the socket connecting to the PAC file’s
+http server). The url & host parameters to the FindProxyForURL function in
+the PAC file are derived from the first command line http://
+ parameter.
+ If a PAC file is successfully read, it must return a list of proxies or
+"DIRECT", otherwise it is a fatal error and no further PAC URLs are tried.
+ The default value of $PAC_URLS if it is not set is "auto; DIRECT".
+
+
+
+First,
+note that unlike with wget, the ordering of options is significant with
+pacwget: only the options that come before a URL apply to that URL. In
+this way, different options can be specified with multiple URLs in the
+same invocation of pacwget. The same list of proxies is applied to each
+http://
+ URL, although wget is invoked separately for each URL on the pacwget
+command line.
+There is one option added by pacwget:
+
+
+- --only-proxies
+- Print to
+stdout a $HTTP_PROXIES-like list of proxies (that is, semicolon separated
+and may end in "DIRECT") that would be used instead of downloading the
+given URL(s) with wget. This is useful for downloading and parsing PAC
+files. Requires one URL that starts with http://.
+
+
+
+All other options are
+passed to wget, but some cause additional action in pacwget and are described
+here:
+
+
+- --connect-timeout=SECS
+- Sets the connection timeout to SECS seconds for
+retrieving both PAC URLs and target URLs. If a proxy or a target server
+does not respond in that amount of time, the next one is tried. Default
+5.
+
+- --read-timeout=SECS
+- Sets the read timeout to SECS seconds for retrieving
+both PAC URLs and target URLs. If no data is received from a proxy or a
+target server in that amount of time, the next one is tried. Default 10.
+
+
+- -T SECS
+- Sets both the connect and read timeout to SECS seconds.
+
+- --tries=N
+- Try
+all wget connections for both PAC URLs and target URLs N times. Default
+1.
+
+- --inet4-only or "-4"
+- Use only IPv4 addresses for both wget and for the myIpAddress()
+function in PAC files.
+
+- --inet6-only or "-6"
+- Use only IPv6 addresses for both
+wget and for the myIpAddress() function in PAC files.
+
+- --debug or "-d"
+- In addition
+to adding debug messages to all uses of wget, also enable debugging in
+PAC file parsing.
+
+- --verbose or "-v"
+- This is the default for wget for the target
+URL, but if this is explicitly set then it is also used for PAC URLs. In
+addition, if neither debug nor verbose is set, PAC URLs are retrieved with
+the wget --quiet option.
+
+
+
+
+To retrieve target URL "http://www.google.com
+"
+using proxies defined in "http://wpad/wpad.dat
+" and not allow direct connections:
+
+$ export PAC_URLS=auto
+$ pacwget http://www.google.com
+
+To try an additional WPAD server after the
+usual one and allow direct connections if that also doesn’t work:
+$ export
+PAC_URLS="auto; http://wpad.shared.domain/wpad.dat;
+ DIRECT"
+$ pacwget http://www.google.com
+
+To directly set a list of possible proxies,
+with debugging enabled:
+$ export HTTP_PROXIES="http://squid:3128;http://squid.friend.dom:3128
+"
+
+$ pacwget -d http://www.google.com
+
+
+pacparse(1)
+, pacparser_init(3)
+
+
+
+If you have come across a bug in pacwget, please submit a bug report
+at https://github.com/pacparser/pacparser/issues
+
+
+Written by Dave Dykstra.
+
+
+Homepage: https://github.com/pacparser/pacparser
+
+
+
+
+Table of Contents
+
+
+
diff --git a/docs/man/man1/pacparse.1 b/docs/man/man1/pacparse.1
new file mode 100644
index 00000000..658fbc11
--- /dev/null
+++ b/docs/man/man1/pacparse.1
@@ -0,0 +1,75 @@
+.TH "pacparse" "1" "" "" ""
+.SH "NAME"
+pacparse \- tool to parse Proxy Auto\-Config (PAC) files
+.SH "SYNOPSIS"
+.B pacparse \-p pacfile \-u url [\-h host] [\-c clientip] [\-U pacurl] [\-46edv]
+.SH "DESCRIPTION"
+.B pacparse
+is a tool to parse Proxy Auto\-Config (PAC) files. It returns the
+proxy config string for the given URL and PAC file.
+.B pacparse
+uses the pacparser C library for most of its functionality.
+.SH "OPTIONS"
+.TP
+.B \-p pacfile
+PAC file to parse. Specify "\-" to read from standard input.
+.TP
+.B \-u url
+URL to pass as a parameter to the PAC file's FindProxyForURL function.
+.TP
+.B \-h host
+Host part of the URL. If not specified, it is determined from the URL.
+.TP
+.B \-c clientip
+Client's IP address, as returned by the function myIpAddress() in PAC files.
+If not specified, it defaults to the IP address associated with the hostname
+of the machine on which the tool is running, or 127.0.0.1 if that can't
+be found.
+.TP
+.B \-U pacurl
+URL that the PAC file came from, used to identify the client IP address in
+a more reliable way. The tool parses the host name from the URL,
+attempts to connect to each address associated with that host name
+with a UDP socket until one is successful, and then uses the IP address
+associated with the client side of that socket.
+.TP
+.B \-4
+Use only IPv4 addresses for \-U.
+.TP
+.B \-6
+Use only IPv6 addresses for \-U.
+.TP
+.B \-e
+Enable Microsoft PAC extensions (dnsResolveEx, myIpAddressEx, isResolvableEx).
+.TP
+.B \-d
+Enable debugging messages.
+.TP
+.B \-v
+Print version and exit.
+.SH "EXAMPLES"
+.PP
+To find out the proxy config string for the PAC file "wpad.dat" and the URL
+"http://www.google.com":
+.PP
+$ pacparse \-p wpad.dat \-u http://www.google.com
+
+For a client with IP address 10.0.12.123:
+.PP
+$ pacparse \-p wpad.dat \-c 10.0.12.123 \-u http://www.google.com
+
+For a PAC file hosted at http://wpad/wpad.dat:
+.PP
+$ curl \-s http://wpad/wpad.dat | \\
+ pacparse \-p \- \-u http://google.com \-U http://wpad/wpad.dat
+.SH "SEE ALSO"
+pacwget(1),
+pacparser_init(3)
+.SH "BUGS"
+If you have come across a bug in pacparse, please submit a bug report at
+https://github.com/pacparser/pacparser/issues.
+.SH "AUTHOR"
+Written by Manu Garg (http://www.manugarg.com) and Dave Dykstra.
+.SH "RESOURCES"
+Homepage: https://github.com/pacparser/pacparser.
+
diff --git a/docs/man/man1/pacwget.1 b/docs/man/man1/pacwget.1
new file mode 100644
index 00000000..3350fd81
--- /dev/null
+++ b/docs/man/man1/pacwget.1
@@ -0,0 +1,148 @@
+.TH "pacwget" "1" "" "" ""
+.SH "NAME"
+pacwget \- robustly get http URLs using multiple proxies and servers
+.SH "SYNOPSIS"
+.B pacwget [\-\-only\-proxies] [GNU_WGET_OPTIONS]
+.SH "DESCRIPTION"
+.B pacwget
+is a tool that uses GNU wget in such a way that target http URLs are
+retrieved even if some of multiple proxies and/or target servers are not
+functioning. The "pac" part of the name comes from its support of
+Proxy Auto\-Config (PAC) files for configuring proxies.
+.P
+The configuration of proxies (including via PAC URLs) comes from the
+environment (see below), but multiple servers are recognized through
+round\-robin DNS names of the host part specified in the target http
+URLs. With each proxy (unless the option
+.B \-\-only\-proxies
+is given),
+.B pacwget
+first tries the target URL and if that fails and the host part of the URL
+is a round\-robin DNS name, it tries replacing the host part of the URL
+with each IP address from the round\-robin while using the same proxy.
+.SH "ENVIRONMENT"
+.B pacwget
+uses the following environment variables:
+.TP
+.B HTTP_PROXIES
+A semicolon\-separated list of URLs to try as http proxies in order.
+The last one in the list may be "DIRECT" which means to use no proxy
+and connect directly to the host server in target http URLs being
+retrieved.
+.TP
+.B http_proxy
+If HTTP_PROXIES is not set, but http_proxy is, then it is used as a
+single http proxy URL. Note that it may identify a round\-robin of
+more than one proxy, but a direct connection to the target server is
+not an option.
+.TP
+.B PAC_URLS
+If neither HTTP_PROXIES nor http_proxy is set, then PAC_URLS is used
+as a semicolon\-separated list of URLs to try to read Proxy Auto\-Config
+files to parse for a list of http proxies. The word "auto"
+is converted to "http://wpad/wpad.dat" as is commonly used for Web
+Proxy Auto Discovery. The last one in the list may be "DIRECT" which
+means to directly connect to the target server if no PAC file can be
+read. Otherwise the PAC URLs may begin with http:// or file://, although
+if file:// is used then the myIpAddress() function available inside the
+PAC file uses a less reliable method to determine the client's IP
+address (using a DNS lookup on the hostname instead of using the
+client IP from the socket connecting to the PAC file's http server).
+The url & host parameters to the FindProxyForURL function in the PAC
+file are derived from the first command line http:// parameter. If a
+PAC file is successfully read, it must return a list of proxies or
+"DIRECT", otherwise it is a fatal error and no further PAC URLs are
+tried. The default value of $PAC_URLS if it is not set is
+"auto; DIRECT".
+.SH "OPTIONS"
+First, note that unlike with wget, the ordering of options is significant with
+.BR pacwget :
+only the options that come before a URL apply to that URL. In this
+way, different options can be specified with multiple URLs in the same
+invocation of
+.BR pacwget .
+The same list of proxies is applied to each http:// URL, although wget is
+invoked separately for each URL on the
+.B pacwget
+command line.
+.P
+There is one option added by
+.BR pacwget :
+.TP
+.B \-\-only\-proxies
+Print to stdout a $HTTP_PROXIES-like list of proxies (that is,
+semicolon separated and may end in "DIRECT") that would be used
+instead of downloading the given URL(s) with wget. This is useful
+for downloading and parsing PAC files. Requires one URL that starts
+with http://.
+.P
+All other options are passed to wget, but some cause additional action in
+.BR pacwget
+and are described here:
+.TP
+.B \-\-connect\-timeout=SECS
+Sets the connection timeout to SECS seconds for retrieving both PAC URLs
+and target URLs. If a proxy or a target server does not respond in
+that amount of time, the next one is tried. Default 5.
+.TP
+.B \-\-read\-timeout=SECS
+Sets the read timeout to SECS seconds for retrieving both PAC URLs and
+target URLs. If no data is received from a proxy or a target server in that
+amount of time, the next one is tried. Default 10.
+.TP
+.B "\-T SECS"
+Sets both the connect and read timeout to SECS seconds.
+.TP
+.B "\-\-tries=N"
+Try all wget connections for both PAC URLs and target URLs N times.
+Default 1.
+.TP
+.B "\-\-inet4\-only" or "\-4"
+Use only IPv4 addresses for both wget and for the myIpAddress() function
+in PAC files.
+.TP
+.B "\-\-inet6\-only" or "\-6"
+Use only IPv6 addresses for both wget and for the myIpAddress() function
+in PAC files.
+.TP
+.B "\-\-debug" or "\-d"
+In addition to adding debug messages to all uses of wget, also enable
+debugging in PAC file parsing.
+.TP
+.B "\-\-verbose" or "\-v"
+This is the default for wget for the target URL, but if this is
+explicitly set then it is also used for PAC URLs. In addition, if
+neither debug nor verbose is set, PAC URLs are retrieved with the
+wget \-\-quiet option.
+.SH "EXAMPLES"
+.PP
+To retrieve target URL "http://www.google.com" using proxies defined
+in "http://wpad/wpad.dat" and not allow direct connections:
+.PP
+$ export PAC_URLS=auto
+.br
+$ pacwget http://www.google.com
+.P
+To try an additional WPAD server after the usual one and allow
+direct connections if that also doesn't work:
+.PP
+$ export PAC_URLS="auto; http://wpad.shared.domain/wpad.dat; DIRECT"
+.br
+$ pacwget http://www.google.com
+.P
+To directly set a list of possible proxies, with debugging enabled:
+.PP
+$ export HTTP_PROXIES="http://squid:3128;http://squid.friend.dom:3128"
+.br
+$ pacwget \-d http://www.google.com
+.SH "SEE ALSO"
+pacparse(1),
+pacparser_init(3)
+.SH "BUGS"
+If you have come across a bug in pacwget, please submit a bug report at
+https://github.com/pacparser/pacparser/issues
+.SH "AUTHOR"
+Written by Dave Dykstra.
+.SH "RESOURCES"
+Homepage: https://github.com/pacparser/pacparser
+
diff --git a/src/Makefile b/src/Makefile
index 9393e937..54fd4c82 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -90,7 +90,7 @@ DOC_PREFIX = $(PREFIX)/share/doc/pacparser
MAN_PREFIX = $(PREFIX)/share/man
.PHONY: clean pymod install-pymod
-all: testpactester
+all: testpactester pacparse
jsapi_buildstamp: spidermonkey/js/src
cd spidermonkey && SMCFLAGS="$(SHFLAGS) $(SMCFLAGS)" $(MAKE) jsapi
@@ -120,6 +120,9 @@ testpactester: pactester $(LIBRARY_LINK)
echo "Running tests for pactester."
NO_INTERNET=$(NO_INTERNET) ../tests/runtests.sh
+# Build the pacparse command-line tool from pacparse.c, linking it against
+# the pacparser library found in the current directory (-L. -lpacparser).
+pacparse: pacparse.c pacparser.h $(LIBRARY_LINK)
+ $(CC) $(CFLAGS) $(LDFLAGS) pacparse.c -o pacparse -lpacparser -L. -I.
+
docs:
../tools/generatedocs.sh
@@ -128,8 +131,10 @@ install: all
install -m 644 $(LIBRARY) $(LIB_PREFIX)/$(LIBRARY)
ln -sf $(LIBRARY) $(LIB_PREFIX)/$(LIBRARY_LINK)
install -m 755 pactester $(BIN_PREFIX)/pactester
+ install -m 755 pacparse $(BIN_PREFIX)/pacparse
+ install -m 755 pacwget $(BIN_PREFIX)/pacwget
install -m 644 pacparser.h $(INC_PREFIX)/pacparser.h
- # install pactester manpages
+ # install pactester, pacparse, and pacwget manpages
install -d $(MAN_PREFIX)/man1/
(test -d ../docs && install -m 644 ../docs/man/man1/*.1 $(MAN_PREFIX)/man1/) || true
# install pacparser manpages
@@ -162,7 +167,7 @@ install-pymod: pymod
cd pymod && ARCHFLAGS="" $(PYTHON) setup.py install --root="$(DESTDIR)/" $(EXTRA_ARGS)
clean:
- rm -f $(LIBRARY_LINK) $(LIBRARY) pacparser.o pactester pymod/pacparser_o_buildstamp jsapi_buildstamp
+ rm -f $(LIBRARY_LINK) $(LIBRARY) pacparser.o pactester pacparse.o pacparse pymod/pacparser_o_buildstamp jsapi_buildstamp
rm -rf dist
cd pymod && $(PYTHON) setup.py clean --all
cd spidermonkey && $(MAKE) clean
diff --git a/src/Makefile.win32 b/src/Makefile.win32
index 08d83907..50636059 100644
--- a/src/Makefile.win32
+++ b/src/Makefile.win32
@@ -66,6 +66,9 @@ pacparser.lib: pacparser.dll pacparser.def
pactester: pactester.c pacparser.h pacparser.o
$(CC) pactester.c pacparser.o -o pactester -ljs -Lspidermonkey -lws2_32
+# Build pacparse by compiling pacparse.c together with pacparser.o and linking
+# against the js library in ./spidermonkey (-ljs -Lspidermonkey) and ws2_32.
+pacparse: pacparse.c pacparser.h pacparser.o
+ $(CC) pacparse.c pacparser.o -o pacparse -ljs -Lspidermonkey -lws2_32
+
dist: pacparser.dll pactester pacparser.def
if exist dist rmdir /s /q dist
mkdir dist
@@ -98,7 +101,7 @@ pymod-dist-%:
cd pymod && py -$* setup.py dist
clean:
- $(RM) pacparser.dll *.lib pacparser.def pacparser.exp pacparser.o pactester.exe libpacparser.a
+ $(RM) pacparser.dll *.lib pacparser.def pacparser.exp pacparser.o pactester.exe pacparse.exe libpacparser.a
$(MAKE) -C spidermonkey -f Makefile.win32 clean
cd pymod && $(PYTHON) setup.py clean --all
$(RM) dist
diff --git a/src/pacparse.c b/src/pacparse.c
new file mode 100644
index 00000000..8b8456c1
--- /dev/null
+++ b/src/pacparse.c
@@ -0,0 +1,351 @@
+// This file implements a command called pacparse for parsing
+// proxy-auto-config (PAC) files and printing the proxy configuration they
+// return for a URL. Intended to be used from scripts, including pacwget.
+// Author: Dave Dykstra
+//
+// This file is based on pactester.c from the pacparser library
+// (https://github.com/pacparser/pacparser/blob/master/src/pactester.c)
+// Copyright (C) 2008 Manu Garg.
+// Author: Manu Garg
+//
+// pacparse is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// pacparse is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+#ifndef PACPARSE_VERSION
+#define PACPARSE_VERSION "1.0"
+#endif
+
+#include "pacparser.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Nonzero once -d has been given; enables the DEBUG messages on stderr.
+int pacdebug=0;
+// Program name used as the prefix of usage and diagnostic messages.
+char *progname="pacparse";
+
+// Write the command synopsis and a short description of each option to
+// stderr.
+void usage()
+{
+  // One entry per output line of the option summary, in display order.
+  static const char *const option_help[] = {
+    " -p pacfile : PAC file to parse (specify '-' to read from standard input)\n",
+    " -u url : URL parameter to the PAC file's FindProxyForURL function\n",
+    " -h host : Host part of the URL (default parsed from url)\n",
+    " -c clientip : client IP address (as returned by myIpAddress() function\n",
+    " in PAC files, defaults to IP address of client hostname)\n",
+    " -U pacurl : URL that pacfile came from, to identify client IP address\n",
+    " (creates UDP socket to host, more reliable than default\n",
+    " on clients with multiple IP addresses)\n",
+    " -4 : use only IPv4 addresses for -U\n",
+    " -6 : use only IPv6 addresses for -U\n",
+    " -e : enable microsoft extensions (Ex functions)\n",
+    " -d : enable debugging messages\n",
+    " -v : print version and exit\n",
+  };
+  const int count = (int)(sizeof(option_help) / sizeof(option_help[0]));
+  int idx;
+
+  // The synopsis deliberately has no trailing newline: the "Options" header
+  // that follows starts with one.
+  fprintf(stderr,
+          "\nUsage: %s -p pacfile -u url [-h host] [-c clientip] [-U pacurl] [-46edv]",
+          progname);
+  fputs("\nOptions:\n", stderr);
+  for (idx = 0; idx < count; idx++)
+    fputs(option_help[idx], stderr);
+}
+
+// Extract the host component from a URL of the form
+// scheme://host[:port][/path].
+//
+// url: NUL-terminated URL string; it is not modified.
+//
+// Returns a newly allocated, NUL-terminated copy of the host part, or NULL
+// (after printing a diagnostic to stderr) when the first ':' in url is not
+// followed by "//", when the host part is empty, or when memory cannot be
+// allocated. The result may be released with free().
+//
+// NOTE: the host is taken to end at the first '/' or ':' after "://", so
+// bracketed IPv6 literals and user:password@ prefixes are not understood.
+char *get_host_from_url(const char *url)
+{
+  const char *colon;
+  const char *start;
+  size_t len;
+  char *host;
+
+  // The scheme separator must be the first ':' in the URL and must be
+  // immediately followed by "//".
+  colon = (url != NULL) ? strchr(url, ':') : NULL;
+  if (colon == NULL || strncmp(colon, "://", 3) != 0) {
+    fprintf(stderr, "%s: Not a proper URL\n", progname);
+    return NULL;
+  }
+  start = colon + 3;  // Get past '://'
+  // The host runs up to the next '/', ':' or the end of the string.
+  len = strcspn(start, "/:");
+  if (len == 0) {  // Host part is empty.
+    fprintf(stderr, "%s: Not a proper URL\n", progname);
+    return NULL;
+  }
+  // Copy only the host: nothing is allocated on the error paths above, and
+  // the caller receives a pointer that is valid to pass to free().
+  host = malloc(len + 1);
+  if (host == NULL) {
+    fprintf(stderr, "%s: Failed to allocate memory for the host name\n",
+            progname);
+    return NULL;
+  }
+  memcpy(host, start, len);
+  host[len] = '\0';
+  return host;
+}
+
+// Find the local IP address this machine uses to reach `host` and hand it to
+// the pacparser library through pacparser_setmyip().
+//
+// host:      host name (or address string) to resolve with getaddrinfo().
+// ipversion: 4 restricts resolution to AF_INET, 6 to AF_INET6; any other
+//            value leaves the address family unspecified.
+//
+// Each resolved address is tried in turn: a UDP socket is created and
+// connect()ed to it, and the socket's local address is read back with
+// getsockname(). The first address for which that works supplies the IP.
+//
+// Does not return on failure: a diagnostic is printed to stderr and the
+// process exits with status 2 when resolution fails, when no address could be
+// used, or when the local address cannot be read or formatted.
+void set_myip_from_host(const char *host, int ipversion)
+{
+  int err, fd;
+  struct addrinfo addrinfo, *res, *r;
+  struct sockaddr_storage local;  // holds either an IPv4 or an IPv6 address
+  const void *src;                // points at the in(6)_addr inside `local`
+  socklen_t namelen;
+  char errbuf[4096];
+  char ipbuf[256];
+
+  memset(&addrinfo, 0, sizeof(addrinfo));
+  switch(ipversion)
+  {
+    case 4:
+      addrinfo.ai_family = AF_INET;
+      break;
+    case 6:
+      addrinfo.ai_family = AF_INET6;
+      break;
+    default:
+      addrinfo.ai_family = AF_UNSPEC;
+      break;
+  }
+  addrinfo.ai_socktype = SOCK_DGRAM;
+  addrinfo.ai_protocol = IPPROTO_UDP;
+
+  if ((err = getaddrinfo(host, 0, &addrinfo, &res)) != 0) {
+    fprintf(stderr, "%s: error from getaddrinfo on %s: %s\n",
+            progname, host, gai_strerror(err));
+    exit(2);
+  }
+  errbuf[0] = '\0';
+  ipbuf[0] = '\0';
+  for (r = res; r; r = r->ai_next) {
+    if (pacdebug && (errbuf[0] != '\0')) {
+      fprintf(stderr, "DEBUG: trying next addr after %s\n", errbuf);
+    }
+    if (r->ai_family == AF_INET)
+      src = &((struct sockaddr_in *)&local)->sin_addr;
+    else if (r->ai_family == AF_INET6)
+      src = &((struct sockaddr_in6 *)&local)->sin6_addr;
+    else {
+      fprintf(stderr, "%s: unknown address family %d\n",
+              progname, r->ai_family);
+      exit(2);
+    }
+    if ((fd = socket(r->ai_family, r->ai_socktype, r->ai_protocol)) < 0) {
+      // Not fatal: another address in the list may use a family for which
+      // a socket can be created. Remember the error and keep going.
+      snprintf(errbuf, sizeof(errbuf),
+               "error creating socket for %s: %s", host, strerror(errno));
+      continue;
+    }
+    if (connect(fd, r->ai_addr, r->ai_addrlen) < 0) {
+      snprintf(errbuf, sizeof(errbuf),
+               "error connecting UDP socket(s) to %s, last error: %s",
+               host, strerror(errno));
+      close(fd);
+      continue;
+    }
+    namelen = sizeof(local);
+    if (getsockname(fd, (struct sockaddr *)&local, &namelen) < 0) {
+      fprintf(stderr, "%s: error on getsockname from socket to %s: %s\n",
+              progname, host, strerror(errno));
+      exit(2);
+    }
+    // Close the descriptor here, exactly once per successfully created
+    // socket. Closing after the loop would close it a second time when the
+    // last connect() failed, or close an unset fd when the list is empty.
+    close(fd);
+    if (inet_ntop(r->ai_family, src, ipbuf, sizeof(ipbuf)) == NULL) {
+      fprintf(stderr, "%s: error on inet_ntop from getsockname to %s: %s\n",
+              progname, host, strerror(errno));
+      exit(2);
+    }
+    break;
+  }
+  freeaddrinfo(res);
+  if (ipbuf[0] == '\0') {
+    // No candidate address worked; report the last recorded error, if any.
+    if (errbuf[0] == '\0')
+      fprintf(stderr, "%s: could not determine IP for %s, error unknown\n",
+              progname, host);
+    else
+      fprintf(stderr, "%s: %s\n", progname, errbuf);
+    exit(2);
+  }
+  if (pacdebug) {
+    fprintf(stderr, "DEBUG: Setting myip to %s\n", ipbuf);
+  }
+  pacparser_setmyip(ipbuf);
+}
+
+// Read all of stdin into a newly allocated, NUL-terminated string.
+//
+// Returns the script, which the caller must free(), or NULL after printing a
+// diagnostic to stderr when memory cannot be allocated, when reading fails,
+// or when the input (plus its terminating NUL) exceeds 1 MB.
+static char *read_script_from_stdin(void)
+{
+  const size_t maxsize = 1024 * 1024;  // Limit the max script size to 1 MB
+  char chunk[4096];
+  size_t used = 0;  // bytes of script stored so far, excluding the NUL
+  size_t nread;
+  char *script = malloc(1);
+
+  if (script == NULL) {
+    fprintf(stderr,"%s: Failed to allocate memory for the script\n",
+            progname);
+    return NULL;
+  }
+
+  // Append each chunk at the tracked offset so the buffer is never rescanned
+  // from its start.
+  while ((nread = fread(chunk, 1, sizeof(chunk), stdin)) > 0) {
+    size_t needed = used + nread + 1;  // +1 for the null terminator
+    char *grown;
+
+    if (needed > maxsize) {
+      fprintf(stderr,
+              "%s: Input file is too big. Maximum allowed size is: %d\n",
+              progname, (int)maxsize);
+      free(script);
+      return NULL;
+    }
+    grown = realloc(script, needed);
+    if (grown == NULL) {
+      // On failure realloc() leaves the old buffer allocated; release it.
+      fprintf(stderr,
+              "%s: Failed to realloc %d bytes of memory for the script\n",
+              progname, (int)needed);
+      free(script);
+      return NULL;
+    }
+    script = grown;
+    memcpy(script + used, chunk, nread);
+    used += nread;
+  }
+
+  if (ferror(stdin)) {
+    free(script);
+    fprintf(stderr, "%s: Error reading from stdin\n", progname);
+    return NULL;
+  }
+  script[used] = '\0';
+  return script;
+}
+
+// Entry point of the pacparse command.
+//
+// Parses the command line, loads the PAC script named by -p (or read from
+// stdin when it is "-"), sets the client IP from -c or, failing that, from
+// the host of the -U URL, and prints the proxy string that the PAC script
+// returns for the -u URL. When -u is not given nothing is printed.
+//
+// Exit status: 0 on success; 1 on a usage error, when the PAC script cannot
+// be read or parsed, when the host of the -u URL cannot be determined, or
+// when no proxy can be found; 2 when the client IP cannot be derived from -U.
+int main(int argc, char* argv[])
+{
+  char *pacfile=NULL, *url=NULL, *host=NULL, *clientip=NULL, *pacurl=NULL;
+  int ipversion = 0;
+  int enable_microsoft_extensions = 0;
+  int c;  // getopt() returns an int, with -1 marking the end of the options
+
+  while ((c = getopt(argc, argv, "edv46p:u:h:c:U:")) != -1)
+    switch (c)
+    {
+      case 'v':
+        printf("%s %s; pacparser library %s\n", progname, PACPARSE_VERSION,
+               pacparser_version());
+        return 0;
+      case 'p':
+        pacfile = optarg;
+        break;
+      case 'u':
+        url = optarg;
+        break;
+      case 'h':
+        host = optarg;
+        break;
+      case 'c':
+        clientip = optarg;
+        break;
+      case 'U':
+        pacurl = optarg;
+        break;
+      case '4':
+        ipversion = 4;
+        break;
+      case '6':
+        ipversion = 6;
+        break;
+      case 'e':
+        enable_microsoft_extensions = 1;
+        break;
+      case 'd':
+        pacdebug = 1;
+        putenv("PACPARSER_DEBUG=1");
+        putenv("DEBUG=1"); /* for older versions of the library */
+        break;
+      case '?':
+        usage();
+        return 1;
+      default:
+        abort ();
+    }
+
+  if (!pacfile) {
+    fprintf(stderr, "%s: You didn't specify the PAC file\n", progname);
+    usage();
+    return 1;
+  }
+
+  if(enable_microsoft_extensions)
+    pacparser_enable_microsoft_extensions();
+
+  // initialize pacparser
+  if (!pacparser_init()) {
+    fprintf(stderr, "%s: Could not initialize pacparser\n", progname);
+    return 1;
+  }
+
+  if (strcmp("-", pacfile) == 0) {
+    // Read pacfile from stdin
+    char *script = read_script_from_stdin();
+    if (script == NULL) {
+      pacparser_cleanup();
+      return 1;
+    }
+    if(!pacparser_parse_pac_string(script)) {
+      fprintf(stderr, "%s: Could not parse the pac script:\n%s\n",
+              progname, script);
+      free(script);
+      pacparser_cleanup();
+      return 1;
+    }
+    free(script);
+  }
+  else {
+    if(!pacparser_parse_pac_file(pacfile)) {
+      fprintf(stderr, "%s: Could not parse the pac file: %s\n",
+              progname, pacfile);
+      pacparser_cleanup();
+      return 1;
+    }
+  }
+
+  // An explicit -c address takes precedence over one derived from -U.
+  if(clientip)
+    pacparser_setmyip(clientip);
+  else if(pacurl) {
+    char *pachost = get_host_from_url(pacurl);
+    if (!pachost) {
+      fprintf(stderr, "%s: Error finding hostname in %s\n", progname, pacurl);
+      pacparser_cleanup();
+      return 2;
+    }
+    set_myip_from_host(pachost, ipversion);
+  }
+
+  if (url) {
+    char *proxy;
+
+    if (!host)
+      host = get_host_from_url(url);
+    if (!host) {
+      // get_host_from_url() has already reported the malformed URL; make
+      // sure the failure is also visible in the exit status.
+      pacparser_cleanup();
+      return 1;
+    }
+    proxy = pacparser_find_proxy(url, host);
+    if (proxy == NULL) {
+      fprintf(stderr, "%s: Problem in finding proxy for %s\n", progname, url);
+      pacparser_cleanup();
+      return 1;
+    }
+    printf("%s\n", proxy);
+  }
+
+  pacparser_cleanup();
+  return 0;
+}
diff --git a/src/pacwget b/src/pacwget
new file mode 100755
index 00000000..e51ce1ec
--- /dev/null
+++ b/src/pacwget
@@ -0,0 +1,272 @@
+#!/bin/bash
+# Wget wrapper to support Proxy Auto Config files, multiple proxies,
+# and round-robin server names.
+# Requires wget and the pacparse command, which is part of the pacparser package.
+# Written by Dave Dykstra, March 2013
+#
+# pacwget is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 3 of the License, or (at your option) any later version.
+#
+# pacwget is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+
+ME=pacwget # program name; prefixes the usage line and the error messages
+
+# Print the help text on stderr, then terminate the script with status 1.
+# Reads the global ME for the program name shown in the synopsis line.
+usage()
+{
+    printf '%s\n' "Usage: $ME [--only-proxies] [wget_options]" '' \
+        '$PAC_URLS is a semicolon-separated list of URLs to try to read' \
+        ' Proxy Auto Config files from if $HTTP_PROXIES and $http_proxy are' \
+        ' not set. "auto" in the list is equivalent to http://wpad/wpad.dat' \
+        ' and "DIRECT" at the end indicates direct connection to server.' \
+        ' Everything else must begin with http:// or file://.' \
+        ' Default is "auto; DIRECT".' \
+        '$HTTP_PROXIES is a semicolon-separated list of http proxy URLs.' \
+        ' "DIRECT" at the end indicates direct connection to server.' \
+        '$http_proxy is single http proxy URL.' \
+        'For http:// URLs, tries all proxies and servers until one succeeds,' \
+        ' including round-robin DNS names.' \
+        'By default adds these wget options:' \
+        ' --tries=1 --connect-timeout=5 --read-timeout=10' \
+        'Also adds wget -q when reading PAC URLs if -v or -d not given' \
+        'If --only-proxies is given, prints to stdout a $HTTP_PROXIES-like list of' \
+        ' proxies that would be used and does not load the given http URL with wget.' \
+        >&2
+    exit 1
+}
+
+# Scan the command line for the options and URLs that pacwget itself needs to
+# know about.  The positional parameters are only read here, not consumed.
+#   CONNECTTIMEOUT, READTIMEOUT  user-supplied timeouts, empty if not given
+#   AHTTPURL       first http:// URL on the command line
+#   AURL           last URL seen that uses any other scheme
+#   IPVERSION      "-4" or "-6" when an address family was requested
+#   ONLYPROXIES    "true" when --only-proxies was given, otherwise "false"
+#   PACPARSEDEBUG  "-d" when debugging was requested
+#   PACWGETQUIET   "-q" unless -v or -d was given
+if [ $# = 0 ]; then
+    usage
+fi
+
+CONNECTTIMEOUT=""
+READTIMEOUT=""
+PREV_ARG=""    # preceding word, so that a detached "-T" value can be spotted
+AHTTPURL=""
+AURL=""
+IPVERSION=""
+ONLYPROXIES=false
+PACPARSEDEBUG=""
+PACWGETQUIET="-q"
+for ARG; do
+    case "$PREV_ARG" in
+        -T|-[!-]*T)
+            # The previous word was -T, alone or ending a cluster of short
+            # flags, so this word is its value.  It is recorded as both the
+            # connect and the read timeout and must not be examined as an
+            # option or URL itself.
+            CONNECTTIMEOUT="$ARG"
+            READTIMEOUT="$ARG"
+            PREV_ARG=""
+            continue
+            ;;
+    esac
+    PREV_ARG="$ARG"
+    case "$ARG" in
+        --connect-timeout=*) CONNECTTIMEOUT=${ARG#*=} ;;
+        --read-timeout=*) READTIMEOUT=${ARG#*=} ;;
+        --only-proxies) ONLYPROXIES=true ;;
+        --inet4-only|-4) IPVERSION="-4" ;;
+        --inet6-only|-6) IPVERSION="-6" ;;
+        --debug|-d)
+            PACPARSEDEBUG="-d"
+            PACWGETQUIET=""
+            ;;
+        --verbose|-v) PACWGETQUIET="" ;;
+        http://*)
+            # only the first http URL is remembered
+            if [ -z "$AHTTPURL" ]; then AHTTPURL="$ARG"; fi
+            ;;
+        *://*) AURL="$ARG" ;;
+    esac
+done
+
+PACWGETARGS="--tries=1 $PACWGETQUIET $IPVERSION"
+WGETARGS="--tries=1"
+if [ -n "$CONNECTTIMEOUT" ]; then
+ PACWGETARGS="$PACWGETARGS --connect-timeout=$CONNECTTIMEOUT"
+else
+ PACWGETARGS="$PACWGETARGS --connect-timeout=5"
+ WGETARGS="$WGETARGS --connect-timeout=5"
+fi
+if [ -n "$READTIMEOUT" ]; then
+ PACWGETARGS="$PACWGETARGS --read-timeout=$READTIMEOUT"
+else
+ PACWGETARGS="$PACWGETARGS --read-timeout=10"
+ WGETARGS="$WGETARGS --read-timeout=10"
+fi
+if [ -n "$PACPARSEDEBUG" ]; then
+ PACWGETARGS="$PACWGETARGS -d"
+fi
+
+if [ -z "$AHTTPURL" ]; then
+ if [ -z "$AURL" ] || $ONLYPROXIES; then
+ usage
+ fi
+ # no HTTP URLs but there are other URLs so just let wget handle it
+ exec wget $WGETARGS "$@"
+fi
+
+if [ -n "$HTTP_PROXIES" ]; then
+ PROXIES="${HTTP_PROXIES//;/ }"
+elif [ -n "$http_proxy" ]; then
+ PROXIES="$http_proxy"
+else
+ if [ -z "$PAC_URLS" ]; then
+ PAC_URLS="auto; DIRECT"
+ fi
+ PACTMPFILE=`mktemp /tmp/pacwgetXXXXXXXXXX`
+ trap "rm -f $PACTMPFILE" 0
+ for PACURL in ${PAC_URLS//;/ }; do
+ if [ "$PACURL" = "auto" ]; then
+ PACURL="http://wpad/wpad.dat"
+ elif [ "$PACURL" = "DIRECT" ]; then
+ PROXIES="DIRECT"
+ break
+ fi
+ if [ "$PACURL" != "${PACURL#file://}" ]; then
+ PACFILE=/${PACURL#file://*/}
+ if [ ! -r "$PACFILE" ]; then
+ echo "$ME: $PACFILE does not exist or is not readable" >&2
+ exit 2
+ fi
+ PROXYLIST="`pacparse $IPVERSION $PACPARSEDEBUG -p $PACFILE -u "$AHTTPURL"`"
+ PACPARSERET=$?
+ elif [ "$PACURL" != "${PACURL#http://}" ]; then
+ if [ -n "$PACPARSEDEBUG" ]; then
+ echo "DEBUG: wgetting $PACURL"
+ fi
+ if wget $PACWGETARGS -O$PACTMPFILE $PACURL; then
+ PROXYLIST="`pacparse $IPVERSION $PACPARSEDEBUG -p $PACTMPFILE -u "$AHTTPURL" -U "$PACURL"`"
+ PACPARSERET=$?
+ else
+ # wget of a PAC file failed, silently continue to next one
+ continue
+ fi
+ else
+ echo "$ME: $PACURL isn't auto or DIRECT" >&2
+ echo " and doesn't begin with http:// or file://" >&2
+ exit 2
+ fi
+ if [ "$PACPARSERET" != 0 ]; then
+ echo "$ME: failed Proxy Auto Config parse of $PACURL" >&2
+ echo " with URL $AHTTPURL" >&2
+ exit $PACPARSERET
+ fi
+ case "$PROXYLIST" in
+ "PROXY "*|"DIRECT")
+ PROXIES="${PROXYLIST//PROXY /}"
+ PROXIES="${PROXIES//;/ }"
+ break
+ ;;
+ esac
+ echo "$ME: Proxy Auto Config parse of $PACURL" >&2
+ echo " with URL $AHTTPURL" >&2
+ echo " did not return list starting with PROXY or DIRECT" >&2
+ echo " Instead it returned: $PROXYLIST" >&2
+ exit 2
+ done
+ if [ -z "$PROXIES" ]; then
+ echo "$ME: no proxy found from PAC_URLS=\"$PAC_URLS\"" >&2
+ exit 2
+ fi
+ rm -f $PACTMPFILE
+ trap 0
+fi
+
+# --only-proxies: print the list as one ';'-separated line and stop here.
+if $ONLYPROXIES; then
+    PROXIES=$(echo $PROXIES)    # unquoted echo squeezes blank runs to one
+    echo ${PROXIES// /;}; exit
+fi
+
+# Execute wget for each URL separately
+# It would be nice to be able to execute wget once for all URLs, but it
+# sometimes returns success even if one of the URLs fails.  Also, it never
+# reuses a connection to a proxy anyway so it doesn't make a huge difference.
+# At least the list of proxies determined above can be reused.
+# Options are collected in an array so that option words containing blanks
+# (e.g. --header='Accept: text/html') reach wget as a single argument.
+WGETOPTS=($WGETARGS)
+RET=0
+for ARG; do
+    case "$ARG" in
+        http://*) ;;
+        *://*)
+            # other schemes: no proxy handling, stop at the first failure
+            wget "${WGETOPTS[@]}" "$ARG"
+            RET=$?
+            if [ $RET = 0 ]; then continue; fi
+            break
+            ;;
+        *)
+            # not a URL: an option (or option value) for the wget calls that follow
+            WGETOPTS+=("$ARG")
+            continue
+            ;;
+    esac
+
+    # http:// -- try wget until one of the proxies succeeds
+    for PROXY in $PROXIES; do
+        if [ "$PROXY" = DIRECT ]; then
+            unset http_proxy
+        else
+            export http_proxy="$PROXY"
+        fi
+        wget "${WGETOPTS[@]}" "$ARG"
+        RET=$?
+        if [ $RET = 0 ]; then
+            break
+        fi
+        if [ "$PROXY" != DIRECT ]; then
+            # if using a proxy, and server has more than one address,
+            # try individual addresses too if can find them, to make
+            # sure all have been tried
+            # The host is taken from the URL being fetched (not from the first URL).
+            AHOST="${ARG#http://}"
+            AHOST="${AHOST%%/*}"
+            AHOST="${AHOST%:*}"
+            IPS="$(host "$AHOST" 2>/dev/null | sed -n "s/.* has address //p")"
+            if [ $(echo "$IPS" | wc -l) -lt 2 ]; then
+                continue
+            fi
+            for IP in $IPS; do
+                wget "${WGETOPTS[@]}" "${ARG/$AHOST/$IP}"
+                RET=$?
+                if [ $RET = 0 ]; then
+                    break
+                fi
+            done
+        fi
+        if [ $RET = 0 ]; then
+            break
+        fi
+    done
+    if [ $RET != 0 ]; then
+        echo "$ME: error getting $ARG" >&2
+        echo " with HTTP_PROXIES=${PROXIES// /; }" >&2
+        break
+    fi
+done
+
+exit $RET