diff --git a/doc/wget.texi b/doc/wget.texi index d9ed17d68e..f3b75ed565 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -1916,6 +1916,10 @@ case. Turn on recursive retrieving. @xref{Recursive Download}, for more details. The default maximum depth is 5. +@item --queue-type=@var{queuetype} +Specify the queue type (@pxref{Recursive Download}). Accepted values +are @samp{fifo} (the default) and @samp{lifo}. + @item -l @var{depth} @itemx --level=@var{depth} Specify recursion maximum depth level @var{depth} (@pxref{Recursive @@ -2296,6 +2300,14 @@ documents linked by them, and so on. In other words, Wget first downloads the documents at depth 1, then those at depth 2, and so on until the specified maximum depth. +The @dfn{queue type} is FIFO (the default) or LIFO. FIFO downloads +(dequeues) the files that were enqueued first; LIFO downloads the +files that were enqueued last first. LIFO can keep links from +expiring before they are downloaded, because each file is retrieved +right after its parent page, and thus right after that page and any +temporary links in it were generated, if it is a dynamic page. Pages +sometimes use temporary links to prevent direct links to files. + The maximum @dfn{depth} to which the retrieval may descend is specified with the @samp{-l} option. The default maximum depth is five layers. 
diff --git a/src/init.c b/src/init.c index 569b25b2bd..71b1203215 100644 --- a/src/init.c +++ b/src/init.c @@ -104,6 +104,7 @@ CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_prefer_family); CMD_DECLARE (cmd_spec_progress); +CMD_DECLARE (cmd_spec_queue_type); CMD_DECLARE (cmd_spec_recursive); CMD_DECLARE (cmd_spec_regex_type); CMD_DECLARE (cmd_spec_restrict_file_names); @@ -247,6 +248,7 @@ static const struct { { "proxypasswd", &opt.proxy_passwd, cmd_string }, /* deprecated */ { "proxypassword", &opt.proxy_passwd, cmd_string }, { "proxyuser", &opt.proxy_user, cmd_string }, + { "queuetype", &opt.queue_type, cmd_spec_queue_type }, { "quiet", &opt.quiet, cmd_boolean }, { "quota", &opt.quota, cmd_bytes_sum }, #ifdef HAVE_SSL @@ -403,6 +405,8 @@ defaults (void) opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; + opt.queue_type = queue_type_fifo; + opt.regex_type = regex_type_posix; opt.max_redirect = 20; @@ -1441,6 +1445,23 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored _GL_UN return true; } +/* Validate --queue-type and set the choice. */ + +static bool +cmd_spec_queue_type (const char *com, const char *val, void *place_ignored _GL_UNUSED) +{ + static const struct decode_item choices[] = { + { "fifo", queue_type_fifo }, + { "lifo", queue_type_lifo }, + }; + int queue_type = queue_type_fifo; + int ok = decode_string (val, choices, countof (choices), &queue_type); + if (!ok) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + opt.queue_type = queue_type; + return ok; +} + /* Validate --regex-type and set the choice. 
*/ static bool diff --git a/src/main.c b/src/main.c index 6feb1403cc..b4e5eab5a5 100644 --- a/src/main.c +++ b/src/main.c @@ -272,6 +272,7 @@ static struct cmdline_option option_data[] = { "proxy-passwd", 0, OPT_VALUE, "proxypassword", -1 }, /* deprecated */ { "proxy-password", 0, OPT_VALUE, "proxypassword", -1 }, { "proxy-user", 0, OPT_VALUE, "proxyuser", -1 }, + { "queue-type", 0, OPT_VALUE, "queuetype", -1 }, { "quiet", 'q', OPT_BOOLEAN, "quiet", -1 }, { "quota", 'Q', OPT_VALUE, "quota", -1 }, { "random-file", 0, OPT_VALUE, "randomfile", -1 }, @@ -736,6 +737,8 @@ WARC options:\n"), Recursive download:\n"), N_("\ -r, --recursive specify recursive download\n"), + N_("\ + --queue-type=TYPE queue type (fifo|lifo)\n"), N_("\ -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite)\n"), N_("\ diff --git a/src/options.h b/src/options.h index b99512650a..6f209cb2b1 100644 --- a/src/options.h +++ b/src/options.h @@ -46,6 +46,10 @@ struct options bool relative_only; /* Follow only relative links. */ bool no_parent; /* Restrict access to the parent directory. */ + enum { + queue_type_fifo, + queue_type_lifo + } queue_type; /* Recursion queue type */ int reclevel; /* Maximum level of recursion */ bool dirstruct; /* Do we build the directory structure as we go along? */ diff --git a/src/recur.c b/src/recur.c index b6b9dc6a9c..e0b72aa41f 100644 --- a/src/recur.c +++ b/src/recur.c @@ -51,6 +51,64 @@ as that of the covered work. 
*/ #include "css-url.h" #include "spider.h" +/* Linked list bubble sort from http://stackoverflow.com/questions/19522121 */ + +void ll_bubblesort(struct urlpos **pp) +{ + // p always points to the head of the list + struct urlpos *p = *pp; + *pp = 0; + + while (p) + { + struct urlpos **lhs = &p; + struct urlpos **rhs = &p->next; + bool swapped = false; + + // keep going until qq holds the address of a null pointer + while (*rhs) + { + // if the right side is greater than the left side + if ((*rhs)->link_expect_html > (*lhs)->link_expect_html) + { + // swap linked node ptrs, then swap *back* their next ptrs + struct urlpos *tmp = *lhs; + *lhs = *rhs; + *rhs = tmp; + tmp = (*lhs)->next; + (*lhs)->next = (*rhs)->next; + (*rhs)->next = tmp; + lhs = &(*lhs)->next; + swapped = true; + } + else + { // no swap. advance both pointer-pointers + lhs = rhs; + rhs = &(*rhs)->next; + } + } + + // link last node to the sorted segment + *rhs = *pp; + + // if we swapped, detach the final node, terminate the list, and continue. + if (swapped) + { + // take the last node off the list and push it into the result. + *pp = *lhs; + *lhs = 0; + } + + // otherwise we're done. since no swaps happened the list is sorted. + // set the output parameter and terminate the loop. + else + { + *pp = p; + break; + } + } +} + /* Functions for maintaining the URL queue. */ struct queue_element { @@ -62,6 +120,7 @@ struct queue_element { struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ + struct queue_element *prev; /* previous element in queue */ struct queue_element *next; /* next element in queue */ }; @@ -88,9 +147,9 @@ url_queue_delete (struct url_queue *queue) xfree (queue); } -/* Enqueue a URL in the queue. The queue is FIFO: the items will be - retrieved ("dequeued") from the queue in the order they were placed - into it. */ +/* Enqueue a URL in the queue. 
The queue is FIFO (LIFO): the items will be + retrieved ("dequeued") from the queue in the (opposite) order they were + placed into it. */ static void url_enqueue (struct url_queue *queue, struct iri *i, @@ -104,6 +163,7 @@ url_enqueue (struct url_queue *queue, struct iri *i, qel->depth = depth; qel->html_allowed = html_allowed; qel->css_allowed = css_allowed; + qel->prev = NULL; qel->next = NULL; ++queue->count; @@ -119,7 +179,11 @@ url_enqueue (struct url_queue *queue, struct iri *i, i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); if (queue->tail) + { + if (opt.queue_type == queue_type_lifo) + qel->prev = queue->tail; queue->tail->next = qel; + } queue->tail = qel; if (!queue->head) @@ -134,14 +198,36 @@ url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { - struct queue_element *qel = queue->head; + struct queue_element *qel; + + switch (opt.queue_type) + { + default: + case queue_type_fifo: + qel = queue->head; + break; + case queue_type_lifo: + qel = queue->tail; + break; + } if (!qel) return false; - queue->head = queue->head->next; - if (!queue->head) - queue->tail = NULL; + switch (opt.queue_type) + { + default: + case queue_type_fifo: + queue->head = queue->head->next; + if (!queue->head) + queue->tail = NULL; + break; + case queue_type_lifo: + queue->tail = queue->tail->prev; + if (!queue->tail) + queue->head = NULL; + break; + } *i = qel->iri; *url = qel->url; @@ -407,6 +493,10 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi) if (strip_auth) referer_url = url_string (url_parsed, URL_AUTH_HIDE); + /* Place html pages on top */ + if (opt.queue_type == queue_type_lifo) + ll_bubblesort(&child); + for (; child; child = child->next) { if (child->ignore_when_downloading)