/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */
32 
#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>

#include <curl/curl.h>

#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
40 #include <signal.h>
41 
/* Parameters -- tune the crawl's aggressiveness here */
static int max_con = 200;             /* total simultaneous connections */
static int max_total = 20000;         /* stop after this many transfers */
static int max_requests = 500;        /* cap on transfers in flight at once */
static size_t max_link_per_page = 5;  /* links followed per parsed page */
static int follow_relative_links = 0; /* nonzero: resolve relative hrefs too */
static const char *start_page = "https://www.reuters.com"; /* crawl seed */
49 
/* Set asynchronously from the SIGINT handler; volatile sig_atomic_t is the
   only object type the C standard guarantees can be safely written from a
   signal handler (plain int, as before, was formally undefined behavior). */
static volatile sig_atomic_t pending_interrupt = 0;

/* SIGINT handler: only record the request; the main loop checks the flag
   and shuts down cleanly. No async-signal-unsafe calls in here. */
static void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}
56 
/* resizable buffer that accumulates one transfer's response body */
typedef struct {
  char *buf;   /* heap buffer grown with realloc(); freed by the caller */
  size_t size; /* number of bytes currently stored in buf */
} memory;

/* CURLOPT_WRITEFUNCTION callback: append the incoming chunk to the buffer.
   Returning anything other than sz*nmemb tells libcurl to abort the
   transfer, which is what we do on allocation failure. */
static size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  /* libcurl keeps sz*nmemb at or below CURL_MAX_WRITE_SIZE, so the
     multiplication cannot overflow here */
  size_t realsize = sz * nmemb;
  memory *mem = (memory *)ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory: diagnostics belong on stderr (was printf/stdout) */
    fprintf(stderr, "not enough memory (realloc returned NULL)\n");
    return 0; /* make libcurl fail this transfer */
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
78 
make_handle(const char * url)79 static CURL *make_handle(const char *url)
80 {
81   CURL *handle = curl_easy_init();
82   memory *mem;
83 
84   /* Important: use HTTP2 over HTTPS */
85   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
86   curl_easy_setopt(handle, CURLOPT_URL, url);
87 
88   /* buffer body */
89   mem = malloc(sizeof(memory));
90   mem->size = 0;
91   mem->buf = malloc(1);
92   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
93   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
94   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
95 
96   /* For completeness */
97   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
98   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
99   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
100   /* only allow redirects to HTTP and HTTPS URLs */
101   curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
102   curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
103   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
104   /* each transfer needs to be done within 20 seconds! */
105   curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
106   /* connect fast or fail */
107   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
108   /* skip files larger than a gigabyte */
109   curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
110                    (curl_off_t)1024*1024*1024);
111   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
112   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
113   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
114   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
115   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
116   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
117   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
118   return handle;
119 }
120 
121 /* HREF finder implemented in libxml2 but could be any HTML parser */
follow_links(CURLM * multi_handle,memory * mem,const char * url)122 static size_t follow_links(CURLM *multi_handle, memory *mem, const char *url)
123 {
124   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
125              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
126   htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
127   size_t count;
128   int i;
129   xmlChar *xpath;
130   xmlNodeSetPtr nodeset;
131   xmlXPathContextPtr context;
132   xmlXPathObjectPtr result;
133   if(!doc)
134     return 0;
135   xpath = (xmlChar*) "//a/@href";
136   context = xmlXPathNewContext(doc);
137   result = xmlXPathEvalExpression(xpath, context);
138   xmlXPathFreeContext(context);
139   if(!result)
140     return 0;
141   nodeset = result->nodesetval;
142   if(xmlXPathNodeSetIsEmpty(nodeset)) {
143     xmlXPathFreeObject(result);
144     return 0;
145   }
146   count = 0;
147   for(i = 0; i < nodeset->nodeNr; i++) {
148     double r = rand();
149     int x = (int)(r * nodeset->nodeNr / RAND_MAX);
150     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
151     xmlChar *href = xmlNodeListGetString(doc, node, 1);
152     char *link;
153     if(follow_relative_links) {
154       xmlChar *orig = href;
155       href = xmlBuildURI(href, (xmlChar *) url);
156       xmlFree(orig);
157     }
158     link = (char *) href;
159     if(!link || strlen(link) < 20)
160       continue;
161     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
162       curl_multi_add_handle(multi_handle, make_handle(link));
163       if(count++ == max_link_per_page)
164         break;
165     }
166     xmlFree(link);
167   }
168   xmlXPathFreeObject(result);
169   return count;
170 }
171 
/* True when the Content-Type header names an HTML document. Tolerates a
   NULL ctype (header absent) and trailing parameters such as
   "; charset=utf-8". The original's strlen(ctype) > 10 guard wrongly
   rejected a bare "text/html" (9 characters). */
static int is_html(const char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
176 
main(void)177 int main(void)
178 {
179   CURLM *multi_handle;
180   int msgs_left;
181   int pending;
182   int complete;
183   int still_running;
184 
185   signal(SIGINT, sighandler);
186   LIBXML_TEST_VERSION
187   curl_global_init(CURL_GLOBAL_DEFAULT);
188   multi_handle = curl_multi_init();
189   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
190   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
191 
192   /* enables http/2 if available */
193 #ifdef CURLPIPE_MULTIPLEX
194   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
195 #endif
196 
197   /* sets html start page */
198   curl_multi_add_handle(multi_handle, make_handle(start_page));
199 
200   pending = 0;
201   complete = 0;
202   still_running = 1;
203   while(still_running && !pending_interrupt) {
204     int numfds;
205     CURLMsg *m;
206 
207     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
208     curl_multi_perform(multi_handle, &still_running);
209 
210     /* See how the transfers went */
211     m = NULL;
212     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
213       if(m->msg == CURLMSG_DONE) {
214         CURL *handle = m->easy_handle;
215         char *url;
216         memory *mem;
217         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
218         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
219         if(m->data.result == CURLE_OK) {
220           long res_status;
221           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
222           if(res_status == 200) {
223             char *ctype;
224             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
225             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
226             if(is_html(ctype) && mem->size > 100) {
227               if(pending < max_requests && (complete + pending) < max_total) {
228                 pending += follow_links(multi_handle, mem, url);
229                 still_running = 1;
230               }
231             }
232           }
233           else {
234             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
235           }
236         }
237         else {
238           printf("[%d] Connection failure: %s\n", complete, url);
239         }
240         curl_multi_remove_handle(multi_handle, handle);
241         curl_easy_cleanup(handle);
242         free(mem->buf);
243         free(mem);
244         complete++;
245         pending--;
246       }
247     }
248   }
249   curl_multi_cleanup(multi_handle);
250   curl_global_cleanup();
251   return 0;
252 }
253