// xref: /curl/docs/examples/htmltitle.cpp (revision 127eb0d8)
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 ***************************************************************************/
/* <DESC>
 * Get a web page, extract the title with libxml.
 * </DESC>

 Written by Lars Nilsson

 GNU C++ compile command line suggestion (edit paths accordingly):

 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
*/
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <string>
39 #include <curl/curl.h>
40 #include <libxml/HTMLparser.h>
41 
//
//  Case-insensitive string comparison
//
45 
// COMPARE(a, b) is true when the NUL-terminated C strings a and b are equal
// ignoring case. MSVC builds use _stricmp; every other toolchain uses
// strcasecmp.
#ifdef _MSC_VER
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
51 
//
//  libxml callback context structure
//
55 
// State handed to the libxml SAX callbacks while a document is parsed.
struct Context
{
  // Begin with collection switched off and an empty title.
  Context(): addTitle(false), title() { }

  bool addTitle;      // true while character data is being added to title
  std::string title;  // title text collected so far
};
63 
//
//  libcurl variables for error strings and returned data
//
66 
67 static char errorBuffer[CURL_ERROR_SIZE];
68 static std::string buffer;
69 
//
//  libcurl write callback function
//
73 
// Appends the size * nmemb bytes found at data to the std::string that
// writerData points to.
//
// The signature follows the prototype libcurl documents for
// CURLOPT_WRITEFUNCTION: the result is a size_t (not an int, which cannot
// represent every byte count) and the user pointer arrives as void *.
//
// Returns the number of bytes stored. Returns 0 when writerData is NULL or
// when the data could not be appended.
static size_t writer(char *data, size_t size, size_t nmemb,
                     void *writerData)
{
  std::string *sink = static_cast<std::string *>(writerData);
  if(sink == NULL)
    return 0;

  const size_t total = size * nmemb;

  // This function is invoked from C code, so no C++ exception (for example
  // an allocation failure) may leave it; report a short write instead.
  try {
    sink->append(data, total);
  }
  catch(...) {
    return 0;
  }

  return total;
}
84 
//
//  libcurl connection initialization
//
88 
init(CURL * & conn,char * url)89 static bool init(CURL *&conn, char *url)
90 {
91   CURLcode code;
92 
93   conn = curl_easy_init();
94 
95   if(conn == NULL) {
96     fprintf(stderr, "Failed to create CURL connection\n");
97     exit(EXIT_FAILURE);
98   }
99 
100   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101   if(code != CURLE_OK) {
102     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
103     return false;
104   }
105 
106   code = curl_easy_setopt(conn, CURLOPT_URL, url);
107   if(code != CURLE_OK) {
108     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
109     return false;
110   }
111 
112   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
113   if(code != CURLE_OK) {
114     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
115     return false;
116   }
117 
118   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
119   if(code != CURLE_OK) {
120     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
121     return false;
122   }
123 
124   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
125   if(code != CURLE_OK) {
126     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
127     return false;
128   }
129 
130   return true;
131 }
132 
//
//  libxml start element callback function
//
136 
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)137 static void StartElement(void *voidContext,
138                          const xmlChar *name,
139                          const xmlChar **attributes)
140 {
141   Context *context = static_cast<Context *>(voidContext);
142 
143   if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) {
144     context->title = "";
145     context->addTitle = true;
146   }
147   (void) attributes;
148 }
149 
//
//  libxml end element callback function
//
153 
EndElement(void * voidContext,const xmlChar * name)154 static void EndElement(void *voidContext,
155                        const xmlChar *name)
156 {
157   Context *context = static_cast<Context *>(voidContext);
158 
159   if(COMPARE(reinterpret_cast<char *>(name), "TITLE"))
160     context->addTitle = false;
161 }
162 
//
//  Text handling helper function
//
166 
handleCharacters(Context * context,const xmlChar * chars,int length)167 static void handleCharacters(Context *context,
168                              const xmlChar *chars,
169                              int length)
170 {
171   if(context->addTitle)
172     context->title.append(reinterpret_cast<char *>(chars), length);
173 }
174 
//
//  libxml PCDATA callback function
//
178 
Characters(void * voidContext,const xmlChar * chars,int length)179 static void Characters(void *voidContext,
180                        const xmlChar *chars,
181                        int length)
182 {
183   Context *context = static_cast<Context *>(voidContext);
184 
185   handleCharacters(context, chars, length);
186 }
187 
//
//  libxml CDATA callback function
//
191 
cdata(void * voidContext,const xmlChar * chars,int length)192 static void cdata(void *voidContext,
193                   const xmlChar *chars,
194                   int length)
195 {
196   Context *context = static_cast<Context *>(voidContext);
197 
198   handleCharacters(context, chars, length);
199 }
200 
//
//  libxml SAX callback structure
//
204 
205 static htmlSAXHandler saxHandler =
206 {
207   NULL,
208   NULL,
209   NULL,
210   NULL,
211   NULL,
212   NULL,
213   NULL,
214   NULL,
215   NULL,
216   NULL,
217   NULL,
218   NULL,
219   NULL,
220   NULL,
221   StartElement,
222   EndElement,
223   NULL,
224   Characters,
225   NULL,
226   NULL,
227   NULL,
228   NULL,
229   NULL,
230   NULL,
231   NULL,
232   cdata,
233   NULL
234 };
235 
//
//  Parse given (assumed to be) HTML text and return the title
//
239 
parseHtml(const std::string & html,std::string & title)240 static void parseHtml(const std::string &html,
241                       std::string &title)
242 {
243   htmlParserCtxtPtr ctxt;
244   Context context;
245 
246   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
247                                   XML_CHAR_ENCODING_NONE);
248 
249   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
250   htmlParseChunk(ctxt, "", 0, 1);
251 
252   htmlFreeParserCtxt(ctxt);
253 
254   title = context.title;
255 }
256 
main(int argc,char * argv[])257 int main(int argc, char *argv[])
258 {
259   CURL *conn = NULL;
260   CURLcode code;
261   std::string title;
262 
263   // Ensure one argument is given
264 
265   if(argc != 2) {
266     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
267     exit(EXIT_FAILURE);
268   }
269 
270   curl_global_init(CURL_GLOBAL_DEFAULT);
271 
272   // Initialize CURL connection
273 
274   if(!init(conn, argv[1])) {
275     fprintf(stderr, "Connection initialization failed\n");
276     exit(EXIT_FAILURE);
277   }
278 
279   // Retrieve content for the URL
280 
281   code = curl_easy_perform(conn);
282   curl_easy_cleanup(conn);
283 
284   if(code != CURLE_OK) {
285     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
286     exit(EXIT_FAILURE);
287   }
288 
289   // Parse the (assumed) HTML code
290   parseHtml(buffer, title);
291 
292   // Display the extracted title
293   printf("Title: %s\n", title.c_str());
294 
295   return EXIT_SUCCESS;
296 }
297