1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 * SPDX-License-Identifier: curl
22 *
23 ***************************************************************************/
24 /* <DESC>
25 * Get a web page, extract the title with libxml.
26 * </DESC>
27
28 Written by Lars Nilsson
29
30 GNU C++ compile command line suggestion (edit paths accordingly):
31
32 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
33 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
34 */
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <string>
39 #include <curl/curl.h>
40 #include <libxml/HTMLparser.h>
41
42 //
43 // Case-insensitive string comparison
44 //
45
// COMPARE(a, b) evaluates to true when the two NUL-terminated strings are
// equal ignoring case. Windows provides _stricmp; elsewhere strcasecmp is
// used, which POSIX declares in <strings.h> (not <string.h>), so that header
// is pulled in here to avoid relying on a non-standard transitive include.
#ifdef _WIN32
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
#include <strings.h>
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
51
52 //
53 // libxml callback context structure
54 //
55
// State shared by the SAX callbacks while one document is being parsed.
struct Context
{
  // Set while a TITLE element is open, so that character data is collected.
  bool addTitle;

  // Text gathered for the title so far.
  std::string title;

  // Starts with capturing disabled and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
63
64 //
65 // libcurl variables for error strings and returned data
66
67 static char errorBuffer[CURL_ERROR_SIZE];
68 static std::string buffer;
69
70 //
71 // libcurl write callback function
72 //
73
// Appends size * nmemb bytes starting at data to *writerData and returns
// that byte count. Returns 0 without touching anything when writerData is
// NULL.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if(!writerData)
    return 0;

  const size_t total = size * nmemb;
  writerData->append(data, total);
  return total;
}
84
85 //
86 // libcurl connection initialization
87 //
88
init(CURL * & conn,const char * url)89 static bool init(CURL *&conn, const char *url)
90 {
91 CURLcode code;
92
93 conn = curl_easy_init();
94
95 if(conn == NULL) {
96 fprintf(stderr, "Failed to create CURL connection\n");
97 exit(EXIT_FAILURE);
98 }
99
100 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101 if(code != CURLE_OK) {
102 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
103 return false;
104 }
105
106 code = curl_easy_setopt(conn, CURLOPT_URL, url);
107 if(code != CURLE_OK) {
108 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
109 return false;
110 }
111
112 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
113 if(code != CURLE_OK) {
114 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
115 return false;
116 }
117
118 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
119 if(code != CURLE_OK) {
120 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
121 return false;
122 }
123
124 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
125 if(code != CURLE_OK) {
126 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
127 return false;
128 }
129
130 return true;
131 }
132
133 //
134 // libxml start element callback function
135 //
136
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)137 static void StartElement(void *voidContext,
138 const xmlChar *name,
139 const xmlChar **attributes)
140 {
141 Context *context = static_cast<Context *>(voidContext);
142
143 if(COMPARE(reinterpret_cast<const char *>(name), "TITLE")) {
144 context->title = "";
145 context->addTitle = true;
146 }
147 (void) attributes;
148 }
149
150 //
151 // libxml end element callback function
152 //
153
EndElement(void * voidContext,const xmlChar * name)154 static void EndElement(void *voidContext,
155 const xmlChar *name)
156 {
157 Context *context = static_cast<Context *>(voidContext);
158
159 if(COMPARE(reinterpret_cast<const char *>(name), "TITLE"))
160 context->addTitle = false;
161 }
162
163 //
164 // Text handling helper function
165 //
166
handleCharacters(Context * context,const xmlChar * chars,int length)167 static void handleCharacters(Context *context,
168 const xmlChar *chars,
169 int length)
170 {
171 if(context->addTitle)
172 context->title.append(reinterpret_cast<const char *>(chars),
173 (unsigned long)length);
174 }
175
176 //
177 // libxml PCDATA callback function
178 //
179
Characters(void * voidContext,const xmlChar * chars,int length)180 static void Characters(void *voidContext,
181 const xmlChar *chars,
182 int length)
183 {
184 Context *context = static_cast<Context *>(voidContext);
185
186 handleCharacters(context, chars, length);
187 }
188
189 //
190 // libxml CDATA callback function
191 //
192
cdata(void * voidContext,const xmlChar * chars,int length)193 static void cdata(void *voidContext,
194 const xmlChar *chars,
195 int length)
196 {
197 Context *context = static_cast<Context *>(voidContext);
198
199 handleCharacters(context, chars, length);
200 }
201
202 //
203 // libxml SAX callback structure
204 //
205
// Callback table handed to the HTML push parser in parseHtml(). Only four
// callbacks are installed (StartElement, EndElement, Characters, cdata);
// every other slot is left empty.
//
// The initializer is positional, so entries must never be reordered, added
// or removed without checking the structure's declaration.
// NOTE(review): the slot names in the trailing comments follow libxml2's
// xmlSAXHandler declaration - confirm against the installed header.
static htmlSAXHandler saxHandler =
{
  NULL,          // internalSubset
  NULL,          // isStandalone
  NULL,          // hasInternalSubset
  NULL,          // hasExternalSubset
  NULL,          // resolveEntity
  NULL,          // getEntity
  NULL,          // entityDecl
  NULL,          // notationDecl
  NULL,          // attributeDecl
  NULL,          // elementDecl
  NULL,          // unparsedEntityDecl
  NULL,          // setDocumentLocator
  NULL,          // startDocument
  NULL,          // endDocument
  StartElement,  // startElement: begins title capture on TITLE
  EndElement,    // endElement: stops title capture on TITLE
  NULL,          // reference
  Characters,    // characters: text is appended while capturing
  NULL,          // ignorableWhitespace
  NULL,          // processingInstruction
  NULL,          // comment
  NULL,          // warning
  NULL,          // error
  NULL,          // fatalError
  NULL,          // getParameterEntity
  cdata,         // cdataBlock: handled like ordinary text
  NULL,          // externalSubset
  0,             // initialized
  0,             // _private
  0,             // startElementNs
  0,             // endElementNs
  NULL           // serror
};
241
242 //
243 // Parse given (assumed to be) HTML text and return the title
244 //
245
parseHtml(const std::string & html,std::string & title)246 static void parseHtml(const std::string &html,
247 std::string &title)
248 {
249 htmlParserCtxtPtr ctxt;
250 Context context;
251
252 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
253 XML_CHAR_ENCODING_NONE);
254
255 htmlParseChunk(ctxt, html.c_str(), (int)html.size(), 0);
256 htmlParseChunk(ctxt, "", 0, 1);
257
258 htmlFreeParserCtxt(ctxt);
259
260 title = context.title;
261 }
262
main(int argc,char * argv[])263 int main(int argc, char *argv[])
264 {
265 CURL *conn = NULL;
266 CURLcode code;
267 std::string title;
268
269 // Ensure one argument is given
270
271 if(argc != 2) {
272 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
273 exit(EXIT_FAILURE);
274 }
275
276 curl_global_init(CURL_GLOBAL_DEFAULT);
277
278 // Initialize CURL connection
279
280 if(!init(conn, argv[1])) {
281 fprintf(stderr, "Connection initialization failed\n");
282 exit(EXIT_FAILURE);
283 }
284
285 // Retrieve content for the URL
286
287 code = curl_easy_perform(conn);
288 curl_easy_cleanup(conn);
289
290 if(code != CURLE_OK) {
291 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
292 exit(EXIT_FAILURE);
293 }
294
295 // Parse the (assumed) HTML code
296 parseHtml(buffer, title);
297
298 // Display the extracted title
299 printf("Title: %s\n", title.c_str());
300
301 return EXIT_SUCCESS;
302 }
303