1 /***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 * SPDX-License-Identifier: curl
22 *
23 ***************************************************************************/
24 /* <DESC>
25 * Get a web page, extract the title with libxml.
26 * </DESC>
27
28 Written by Lars Nilsson
29
30 GNU C++ compile command line suggestion (edit paths accordingly):
31
32 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
33 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
34 */
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <string>
39 #include <curl/curl.h>
40 #include <libxml/HTMLparser.h>
41
42 //
43 // Case-insensitive string comparison
44 //
45
// COMPARE(a, b) yields true when the two NUL-terminated C strings are equal
// once letter case is ignored. MSVC spells the comparison _stricmp(); every
// other toolchain is expected to provide strcasecmp().
#if defined(_MSC_VER)
#  define COMPARE(a, b) (_stricmp((a), (b)) == 0)
#else
#  define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
51
52 //
53 // libxml callback context structure
54 //
55
// State handed to every SAX callback (as the opaque user-data pointer) while
// a single document is being parsed.
struct Context
{
  // True while character data should be appended to `title`; switched on by
  // the start-element callback and off by the end-element callback.
  bool addTitle;

  // Title text collected so far.
  std::string title;

  // Start outside any title element with an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
63
//
// libcurl variables for error strings and returned data
//

// Registered with CURLOPT_ERRORBUFFER in init(); the failure paths in init()
// and main() print its contents.
static char errorBuffer[CURL_ERROR_SIZE];
// Registered with CURLOPT_WRITEDATA in init(), so writer() appends the
// received data here; main() hands it to parseHtml() afterwards.
static std::string buffer;
69
70 //
71 // libcurl write callback function
72 //
73
// libcurl write callback: appends the received bytes to *writerData.
//
//   data        pointer to the delivered bytes (not NUL-terminated)
//   size, nmemb element size and element count; the payload is size * nmemb
//   writerData  destination string (the CURLOPT_WRITEDATA pointer)
//
// Returns the number of bytes consumed. Returning anything other than
// size * nmemb tells libcurl the write failed, which is what the 0 return
// for a missing destination does. The return type is size_t, as the
// CURLOPT_WRITEFUNCTION contract requires; an int return would truncate
// large byte counts and mismatch the callback prototype.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if(writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
84
85 //
86 // libcurl connection initialization
87 //
88
init(CURL * & conn,char * url)89 static bool init(CURL *&conn, char *url)
90 {
91 CURLcode code;
92
93 conn = curl_easy_init();
94
95 if(conn == NULL) {
96 fprintf(stderr, "Failed to create CURL connection\n");
97 exit(EXIT_FAILURE);
98 }
99
100 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
101 if(code != CURLE_OK) {
102 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
103 return false;
104 }
105
106 code = curl_easy_setopt(conn, CURLOPT_URL, url);
107 if(code != CURLE_OK) {
108 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
109 return false;
110 }
111
112 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
113 if(code != CURLE_OK) {
114 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
115 return false;
116 }
117
118 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
119 if(code != CURLE_OK) {
120 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
121 return false;
122 }
123
124 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
125 if(code != CURLE_OK) {
126 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
127 return false;
128 }
129
130 return true;
131 }
132
133 //
134 // libxml start element callback function
135 //
136
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)137 static void StartElement(void *voidContext,
138 const xmlChar *name,
139 const xmlChar **attributes)
140 {
141 Context *context = static_cast<Context *>(voidContext);
142
143 if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) {
144 context->title = "";
145 context->addTitle = true;
146 }
147 (void) attributes;
148 }
149
150 //
151 // libxml end element callback function
152 //
153
EndElement(void * voidContext,const xmlChar * name)154 static void EndElement(void *voidContext,
155 const xmlChar *name)
156 {
157 Context *context = static_cast<Context *>(voidContext);
158
159 if(COMPARE(reinterpret_cast<char *>(name), "TITLE"))
160 context->addTitle = false;
161 }
162
163 //
164 // Text handling helper function
165 //
166
handleCharacters(Context * context,const xmlChar * chars,int length)167 static void handleCharacters(Context *context,
168 const xmlChar *chars,
169 int length)
170 {
171 if(context->addTitle)
172 context->title.append(reinterpret_cast<char *>(chars), length);
173 }
174
175 //
176 // libxml PCDATA callback function
177 //
178
Characters(void * voidContext,const xmlChar * chars,int length)179 static void Characters(void *voidContext,
180 const xmlChar *chars,
181 int length)
182 {
183 Context *context = static_cast<Context *>(voidContext);
184
185 handleCharacters(context, chars, length);
186 }
187
188 //
189 // libxml CDATA callback function
190 //
191
cdata(void * voidContext,const xmlChar * chars,int length)192 static void cdata(void *voidContext,
193 const xmlChar *chars,
194 int length)
195 {
196 Context *context = static_cast<Context *>(voidContext);
197
198 handleCharacters(context, chars, length);
199 }
200
201 //
202 // libxml SAX callback structure
203 //
204
// SAX callback table passed to the push parser in parseHtml(). Only four
// callbacks are installed; every other slot is NULL.
//
// The initializer is positional, so the order of the entries is significant.
// NOTE(review): the slot names in the trailing comments follow the member
// order of libxml2's xmlSAXHandler structure as documented upstream -- confirm
// against the installed <libxml/parser.h>. Members after the last entry listed
// here are left to zero-initialization.
static htmlSAXHandler saxHandler =
{
  NULL,          // internalSubset
  NULL,          // isStandalone
  NULL,          // hasInternalSubset
  NULL,          // hasExternalSubset
  NULL,          // resolveEntity
  NULL,          // getEntity
  NULL,          // entityDecl
  NULL,          // notationDecl
  NULL,          // attributeDecl
  NULL,          // elementDecl
  NULL,          // unparsedEntityDecl
  NULL,          // setDocumentLocator
  NULL,          // startDocument
  NULL,          // endDocument
  StartElement,  // startElement
  EndElement,    // endElement
  NULL,          // reference
  Characters,    // characters
  NULL,          // ignorableWhitespace
  NULL,          // processingInstruction
  NULL,          // comment
  NULL,          // warning
  NULL,          // error
  NULL,          // fatalError
  NULL,          // getParameterEntity
  cdata,         // cdataBlock
  NULL           // externalSubset
};
235
236 //
237 // Parse given (assumed to be) HTML text and return the title
238 //
239
parseHtml(const std::string & html,std::string & title)240 static void parseHtml(const std::string &html,
241 std::string &title)
242 {
243 htmlParserCtxtPtr ctxt;
244 Context context;
245
246 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
247 XML_CHAR_ENCODING_NONE);
248
249 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
250 htmlParseChunk(ctxt, "", 0, 1);
251
252 htmlFreeParserCtxt(ctxt);
253
254 title = context.title;
255 }
256
main(int argc,char * argv[])257 int main(int argc, char *argv[])
258 {
259 CURL *conn = NULL;
260 CURLcode code;
261 std::string title;
262
263 // Ensure one argument is given
264
265 if(argc != 2) {
266 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
267 exit(EXIT_FAILURE);
268 }
269
270 curl_global_init(CURL_GLOBAL_DEFAULT);
271
272 // Initialize CURL connection
273
274 if(!init(conn, argv[1])) {
275 fprintf(stderr, "Connection initialization failed\n");
276 exit(EXIT_FAILURE);
277 }
278
279 // Retrieve content for the URL
280
281 code = curl_easy_perform(conn);
282 curl_easy_cleanup(conn);
283
284 if(code != CURLE_OK) {
285 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
286 exit(EXIT_FAILURE);
287 }
288
289 // Parse the (assumed) HTML code
290 parseHtml(buffer, title);
291
292 // Display the extracted title
293 printf("Title: %s\n", title.c_str());
294
295 return EXIT_SUCCESS;
296 }
297