在 C 语言和 Python 中实现爬虫的方法有所不同,但基本原理相同。下面分别给出一个简单的 C 语言示例和一个 Python 示例。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
/**
 * libcurl write callback: appends each received chunk to a heap-allocated,
 * NUL-terminated string.
 *
 * libcurl may invoke the write callback several times for a single transfer,
 * so every chunk is appended after the data already stored instead of
 * replacing it.
 *
 * @param contents  Pointer to the received bytes (not NUL-terminated).
 * @param size      Size of one element in bytes.
 * @param nmemb     Number of elements in this chunk.
 * @param userp     Must point to a `char *` that is either NULL or a
 *                  NUL-terminated buffer obtained from malloc/realloc. The
 *                  caller owns the buffer and must free() it.
 * @return The number of bytes consumed (size * nmemb) on success, or 0 on
 *         overflow / allocation failure. On failure the previously
 *         accumulated buffer is left intact so the caller can still free it.
 *
 * Note: the current length is recovered with strlen(), so the accumulated
 * data is treated as text; a body containing embedded NUL bytes would be
 * truncated at the first NUL on the next append.
 */
static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    char **response = (char **)userp;
    const size_t max_size = (size_t)-1;

    /* Reject a chunk whose byte count does not fit in size_t. */
    if (size != 0 && nmemb > max_size / size) {
        return 0;
    }
    size_t realsize = size * nmemb;

    /* Bytes already accumulated by earlier invocations. */
    size_t old_len = (*response != NULL) ? strlen(*response) : 0;

    /* old_len + realsize + 1 must not wrap around. */
    if (realsize >= max_size - old_len) {
        return 0;
    }

    /* Use a temporary so the existing buffer is not lost if realloc fails. */
    char *grown = realloc(*response, old_len + realsize + 1);
    if (grown == NULL) {
        return 0;
    }
    *response = grown;

    if (realsize > 0) {
        memcpy(grown + old_len, contents, realsize);
    }
    grown[old_len + realsize] = '\0';
    return realsize;
}
/**
 * Fetches https://example.com with libcurl and prints the response body.
 *
 * The body is collected by WriteCallback into a heap-allocated string that
 * is owned and freed here.
 *
 * @return EXIT_SUCCESS when the transfer succeeds, EXIT_FAILURE when libcurl
 *         initialisation or the transfer fails.
 */
int main(void) {
    char *response = NULL;
    int exit_code = EXIT_FAILURE;

    CURLcode res = curl_global_init(CURL_GLOBAL_DEFAULT);
    if (res != CURLE_OK) {
        fprintf(stderr, "curl_global_init() failed: %s\n", curl_easy_strerror(res));
        return EXIT_FAILURE;
    }

    CURL *curl = curl_easy_init();
    if (curl == NULL) {
        fprintf(stderr, "curl_easy_init() failed\n");
    } else {
        curl_easy_setopt(curl, CURLOPT_URL, "https://example.com");
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);

        res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
            fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        } else {
            /* response stays NULL when the body is empty (the callback is
             * never invoked); passing NULL to %s is undefined behaviour. */
            printf("Response:\n%s\n", response != NULL ? response : "");
            exit_code = EXIT_SUCCESS;
        }
        curl_easy_cleanup(curl);
    }

    curl_global_cleanup();
    free(response);
    return exit_code;
}
import requests
from bs4 import BeautifulSoup
# Page to download.
url = "https://example.com"

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Raise requests.HTTPError on 4xx/5xx instead of printing an error page as
# if it were the expected content.
response.raise_for_status()

# Parse the HTML with the built-in parser and print it re-indented.
soup = BeautifulSoup(response.text, "html.parser")
print("Response:")
print(soup.prettify())
这两个示例都实现了一个简单的爬虫,从指定的 URL 获取 HTML 内容并打印出来。C 语言示例使用了 libcurl 库,而 Python 示例使用了 requests 库。在 Python 示例中,我们还使用了 BeautifulSoup 库来解析 HTML 内容。