OkHttpClient和Jsoup进行网页爬取

发布时间:2020-07-21 11:21:38 作者:lifeneedyou
来源:网络 阅读:614
通过HTTP请求获取JSON格式的数据,然后将JSON数据转换为Java对象并返回给调用方。HTTP请求采用OkHttp库,JSON转换采用fastjson库。

<!--
  Maven build descriptor for the crawler example.
  Produces a jar with coordinates com.ok.http.client:okhttp:0.0.1-SNAPSHOT and
  declares the libraries used for downloading pages, parsing HTML, JSON handling
  and MongoDB storage.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Project coordinates. -->
<groupId>com.ok.http.client</groupId>
<artifactId>okhttp</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<name>okhttp</name>
<url>http://maven.apache.org</url>

<properties>
<!-- Source files are read as UTF-8 (the Java sources contain Chinese string literals). -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<!-- jsoup: HTML parsing (org.jsoup.* is imported by the analysis code). -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>

<!-- BSON library.
     NOTE(review): version 3.6.4 is on a different release line than
     mongo-java-driver 3.0.0 declared below; confirm the two are meant to be mixed. -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>bson</artifactId>
    <version>3.6.4</version>
</dependency>

<!-- NOTE(review): no usage of this artifact is visible in the accompanying sources; verify it is needed. -->
<dependency>
    <groupId>com.google.cloud.trace.instrumentation.jdbc</groupId>
    <artifactId>driver</artifactId>
    <version>0.1.1</version>
    <type>pom</type>
</dependency>
<!-- NOTE(review): no usage of this artifact is visible in the accompanying sources; verify it is needed. -->
<dependency>
    <groupId>ch.qos.logback.contrib</groupId>
    <artifactId>logback-mongodb-access</artifactId>
    <version>0.1.5</version>
</dependency>
<!-- MongoDB database connection driver (MongoClient / MongoDatabase / MongoCollection are used by StoreData). -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>mongo-java-driver</artifactId>
    <version>3.0.0</version>
</dependency>
<!-- okio: presumably declared as the I/O companion library of OkHttp; verify against the OkHttp version below. -->
<dependency>
    <groupId>com.squareup.okio</groupId>
    <artifactId>okio</artifactId>
    <version>1.11.0</version>

</dependency>
<!-- OkHttp: HTTP client (okhttp3.OkHttpClient / Request / Response are imported by the download code). -->
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>3.6.0</version>
</dependency>

<!-- fastjson: JSON conversion library named in the article introduction. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>

<!-- JUnit, available on the test classpath only. -->
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>3.8.1</version>
  <scope>test</scope>
</dependency>

</dependencies>
</project>

package com.ok.http.client.okhttp;

import java.util.List;
import java.util.Map;

/**
 * Command-line entry point of the crawler example.
 *
 * <p>Runs three steps in sequence: download one page, analyse the downloaded
 * HTML into a list of records, and hand every record to {@code StoreData}.
 */
public class ExecuteTask {

    /**
     * Executes the download, analysis and storage steps once for a fixed URL,
     * printing a progress message after each step.
     *
     * @param args command-line arguments; not read
     * @throws Exception if any step fails with an exception that is not handled further down
     */
    public static void main(String[] args) throws Exception {
        CrawlData downloader = new CrawlData();
        String targetUrl = "http://top.chinaz.com/all/index.html";

        // Step 1: fetch the raw HTML of the page.
        System.out.println("开始爬取,请等待.");
        String pageHtml = downloader.downloadHtml(targetUrl);
        System.out.println("爬取成功");

        // Step 2: turn the HTML into one map per record.
        List<Map<String, Object>> records = Analysis.analysisData(pageHtml);
        System.out.println("数据解析成功");

        // Step 3: store the records one at a time, reporting after each call.
        for (Map<String, Object> record : records) {
            StoreData.adds(record);
            System.out.println("存储成功");
        }
    }
}

package com.ok.http.client.okhttp;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

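// NOTE: the source article is truncated here; the CrawlData class that uses the okhttp3 imports above is missing.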

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;

public class Analysis {
// NOTE: the source article is truncated here; the body of the Analysis class is missing.

/**
 * Stores crawled records in MongoDB: database {@code sit_rank}, collection
 * {@code information}, on {@code localhost:27017}.
 */
public class StoreData {

    /**
     * Key whose value becomes the document {@code _id}.
     * Assumes the analysis step stores the site name under this key (TODO confirm
     * against Analysis.analysisData, which is not visible here).
     */
    private static final String ID_SOURCE_KEY = "siteName";

    /**
     * Inserts one record as a single document.
     *
     * <p>The document is built directly from {@code dataMap}, so every entry of the
     * map is stored under its own key. Failures are reported on {@code System.err}
     * and not rethrown, so the caller's loop continues with the next record.
     * The client connection is closed whether or not the insert succeeds.
     *
     * @param dataMap field name to value for one record; {@code null} or empty
     *                input is reported and skipped
     */
    public static void adds(Map<String, Object> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            System.err.println("StoreData.adds: nothing to store (dataMap is null or empty)");
            return;
        }

        MongoClient mongoClient = null;
        try {
            // Connect to the MongoDB service.
            mongoClient = new MongoClient("localhost", 27017);
            // Select the database.
            MongoDatabase mongoDatabase = mongoClient.getDatabase("sit_rank");
            System.out.println(mongoDatabase);
            System.out.println("成功连接数据库");

            MongoCollection<Document> collection = mongoDatabase.getCollection("information");
            System.out.println(collection);
            System.out.println("集合 information 选择成功");

            // Insert the record as a single document.
            collection.insertOne(toDocument(dataMap));
            System.out.println("文档插入成功");
        } catch (Exception e) {
            // Best-effort storage: report the failure and let the caller carry on.
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        } finally {
            // Close the connection on the failure path as well, so it is not leaked.
            if (mongoClient != null) {
                mongoClient.close();
                System.out.println("MongoDB连接已关闭");
            }
        }
    }

    /**
     * Builds the document for one record: a copy of all map entries, plus an
     * {@code _id} taken from the {@code siteName} entry when that entry is present.
     * Without it no {@code _id} is set here and id generation is left to the driver.
     */
    private static Document toDocument(Map<String, Object> dataMap) {
        // Map.toString() is not JSON, so copy the entries instead of re-parsing text.
        Document document = new Document(dataMap);
        Object siteName = dataMap.get(ID_SOURCE_KEY);
        if (siteName != null) {
            document.put("_id", siteName);
        }
        return document;
    }

}

推荐阅读:
  1. HtmlUnit、httpclient、jsoup爬取网页信息并解析
  2. SpringBoot中如何实现使用Jsoup爬取网站数据

免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。

okhttpclient jsoup client

上一篇:纯CSS如何实现热气球的效果

下一篇:用vue如何实现手机触屏滑动功能的代码示例

相关阅读

您好,登录后才能下订单哦!

密码登录
登录注册
其他方式登录
点击 登录注册 即表示同意《亿速云用户服务条款》