最近接触了一下 Java 爬虫。文本信息爬完之后,想看看图片该怎么爬,于是研究了一下。本案例爬取的是 CSDN "今日推荐" 中的图片。
使用 Jsoup + HttpClient 来实现爬虫。
所需的 pom 依赖:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- File download: provides FileUtils, used by the crawler to write the image stream to disk -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
爬取代码如下,定义和思路都写在注释里了:
package com.xy.test;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Test2 {
public static void main(String[] args) throws ClientProtocolException, IOException {
// 创建httpclient实例
CloseableHttpClient httpclient = HttpClients.createDefault();
// 创建httpget实例
// 执行get请求
CloseableHttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();
// 获取返回实体
String content = EntityUtils.toString(entity, "utf-8");
// 解析网页 得到文档对象
Document doc = Jsoup.parse(content);
// 获取指定的 <img />
Elements elements = doc.select(".img_box img");
for (int i = 0; i < 15; i++) {
Element element = elements.get(i);
// 获取 <img /> 的 src
String url = element.attr("src");
// 再发请求最简单了,并由于该链接是没有 https:开头的,得人工补全 ✔
HttpGet PicturehttpGet = new HttpGet(url);
System.out.println("asdasd:"+PicturehttpGet);
CloseableHttpResponse pictureResponse = httpclient.execute(PicturehttpGet);
HttpEntity pictureEntity = pictureResponse.getEntity();
InputStream inputStream = pictureEntity.getContent();
// 使用 common-io 下载图片到本地,注意图片名不能重复 ✔
FileUtils.copyToFile(inputStream, new File("D://img//" + i + ".jpg"));
pictureResponse.close(); // pictureResponse关闭
response.close(); // response关闭
httpclient.close(); // httpClient关闭
效果如下:
