[Java][华为云Java编程创造营][学习笔记][第三阶段][04_Java爬虫实战][二]
【摘要】 1.4,综合实战1:Jsoup爬取博客。综合实战1:Jsoup爬取博客(1):以CSDN认证的企业博客,华为开发者论坛为例,需要实现的需求:获取博主的博客文章数量;获取博主的博客文章列表的页数;获取博主的每篇博客文章的信息(网址,创建时间,标题);获取每篇博客文章中的图片链接;将所有获取的数据存储到数据库中。综合实战1:Jsoup爬取博客(2):步骤:1,博主博客网页分析;2,项目搭建及环境准备(JDK...
1.4,综合实战1:Jsoup爬取博客
综合实战1:Jsoup爬取博客(1)
- 以CSDN认证的企业博客,华为开发者论坛为例,需要实现的需求:
- 获取博主的博客文章数量
- 获取博主的博客文章列表的页数
- 获取博主的每篇博客文章的信息(网址,创建时间,标题)
- 获取每篇博客文章中的图片链接
- 将所有获取的数据存储到数据库中
综合实战1:Jsoup爬取博客(2)
- 步骤:
- 1,博主博客网页分析
- 2,项目搭建及环境准备(JDK1.8,IntelliJ IDEA,MySQL5.7.32)
- 3,编写数据获取,数据处理,数据存储工具类
- 4,主函数完成业务逻辑
综合实战1:Jsoup爬取博客(3)
代码实现
- 1,新建maven工程,新建包com.huawei,新建数据库配置文件db.properties
- 2,在pom.xml中导入Jsoup和MySQL依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>tempjava4</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <!-- Pin the source encoding: the sources contain non-ASCII string literals,
             so the build must not depend on the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Jsoup: HTML fetching and parsing -->
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>
        <!-- MySQL JDBC driver (Connector/J) -->
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.25</version>
        </dependency>
    </dependencies>
</project>
- 3,在db.properties中配置数据库信息
# MySQL connection settings, loaded by JDBCUtils from the classpath.
# Driver class name for Connector/J 8.x; the legacy com.mysql.jdbc.Driver name is deprecated.
className=com.mysql.cj.jdbc.Driver
url=jdbc:mysql://localhost:3306/db_blog?characterEncoding=utf8
# Replace the two placeholder values below with the real database credentials.
username=填写数据库登录名
password=填写数据库密码
- 4,用面向对象的角度来分析需求,得出实体类有文章,博主,图片
- 博主对文章,一对多;文章对图片,一对多
- 5,数据库建表
- 如果建立数据库时没有选择UTF8格式,那么存储中文的时候就会报错
-
-- One row per scraped article; b_id stores the id of the owning blogger.
-- The charset is set on the table itself so that non-ASCII titles can be stored
-- even when the database was created with a non-UTF8 default charset.
create table t_article
(
    id          int auto_increment
        primary key,
    b_id        varchar(50)  null,
    url         varchar(300) null,
    create_time varchar(50)  null,
    title       varchar(300) null
) default charset = utf8mb4;
-- One row per blogger: the blogger id plus the article and list-page counts.
-- The charset is set on the table itself so it does not depend on the database default.
create table t_blogger
(
    id            varchar(50) not null
        primary key,
    article_count int         null,
    page_count    int         null
) default charset = utf8mb4;
-- One row per picture link found in an article; a_id stores the id of the owning article.
-- The charset is set on the table itself so it does not depend on the database default.
create table t_picture
(
    id   int auto_increment
        primary key,
    a_id int          null,
    url  varchar(300) null
) default charset = utf8mb4;
- 6,实体类Article
package com.huawei.entity;
/*
* 文章实体类
* */
public class Article
{
//文章id
private Integer id;
//博主id
private String bId;
//文章的url
private String url;
//文章的创建时间
private String createTime;
//文章的标题
private String title;
public Article()
{
}
@Override
public String toString()
{
return "Article{" +
"id=" + id +
", bId='" + bId + '\'' +
", url='" + url + '\'' +
", createTime='" + createTime + '\'' +
", title='" + title + '\'' +
'}';
}
public Integer getId()
{
return id;
}
public void setId(Integer id)
{
this.id = id;
}
public String getbId()
{
return bId;
}
public void setbId(String bId)
{
this.bId = bId;
}
public String getUrl()
{
return url;
}
public void setUrl(String url)
{
this.url = url;
}
public String getCreateTime()
{
return createTime;
}
public void setCreateTime(String createTime)
{
this.createTime = createTime;
}
public String getTitle()
{
return title;
}
public void setTitle(String title)
{
this.title = title;
}
public Article(Integer id, String bId, String url, String createTime, String title)
{
this.id = id;
this.bId = bId;
this.url = url;
this.createTime = createTime;
this.title = title;
}
}
- 7,实体类Blogger
package com.huawei.entity;
/**
 * Blogger entity: the blogger id plus the article count and the list-page count.
 */
public class Blogger {

    /** Blogger id. */
    private String id;

    /** Number of articles the blogger has published. */
    private Integer articleCount;

    /** Total number of article list pages. */
    private Integer pageCount;

    /** Creates a blogger with every field unset. */
    public Blogger() {
    }

    /** Creates a blogger with every field supplied by the caller. */
    public Blogger(String id, Integer articleCount, Integer pageCount) {
        this.id = id;
        this.articleCount = articleCount;
        this.pageCount = pageCount;
    }

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Integer getArticleCount() {
        return this.articleCount;
    }

    public void setArticleCount(Integer articleCount) {
        this.articleCount = articleCount;
    }

    public Integer getPageCount() {
        return this.pageCount;
    }

    public void setPageCount(Integer pageCount) {
        this.pageCount = pageCount;
    }

    /** Renders all fields; the id is wrapped in single quotes. */
    @Override
    public String toString() {
        return String.format(
                "Blogger{id='%s', articleCount=%s, pageCount=%s}",
                id, articleCount, pageCount);
    }
}
- 8,实体类Picture
package com.huawei.entity;
/**
 * Picture entity: an image link together with the id of the article it was found in.
 */
public class Picture {

    /** Picture id. */
    private Integer id;

    /** Id of the article that contains the picture. */
    private Integer aId;

    /** URL of the picture. */
    private String url;

    /** Creates a picture with every field unset. */
    public Picture() {
    }

    /** Creates a picture with every field supplied by the caller. */
    public Picture(Integer id, Integer aId, String url) {
        this.id = id;
        this.aId = aId;
        this.url = url;
    }

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public Integer getaId() {
        return this.aId;
    }

    public void setaId(Integer aId) {
        this.aId = aId;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Renders all fields; the url is wrapped in single quotes. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Picture{");
        text.append("id=").append(id);
        text.append(", aId=").append(aId);
        text.append(", url='").append(url).append('\'');
        return text.append('}').toString();
    }
}
- 9,工具类BlogUtil
package com.huawei.util;
import com.huawei.entity.Article;
import com.huawei.entity.Picture;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Scraping helpers that read blog data (article count, article list, article pictures)
 * from blog pages with Jsoup.
 */
public class BlogUtil
{
    /*
     * Article-type markers that may precede the title inside the list link text.
     * "原创" is the marker the original code removed.
     * NOTE(review): "转载" and "翻译" are assumed to be the other markers the site uses - confirm
     * against the live page.
     */
    private static final String[] TITLE_MARKERS = {"原创", "转载", "翻译"};

    /**
     * Reads the number of articles from the blog home page.
     *
     * @param blogHome URL of the blog home page
     * @return the number parsed from the first {@code span.count} element
     * @throws IllegalStateException if the page cannot be fetched or has no {@code span.count} element
     * @throws NumberFormatException if the element text is not a plain integer
     */
    public static int getArticleCount(String blogHome)
    {
        Document doc = fetchDocument(blogHome);
        Element countElement = doc.select("span.count").first();
        if (countElement == null)
        {
            throw new IllegalStateException("No span.count element found on " + blogHome);
        }
        return Integer.parseInt(countElement.text());
    }

    /**
     * Computes how many list pages are needed for the given number of articles.
     *
     * @param articleCount total number of articles
     * @return {@code articleCount / Constants.PAGE_SIZE}, rounded up
     */
    public static int getArticlePageCount(int articleCount)
    {
        // PAGE_SIZE is a double, so the division keeps its fraction before rounding up.
        return (int) Math.ceil(articleCount / Constants.PAGE_SIZE);
    }

    /**
     * Collects the articles listed on the pages {@code pageBegin..pageEnd} (both inclusive).
     *
     * @param pageBegin first list page to read
     * @param pageEnd   last list page to read; a value below {@code pageBegin} yields an empty list
     * @param blogHome  URL of the blog home page, used as prefix of every list page URL
     * @param bId       blogger id stored on every returned article
     * @return the articles in page order; entries without a title link are skipped
     * @throws IllegalStateException if a list page cannot be fetched
     */
    public static List<Article> getArticleList(int pageBegin, int pageEnd, String blogHome, String bId)
    {
        // Clamp to zero so that a reversed range cannot produce a negative initial capacity.
        int pages = Math.max(0, pageEnd - pageBegin + 1);
        List<Article> articles = new ArrayList<>(pages * ((int) Constants.PAGE_SIZE));
        for (int page = pageBegin; page <= pageEnd; page++)
        {
            Document doc = fetchDocument(blogHome + Constants.ARTICLE_LIST_URI + page);
            Element articleList = doc.select("div.article-list").first();
            if (articleList == null)
            {
                // A page without the list container contributes no articles.
                continue;
            }
            for (Element item : articleList.select("div.article-item-box.csdn-tracking-statistics"))
            {
                Element link = item.select("h4>a").first();
                if (link == null)
                {
                    continue;
                }
                Element date = item.select("span.date").first();
                Article article = new Article();
                article.setUrl(link.attr("href"));
                article.setTitle(stripTitleMarker(link.text()));
                article.setCreateTime(date == null ? null : date.text());
                article.setbId(bId);
                articles.add(article);
            }
        }
        return articles;
    }

    /**
     * Collects the {@code src} of every {@code img} inside the article body.
     *
     * @param articleUrl URL of the article page
     * @return one picture per image element; empty if the page has no {@code article.baidu_pl} element
     * @throws IllegalStateException if the page cannot be fetched
     */
    public static List<Picture> getArticlePictures(String articleUrl)
    {
        List<Picture> pictures = new ArrayList<>();
        Document doc = fetchDocument(articleUrl);
        Element article = doc.select("article.baidu_pl").first();
        if (article == null)
        {
            return pictures;
        }
        for (Element image : article.select("img"))
        {
            Picture picture = new Picture();
            picture.setUrl(image.attr("src"));
            pictures.add(picture);
        }
        return pictures;
    }

    /* Downloads and parses a page; a failed download is reported with the URL and the original cause. */
    private static Document fetchDocument(String url)
    {
        try
        {
            return Jsoup.connect(url).get();
        } catch (IOException e)
        {
            throw new IllegalStateException("Failed to fetch page: " + url, e);
        }
    }

    /* Removes a leading "<marker> " prefix from the link text; text without a marker is returned unchanged. */
    private static String stripTitleMarker(String linkText)
    {
        for (String marker : TITLE_MARKERS)
        {
            String prefix = marker + " ";
            if (linkText.startsWith(prefix))
            {
                return linkText.substring(prefix.length());
            }
        }
        return linkText;
    }
}
- 10,工具类Constants
package com.huawei.util;
/**
 * Shared constants used by the crawler.
 */
public class Constants {

    /**
     * Number of articles shown on one list page.
     * Declared as a double so that dividing an article count by it keeps the fraction.
     */
    public static final double PAGE_SIZE = 40.0;

    /**
     * Home page of the crawled blog (Huawei developer forum): https://hwdevelopers.blog.csdn.net/
     */
    public static final String BLOG_HOME = "https://hwdevelopers.blog.csdn.net";

    /**
     * Path appended to BLOG_HOME to build an article list URL; the page number follows it.
     * Note: if BLOG_HOME ends with a slash, this value must not start with one.
     */
    public static final String ARTICLE_LIST_URI = "/article/list/";
}
- 11,工具类JDBCUtils
package com.huawei.util;
import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.Properties;
import javax.sql.rowset.CachedRowSet;
import javax.sql.rowset.RowSetProvider;
/**
 * JDBC helper: loads the connection settings from db.properties once, and offers small
 * wrappers for opening connections, running parameterized statements and closing resources.
 */
public class JDBCUtils
{
    // Connection settings, filled once by the static initializer below.
    private static String className;
    private static String url;
    private static String username;
    private static String password;

    static
    {
        Properties properties = new Properties();
        // try-with-resources closes the stream; a null resource is simply not closed.
        try (InputStream in = JDBCUtils.class.getClassLoader().getResourceAsStream("db.properties"))
        {
            if (in == null)
            {
                // Report the missing file by name instead of failing with a bare NullPointerException.
                throw new IllegalStateException("db.properties not found on the classpath");
            }
            properties.load(in);
        } catch (IOException e)
        {
            throw new IllegalStateException("Failed to read db.properties", e);
        }
        className = properties.getProperty("className");
        url = properties.getProperty("url");
        username = properties.getProperty("username");
        password = properties.getProperty("password");

        // Loading the driver stays best effort: a failure is reported and class initialization continues.
        if (className != null && !className.trim().isEmpty())
        {
            try
            {
                Class.forName(className.trim());
            } catch (ClassNotFoundException e)
            {
                e.printStackTrace();
            }
        }
    }

    /**
     * Opens a new database connection with the configured settings.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws RuntimeException if the connection cannot be opened (the SQLException is attached as cause)
     */
    public static Connection getConn()
    {
        try
        {
            return DriverManager.getConnection(url, username, password);
        } catch (SQLException e)
        {
            throw new RuntimeException("数据库连接错误!", e);
        }
    }

    /**
     * Closes the result set, the statement and the connection, in that order.
     * Null arguments are skipped; a failure to close one resource is printed and does not
     * prevent the remaining resources from being closed.
     */
    public static void closeAll(Connection conn, Statement stat, ResultSet rs)
    {
        if (rs != null)
        {
            try
            {
                rs.close();
            } catch (SQLException e)
            {
                e.printStackTrace();
            }
        }
        if (stat != null)
        {
            try
            {
                stat.close();
            } catch (SQLException e)
            {
                e.printStackTrace();
            }
        }
        if (conn != null)
        {
            try
            {
                conn.close();
            } catch (SQLException e)
            {
                e.printStackTrace();
            }
        }
    }

    /**
     * Overload for statements that produced no result set: closes the statement, then the connection.
     */
    public static void closeAll(Connection conn, PreparedStatement pstmt)
    {
        closeAll(conn, pstmt, null);
    }

    /**
     * Runs a parameterized query and returns its rows.
     * <p>
     * The rows are copied into a disconnected CachedRowSet before the connection is closed,
     * so the returned ResultSet stays readable after this method returns.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders, in order
     * @return the query result, or {@code null} if an SQL error occurred (the error is printed)
     */
    public static ResultSet executeQuery(String preparedSql, Object... param)
    {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet res = null;
        try
        {
            conn = getConn();
            pstmt = conn.prepareStatement(preparedSql);
            bindParams(pstmt, param);
            res = pstmt.executeQuery();
            CachedRowSet cached = RowSetProvider.newFactory().createCachedRowSet();
            cached.populate(res);
            return cached;
        } catch (SQLException e)
        {
            e.printStackTrace();
            return null;
        } finally
        {
            closeAll(conn, pstmt, res);
        }
    }

    /**
     * Runs a parameterized insert/update/delete statement.
     *
     * @param preparedSql SQL text with {@code ?} placeholders
     * @param param       values bound to the placeholders, in order
     * @return the first generated key of the statement, or 0 if no key was generated
     *         or an SQL error occurred (the error is printed)
     */
    public static int executeUpdate(String preparedSql, Object... param)
    {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet keys = null;
        int key = 0;
        try
        {
            conn = getConn();
            // Generated keys have to be requested explicitly when the statement is prepared
            // (the original author noted this for MySQL driver 5.1.7).
            pstmt = conn.prepareStatement(preparedSql, Statement.RETURN_GENERATED_KEYS);
            bindParams(pstmt, param);
            pstmt.executeUpdate();
            keys = pstmt.getGeneratedKeys();
            if (keys.next())
            {
                key = keys.getInt(1);
            }
        } catch (SQLException e)
        {
            e.printStackTrace();
        } finally
        {
            // Also closes the generated-keys result set.
            closeAll(conn, pstmt, keys);
        }
        return key;
    }

    /* Binds the values to the statement placeholders; JDBC parameter indexes start at 1. */
    private static void bindParams(PreparedStatement pstmt, Object[] param) throws SQLException
    {
        if (param == null)
        {
            return;
        }
        for (int i = 0; i < param.length; i++)
        {
            pstmt.setObject(i + 1, param[i]);
        }
    }
}
- 12,工具类StringUtil
package com.huawei.util;
/**
 * String helpers.
 */
public class StringUtil
{
    /**
     * Extracts the blogger id from a blog URL: the first label of the host name,
     * e.g. {@code "https://hwdevelopers.blog.csdn.net"} yields {@code "hwdevelopers"}.
     * <p>
     * Any scheme (or none) is accepted. Only the host part is inspected, so a dot in the
     * path is never mistaken for the end of the id.
     *
     * @param url blog URL; may be null or blank
     * @return the first host label, or an empty string if {@code url} is null, blank,
     *         or its host contains no dot
     */
    public static String subId(String url)
    {
        if (url == null)
        {
            return "";
        }
        String address = url.trim();
        if (address.isEmpty())
        {
            return "";
        }
        // Skip the scheme if there is one.
        int schemeEnd = address.indexOf("://");
        int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
        // The host ends at the first slash after the scheme, or at the end of the text.
        int hostEnd = address.indexOf('/', hostStart);
        if (hostEnd < 0)
        {
            hostEnd = address.length();
        }
        int firstDot = address.indexOf('.', hostStart);
        if (firstDot < 0 || firstDot > hostEnd)
        {
            return "";
        }
        return address.substring(hostStart, firstDot);
    }
}
- 13,启动类Main
package com;
import com.huawei.entity.Article;
import com.huawei.entity.Blogger;
import com.huawei.entity.Picture;
import com.huawei.util.BlogUtil;
import com.huawei.util.Constants;
import com.huawei.util.JDBCUtils;
import com.huawei.util.StringUtil;
import java.util.List;
/**
 * Entry point: reads the blogger summary, the article list and the pictures of every article,
 * prints each record and stores it in the database.
 */
public class Main
{
    public static String insertBlogger = "insert into t_blogger(id,article_count,page_count) value(?,?,?)";
    public static String insertArticle = "insert into t_article(b_id,url,create_time,title) value(?,?,?,?)";
    public static String insertPicture = "insert into t_picture(a_id,url) value(?,?)";

    /* Upper bound on the number of list pages read per run (the original code read pages 1 to 2). */
    private static final int MAX_PAGES = 2;

    public static void main(String[] args)
    {
        int articleCount = BlogUtil.getArticleCount(Constants.BLOG_HOME);
        int pageCount = BlogUtil.getArticlePageCount(articleCount);
        // The blogger id is derived once and reused for the blogger row and for every article.
        String bloggerId = StringUtil.subId(Constants.BLOG_HOME);

        // Blogger
        Blogger blogger = new Blogger(bloggerId, articleCount, pageCount);
        System.out.println(blogger);
        JDBCUtils.executeUpdate(insertBlogger, blogger.getId(), blogger.getArticleCount(), blogger.getPageCount());

        // Article list: never request a page beyond the last existing one.
        int lastPage = Math.min(MAX_PAGES, pageCount);
        List<Article> articleList = BlogUtil.getArticleList(1, lastPage, Constants.BLOG_HOME, bloggerId);
        for (Article article : articleList)
        {
            System.out.println(article);
            final int aid = JDBCUtils.executeUpdate(insertArticle, article.getbId(), article.getUrl(), article.getCreateTime(), article.getTitle());
            if (aid == 0)
            {
                // executeUpdate returns 0 when no key was generated, i.e. the article row was not stored;
                // its pictures would otherwise be saved with a_id = 0.
                System.err.println("Article was not stored, skipping its pictures: " + article.getUrl());
                continue;
            }
            // Pictures of the article
            List<Picture> pictures = BlogUtil.getArticlePictures(article.getUrl());
            for (Picture picture : pictures)
            {
                System.out.println(picture);
                JDBCUtils.executeUpdate(insertPicture, aid, picture.getUrl());
            }
        }
    }
}
结果显示
【版权声明】本文为华为云社区用户原创内容,转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息, 否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)