
Crawling Blog Content into a Database

Updated: 2022-03-10 00:28:33

I. Requirements Analysis

Target site: https://www.cnblogs.com/

Analysis:

Data to collect: title, summary, original article URL, publish time

Storage: a relational database (MySQL)

II. Database Design

Fields to store: title, summary, original article URL, publish time

Article table t_bkyarticle: id (primary key), title, summary, detailurl (original article URL), pubtime (publish time), ctime (record creation time)

SQL script:

create database db_data1906;
use db_data1906;
create table t_bkyarticle(
    id int primary key auto_increment,
    title varchar(100),
    summary text,
    detailurl varchar(200),
    pubtime date,
    ctime date
);
III. Implementation

Tech stack: Spring Boot, with MyBatis-Plus for persistence, WebMagic for the crawler, and Swagger for API documentation

1. Create the project

Create a new Spring Boot project.

2. Add the dependency jars (see the sketch below)
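The article does not list the actual dependencies, but the code implies them. A minimal pom.xml sketch, assuming Maven: the artifact IDs are the standard ones for each library used here, while the version numbers are assumptions and should be adjusted to your project.

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- MyBatis-Plus for the entity/DAO/service layers -->
<dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>3.4.2</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <scope>runtime</scope>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <optional>true</optional>
</dependency>
<!-- WebMagic crawler framework -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.5</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.5</version>
</dependency>
<!-- Swagger 2 (springfox) -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.2</version>
</dependency>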

3. Write the code layer by layer

Entity layer

@TableName("t_bkyarticle")
@Data
public class BkyArticle {

@TableId(type = IdType.AUTO)
private Integer id;
private String title;
private String summary;
private String detailurl;
private Date pubtime;
private Date ctime;

}
Persistence layer

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper
public interface BkyArticleDao extends BaseMapper<BkyArticle> {

    // Custom insert; ctime is stamped with the database's now()
    @Insert("insert into t_bkyarticle(title,summary,detailurl,pubtime,ctime) values(#{title},#{summary},#{detailurl},#{pubtime},now())")
    int save(BkyArticle article);
}
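The application's bootstrap class is not shown in the article; here is a minimal sketch (the class name is hypothetical). Because BkyArticleDao carries @Mapper above, no separate @MapperScan is needed:

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SpiderApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}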
Service layer

import com.baomidou.mybatisplus.extension.service.IService;

public interface BkyArticleService extends IService<BkyArticle> {

    boolean saveEntity(BkyArticle article);
}

import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.springframework.stereotype.Service;

@Service
public class BkyArticleServiceImpl extends ServiceImpl<BkyArticleDao, BkyArticle> implements BkyArticleService {

    @Override
    public boolean saveEntity(BkyArticle article) {
        // > 0 means the row was inserted
        return getBaseMapper().save(article) > 0;
    }
}
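To verify the custom insert end to end, a Spring Boot test can call saveEntity directly. This sketch is not part of the original article and assumes JUnit 5 on the classpath:

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.util.Date;
import static org.junit.jupiter.api.Assertions.assertTrue;

@SpringBootTest
class BkyArticleServiceTest {

    @Autowired
    private BkyArticleService bkyArticleService;

    @Test
    void saveEntityWritesOneRow() {
        // Build a throwaway article and check that exactly one row is inserted
        BkyArticle article = new BkyArticle();
        article.setTitle("test title");
        article.setSummary("test summary");
        article.setDetailurl("https://www.cnblogs.com/");
        article.setPubtime(new Date());
        assertTrue(bkyArticleService.saveEntity(article));
    }
}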
4. Write the crawler core

The crawler is built on WebMagic: a custom PageProcessor parses each listing page, and a Pipeline (next section) persists the parsed results.

Custom page processor

import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

@Service
public class BkyArticlePage implements PageProcessor {

    private String baseUrl = "https://www.cnblogs.com/";

    @Override
    public void process(Page page) {
        // 1. Parse the current page: one list per field, aligned by index
        List<String> titles = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/text()").all();
        List<String> urls = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/@href").all();
        List<String> infos = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/p[@class='post_item_summary']/text()").all();
        List<String> times = page.getHtml().xpath("//div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/div[@class='post_item_foot']/a/text()").all();
        // 2. Assemble the parsed fields into entities
        List<BkyArticle> articles = new ArrayList<>();
        for (int i = 0; i < titles.size(); i++) {
            BkyArticle article = new BkyArticle();
            article.setTitle(titles.get(i));
            article.setSummary(infos.get(i));
            article.setDetailurl(urls.get(i));
            article.setPubtime(parseTime(getTimeStr(times.get(i))));
            articles.add(article);
        }
        // 3. Hand the list to the result pipeline
        page.putField("list", articles);

        // 4. On the front page only, compute all paging URLs and queue them for crawling
        if (page.getUrl().get().equals(baseUrl)) {
            List<String> pageurls = new ArrayList<>();
            List<String> allpages = page.getHtml().xpath("//div[@id='paging_block']/div[@class='pager']/a/text()").all();
            // The second-to-last pager link holds the highest page number
            int maxPage = Integer.parseInt(allpages.get(allpages.size() - 2));
            for (int i = 2; i <= maxPage; i++) {
                pageurls.add(baseUrl + "sitehome/p/" + i);
            }
            page.addTargetRequests(pageurls);
        }
    }

    // Strip the leading author name from strings like "Feri 2022-03-10 00:28"
    private String getTimeStr(String s) {
        String s1 = s.trim();
        if (s1.indexOf(' ') > 0) {
            return s1.substring(s1.indexOf(' ') + 1);
        } else {
            return null;
        }
    }

    // Parse the publish time; fall back to "now" if it is missing or malformed
    private Date parseTime(String time) {
        if (time != null) {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            try {
                return sdf.parse(time);
            } catch (ParseException e) {
                e.printStackTrace();
                return new Date();
            }
        } else {
            return new Date();
        }
    }

    // 6 s timeout per request, 2 s delay between requests
    private Site site = Site.me().setTimeOut(6000).setSleepTime(2000);

    @Override
    public Site getSite() {
        return site;
    }
}
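Before wiring the processor into the Spring pipeline, its XPath extraction can be smoke-tested standalone with WebMagic's built-in ConsolePipeline, which prints the extracted ResultItems instead of saving them. A sketch, not part of the original article:

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

public class PageSmokeTest {
    public static void main(String[] args) {
        // Crawl only the front page and dump the extracted fields to the console
        Spider.create(new BkyArticlePage())
                .addPipeline(new ConsolePipeline())
                .addUrl("https://www.cnblogs.com/")
                .thread(1)
                .run();
    }
}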
Result pipeline

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;

@Repository
public class BkyArticPipeline implements Pipeline {

    @Autowired
    private BkyArticleDao bkyArticleDao;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Pull the list stored under "list" by the page processor and persist each article
        List<BkyArticle> articleList = resultItems.get("list");
        System.out.println("Articles scraped: " + articleList.size());
        for (BkyArticle a : articleList) {
            bkyArticleDao.save(a);
        }
    }
}
5. Write the launch endpoint

A controller exposes one endpoint to start the crawl and another to query the saved articles.

import io.swagger.annotations.Api;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;

@Api
@RestController
public class BkyArticController {

    @Autowired
    private BkyArticleService bkyArticleService;
    @Autowired
    private BkyArticlePage page;
    @Autowired
    private BkyArticPipeline pipeline;

    // Start the crawler; start() runs it asynchronously so this request
    // returns immediately (run() would block until the crawl finishes).
    // R is the project's own response wrapper.
    @GetMapping("/api/spider/start.do")
    public R start() {
        Spider.create(page).addPipeline(pipeline).addUrl("https://www.cnblogs.com/").thread(5).start();
        return R.ok("Crawl started");
    }

    // Query all scraped articles
    @GetMapping("/api/bkyartic/all.do")
    public R all() {
        return R.ok(bkyArticleService.list());
    }
}
6. Configure Swagger

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.service.Contact;
import springfox.documentation.spi.DocumentationType;
import springfox.documentation.spring.web.plugins.Docket;

@Configuration // Swagger configuration class
public class SwaggerConfig {

    // Build the document metadata shown on the Swagger UI page
    public ApiInfo createAI() {
        return new ApiInfoBuilder()
                .title("Article API")
                .description("A data API built on a web crawler")
                .contact(new Contact("Feri", "http://www.17feri.top", "xingfei_work@163.com"))
                .build();
    }

    // Register the Docket that tells Swagger which package to scan for endpoints
    @Bean
    public Docket createD() {
        return new Docket(DocumentationType.SWAGGER_2)
                .apiInfo(createAI())
                .select()
                .apis(RequestHandlerSelectors.basePackage("com.feri.point.controller"))
                .build();
    }
}
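The article never shows the datasource configuration the persistence layer needs. A minimal application.properties sketch, assuming a local MySQL instance and the db_data1906 database created above; the credentials are placeholders:

spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://localhost:3306/db_data1906?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=your_password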
7. Start and test

Run the application, call GET /api/spider/start.do (for example from the Swagger UI) to launch the crawl, then verify the data via GET /api/bkyartic/all.do or by querying the t_bkyarticle table directly.