shua.js 笔记
        【摘要】 
                    let superagent = require('superagent'); let cheerio = require('cheerio'); let baseUrl = 'https://blog.csdn.net/github_35631540/article/list/1'; // https://blog.csdn.net/githu...
    
    
    
    
  
   - 
    
     
    
    
     
      let superagent = require('superagent')
     
    
 
   - 
    
     
    
    
     
      let cheerio = require('cheerio')
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      // First page of the article list; this is the URL mainFun requests.
      const baseUrl = 'https://blog.csdn.net/github_35631540/article/list/1'

      // Shape of the paginated list URLs:
      //   https://blog.csdn.net/github_35631540/article/list/1?t=1&
      //   https://blog.csdn.net/github_35631540/article/list/2?t=1&
      //   https://blog.csdn.net/github_35631540/article/list/3?t=1&

      // Accumulates the { name, href } entries that mainFun pushes.
      const blogHrefArr = []

      // Page-count bound checked by getAllBlogHreef before it recurses.
      const totalPage = 4
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      // Request headers passed to `.set()` on every superagent request in this script.
      const setData = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        // The HTTP request header is spelled "Referer" (single "r" in the middle);
        // a "Referrer" key would be sent as an unrelated, non-standard header.
        'Referer': 'https://blog.csdn.net/github_35631540?t=1',
        'Content-Type': 'text/html; charset=UTF-8',
      }
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      /**
       * Request the list page at `baseUrl`, read every article title and link
       * from it, call `getBlogDetail` for each entry and append the entry to
       * `blogHrefArr`.
       *
       * The request is asynchronous and the function returns nothing.
       */
      let mainFun = () => {
        superagent
          .get(`${baseUrl}`)
          .set(setData)
          .end((err, res) => {
            // `res` may be missing when the request fails; report the failure
            // instead of crashing on `res.text`.
            if (err || !res) {
              console.error(`获取博客列表失败: ${err ? err.message : '无响应'}`)
              return
            }

            let $ = cheerio.load(res.text)
            // Run each selector once rather than on every loop iteration.
            let links = $('.article-list .content a')
            let titles = $('.article-list h4')
            let len = links.length
            if (len === 0) {
              return
            }

            console.log(`获取到${len}条博客记录`)
            console.log(`开始爬取博客记录....`)
            for (let i = 0; i < len; i++) {
              let blogItem = {
                // Strip all whitespace from the title text.
                name: titles.eq(i).text().replace(/\s+/g, ''),
                href: links.eq(i).attr('href'),
              }
              getBlogDetail(blogItem)
              blogHrefArr.push(blogItem)
            }
          })
      }
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      /**
       * Request a single article page and log a success message when the
       * response status code is 200.
       *
       * @param {{name: string, href: string}} blogItem article title and URL
       */
      let getBlogDetail = (blogItem) => {
        superagent
          .get(`${blogItem.href}`)
          .set(setData)
          .end((err, res) => {
            // `res` may be missing when the request fails; report the failure
            // instead of crashing on `res.statusCode`.
            if (err || !res) {
              console.error(`爬取失败:__${blogItem.name}`)
              return
            }
            if (res.statusCode === 200) {
              console.log(`爬取成功:__${blogItem.name}`)
            }
          })
      }
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      /**
       * Recursively walk the paginated article list. Fetches page `n + 1`,
       * calls `getBlogDetail` for every article found on it, then recurses
       * with the next index while that index is below `totalPage`. The walk
       * stops early at the first page that has no article links.
       *
       * The work happens in asynchronous callbacks, so the function returns
       * nothing to its caller.
       *
       * @param {number} n zero-based index of the page to fetch
       */
      let getAllBlogHreef = (n) => {
        superagent
          .get(`https://blog.csdn.net/github_35631540/article/list/${n + 1}?t=1&`)
          .set(setData)
          .end((err, res) => {
            // `res` may be missing when the request fails; report the failure
            // and stop the walk instead of crashing on `res.text`.
            if (err || !res) {
              console.error(`获取第${n + 1}页博客列表失败: ${err ? err.message : '无响应'}`)
              return
            }

            let $ = cheerio.load(res.text)
            // Run each selector once rather than on every loop iteration.
            let links = $('.article-list .content a')
            let titles = $('.article-list h4')
            let len = links.length
            if (len === 0) {
              return
            }

            console.log(`获取到${len}条博客记录`)
            console.log(`开始获取博客地址....`)
            for (let i = 0; i < len; i++) {
              getBlogDetail({
                // Strip all whitespace from the title text.
                name: titles.eq(i).text().replace(/\s+/g, ''),
                href: links.eq(i).attr('href'),
              })
            }

            // Use a fresh local for the next index rather than mutating the parameter.
            let nextPage = n + 1
            if (nextPage < totalPage) {
              getAllBlogHreef(nextPage)
            }
          })
      }
     
    
 
   - 
    
     
    
    
      
     
    
 
   - 
    
     
    
    
     
      // Entry point: start the walk at page index 0. The call is not wrapped in
      // console.log because getAllBlogHreef returns nothing synchronously, so
      // logging its result would only print `undefined`.
      getAllBlogHreef(0)
     
    
 
  
 
 
cd E:\Main_Pro\FUN\upblog & node shua.js
 
文章来源: fizzz.blog.csdn.net,作者:拿我格子衫来,版权归原作者所有,如需转载,请联系作者。
原文链接:fizzz.blog.csdn.net/article/details/90032834
        【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
            cloudbbs@huaweicloud.com
        
        
        
        
        
        
        - 点赞
 - 收藏
 - 关注作者
 
            
           
评论(0)