侧边栏壁纸
博主头像
woku博主等级

成功的路上并不拥挤

  • 累计撰写 50 篇文章
  • 累计创建 13 个标签
  • 累计收到 3 条评论

Koa2+Puppeteer爬虫项目-Puppeteer的基本使用

woku
2021-12-10 / 0 评论 / 0 点赞 / 86 阅读 / 4,168 字

是什么

Puppeteer 是 Chrome 团队提供的一个 Node.js 库,可以通过代码控制(无头)Chrome/Chromium 浏览器,常用于页面数据爬取

使用

  • koa2-generator脚手架创建koa2项目(boss-web-crawler)
  • npm i puppeteer -S
  • 根路由中爬取页面轮播图数据
/**
 * GET / - launches a browser through Puppeteer, opens the Lagou home page and
 * logs the carousel (banner) entries scraped from it.
 *
 * NOTE(review): the handler only logs the result and never assigns ctx.body,
 * so nothing is returned to the HTTP client - confirm whether that is intended.
 */
router.get('/', async (ctx, next) => {
  const url = 'https://www.lagou.com/'
  // Launch the browser
  const bs = await pt.launch()

  try {
    // Open a new page (tab)
    const pg = await bs.newPage()
    // Navigate the page to the target URL
    await pg.goto(url, {
      timeout: 30 * 1000,
      waitUntil: 'networkidle2'
    })

    const result = await pg.evaluate(() => {
      // This callback runs inside the page, so it uses the page's own `window.$`
      const $ = window.$
      const $sliderBox = $(".banner_bg li")

      const sliderData = []
      $sliderBox.each((index, item) => {
        // Skip the first and the last <li> of the banner list
        if (index === 0 || index === $sliderBox.length - 1) return
        const el = $(item)
        const el_link = el.find('a')
        sliderData.push({
          cId: el_link.attr('data-lg-tj-no'),
          href: el_link.prop('href'),
          imgUrl: el_link.find('img').prop('src'),
        })
      })
      return sliderData
    })
    console.log(result)
  } finally {
    // Close the browser even when navigation or evaluation throws, so a
    // failed request does not leave a browser process running.
    await bs.close()
  }
})

注意:`launch`、`newPage`、`evaluate` 都是异步操作,都需要使用 await 等待其完成。

开启子进程

使用child_process开启子进程

新建 puppeteer 文件夹,并在其中创建 crawler.js 文件

const pt = require('puppeteer')

// Stand-alone crawl script meant to be run as a forked child process:
// it scrapes the Lagou banner list, sends the result to the parent over IPC
// and then exits.
;(async () => {
    const url = 'https://www.lagou.com/'
    // Launch the browser
    const bs = await pt.launch()
    let result

    try {
        // Open a new page (tab)
        const pg = await bs.newPage()
        // Navigate the page to the target URL
        await pg.goto(url, {
            timeout: 30 * 1000,
            waitUntil: 'networkidle2'
        })

        result = await pg.evaluate(() => {
            // This callback runs inside the page, so it uses the page's own `window.$`
            const $ = window.$
            const $sliderBox = $(".banner_bg li")

            const sliderData = []
            $sliderBox.each((index, item) => {
                // Skip the first and the last <li> of the banner list
                if (index === 0 || index === $sliderBox.length - 1) return
                const el = $(item)
                const el_link = el.find('a')
                sliderData.push({
                    cId: el_link.attr('data-lg-tj-no'),
                    href: el_link.prop('href'),
                    imgUrl: el_link.find('img').prop('src'),
                })
            })
            return sliderData
        })
    } finally {
        // Close the browser even when navigation or evaluation throws
        await bs.close()
    }

    // Hand the scraped data to the parent process
    process.send(result)
    // Exit from a timer callback rather than synchronously
    // (presumably so the IPC message gets a chance to be sent first)
    setTimeout(() => {
        process.exit(0)
    })
})().catch(err => {
    // Report the failure and exit non-zero so the parent's 'exit' listener
    // can tell the crawl did not succeed
    console.error(err)
    process.exit(1)
})

routers路由模块的index.js

const router = require('koa-router')(),
  cp = require('child_process'),
  { resolve } = require('path')

/**
 * GET / - runs the crawler script in a forked child process and logs what the
 * child reports (its message, its exit code, or a process error).
 *
 * NOTE(review): the handler does not wait for the child and never assigns
 * ctx.body - confirm whether a response payload is expected here.
 */
router.get('/', async (ctx, next) => {
  const script = resolve(__dirname, '../puppeteer/crawler.js')
  // Must be declared locally: assigning to an undeclared `child` would create
  // an implicit global shared by every request (and throws in strict mode)
  const child = cp.fork(script, [])

  // Ensures only the first of 'exit' / 'error' is handled
  let invoked = false

  // The child sent a message (the scraped data)
  child.on('message', data => {
    console.log(data)
  })

  // The child exited
  child.on('exit', code => {
    if (invoked) return
    invoked = true
    console.log(code)
  })

  // The child process reported an error
  child.on('error', err => {
    if (invoked) return
    invoked = true
    console.log(err)
  })
})


module.exports = router

封装爬虫和子进程程序

  • 针对路由,用控制器分层,封装逻辑
// Router for the crawler endpoints; every route is served under /crawler.
const createRouter = require('koa-router')
const router = createRouter()
const crawlerController = require('../controllers/Crawler')

// Route handling is delegated to the controller layer
const { crawlerSliderData } = crawlerController

router.prefix('/crawler')
router.get('/crawler_slider_data', crawlerSliderData)

module.exports = router

针对路由为/crawler/crawler_slider_data的处理函数,调用控制器层里的某个方法。

  • 新建控制器controllers/Crawler.js
/**
 * Controller that groups the crawler-related route handlers.
 */
class Crawler {
    // Handler registered for the slider-data route; the body is left as a
    // placeholder in this snippet.
    crawlerSliderData() {
       .....
    }
}

// Export a single instance rather than the class itself
module.exports = new Crawler()
  • crawlerSliderData函数中写逻辑(开启子进程,puppeteer爬取数据)
  • 开启子进程可以作为一个工具函数,通过参数options进行配置(根目录-> libs文件夹-> utils.js)

libs/utils.js


const cp = require('child_process'),
{ resolve } = require('path'),


module.exports = {
    startProcess(options) {
        const script = resolve(__dirname, options.path),
              child = cp.fork(script, [])
        let invoked = false
        child.on('message', data => {
            options.message(data)
        })
        child.on('exit', code => {
            if (invoked) return
            invoked = true
            options.exit(code)
        })
        child.on('error', error => {
            if (invoked) return
            invoked = true
            options.error(error)
        })
    },
}
  • 控制器的crawlerController.crawlerSliderData中调用startProcess函数开启子进程
const { startProcess, qiniuUpload } = require('../libs/utils')
const { qiniu } = require('../config/user_config')

/**
 * Controller for the crawler routes.
 */
class Crawler {
    /**
     * Starts the slider crawl script in a child process and logs whatever the
     * child reports: its message, its exit code, or a process error.
     */
    crawlerSliderData() {
        const processOptions = {
            // Script containing the slider-scraping logic
            path: '../crawler/slider.js',
            message: async (data) => {
                console.log(data)
            },
            exit: async (code) => {
                console.log(code)
            },
            error: async (err) => {
                console.log(err)
            }
        }

        startProcess(processOptions)
    }
}

// A single controller instance is exported
module.exports = new Crawler()

其中:path: '../crawler/slider.js',是具体爬取页面轮播图数据的逻辑

  • puppeteer进行爬取也可以封装成一个具体的库,参数options传递相关配置(根目录-> libs文件夹-> Crawler.js)
const pt = require('puppeteer')

module.exports = async function(options) {
    const bs = await pt.launch(),
    bg = await bs.newPage(),
    // 爬取哪个页面?
    url = options.url;

    await bg.goto(url, {
        waitUntil: 'networkidle2'
    })
    // 分析结束后,你要干什么?
    const result = await bg.evaluate(options.callback)

    process.send(result)
    setTimeout(() => {
        process.exit(0)
    }, 1000)
}
  • /crawler/slider.js文件中就可以调用上一步中封装的爬取函数
const crawler = require('../libs/Crawler')

// Crawl the Lagou home page and collect its carousel (banner) entries.
crawler({
    url: 'https://www.lagou.com/',
    // Evaluated inside the page, so it can only use what the page provides
    // (here the page's own `window.$`)
    callback() {
        const $ = window.$,
            $sliderBox = $(".banner_bg li")

        let sliderData = []
        $sliderBox.each((index, item) => {
            // Skip the first and the last <li> of the banner list
            if (index === 0 || index === $sliderBox.length - 1) return
            const el = $(item),
                el_link = el.find('a');
            const dataItem = {
                cId: el_link.attr('data-lg-tj-no'),
                href: el_link.prop('href'),
                imgUrl: el_link.find('img').prop('src'),
                imgKey: ''
            }
            sliderData.push(dataItem)
        })
        return sliderData
    }
}).catch(err => {
    // Do not leave the promise floating: report the failure and exit non-zero
    // so the parent's 'exit' listener can tell the crawl failed
    console.error(err)
    process.exit(1)
})

封装后的目录结构为:

0

评论区