instagram 内容抓取

抓取说明

1、需要登录信息,即抓取时需要附带cookie,同时需要user-agent

2、数据获取接口及下载均有频率限制,无间隔的请求(几百个资源)会被限制,在被限制后睡眠一定时间继续。

3、内容抓取分为两个入口

  • 一个是抓取某个用户发布的所有资源
  • 一个是抓取某个tag下的所有资源

两种入口附带的cookie不同,请求的URL不同。

4、抓取步骤:

  1. 电脑端登陆ins,保存 cookie、query_hash、user-agent 信息。后续所有请求附带 cookie、user-agent
  2. 模拟请求个人主页/tag主页,通过解析HTML页面,得到userId/tag name。同时拿到第一页的数据及下页cursor。
  3. 通过API接口,根据cursor持续获取多页数据。所有数据获取完毕后开始下载。
  4. 返回的数据中,图片资源可以直接下载。视频资源需要再次请求视频地址获取接口获得视频地址,然后再下载。

5、请求数据接口:

user:

https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D

tag:

https://www.instagram.com/graphql/query/?query_hash=1780c1b186e2c37de9f7da95ce41bb67&variables=%7B%22tag_name%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D

获取视频的地址:

https://www.instagram.com/p/%s/?__a=1

核心代码

/**
 * Fetch the landing page of a crawl target and extract its first page of posts.
 *
 * A `user` target requests the profile page with `userCookie`; any other type
 * requests the tag page with `tagCookie`. The JSON embedded in
 * `window._sharedData` is parsed, every post of the first page is passed to
 * `storeMedia`, and the paging information needed by the API is returned.
 *
 * @param {{name: string, type: string}} item crawl target (user name or tag name)
 * @returns {Promise<{totalPage: number, userId: string, cursor: string}>}
 *   resolves with the page count, the user id (or tag name) and the cursor of
 *   the next page; rejects when the request fails or the page cannot be parsed.
 */
const getHtml = item => {
    const userName = item.name
    const type = item.type
    const isUser = type === 'user'

    // The shared `headers` object is reused by every request; only the cookie differs.
    headers.cookie = isUser ? userCookie : tagCookie
    const options = {
        method: 'GET',
        url: isUser ? `${baseUrl}${userName}/` : `${baseUrl}explore/tags/${userName}/`,
        headers: headers
    }

    return new Promise((resolve, reject) => {
        request(options, function (error, response, body) {
            if (error) return reject(error)

            // Parsing happens inside an async callback: an exception thrown here
            // would bypass the promise and crash the process, so turn it into a rejection.
            try {
                const html = cheerio.load(body).html()

                // uid for users, tag name for tags
                const idMatch = isUser
                    ? html.match(/"profilePage_([0-9]+)"/)
                    : html.match(/"name":"([a-zA-Z_]+)",/)
                if (!idMatch) throw new Error(`${userName} id/name not found in page`)
                const userId = idMatch[1]
                log.info(`${userName} id/name 获取成功 ${userId}`)

                // first page of data embedded in the HTML
                const dataMatch = html.match(/<script type="text\/javascript">window._sharedData = (.*?);<\/script>/)
                if (!dataMatch) throw new Error(`${userName} _sharedData not found in page`)
                const data = JSON.parse(dataMatch[1])

                const firstPage = isUser
                    ? data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media
                    : data.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media

                // store the posts of the first page
                firstPage.edges.forEach(edge => {
                    edge.mode = type
                    storeMedia(edge)
                })

                // paging information for the follow-up API requests
                return resolve({
                    totalPage: Math.ceil(firstPage.count / purePage),
                    userId: userId,
                    cursor: firstPage.page_info.end_cursor
                })
            } catch (parseError) {
                return reject(parseError)
            }
        })
    })
}

/**
 * Collect all posts of a user/tag by walking the paginated API.
 *
 * Builds an async.waterfall of `fetchData` steps (at most `pageLimit`), seeded
 * with the target, its id and the first cursor.
 *
 * @param {{name: string, type: string}} item crawl target
 * @param {number} totalPage number of pages reported by the first page
 * @param {string} uid user id or tag name
 * @param {string} cursor cursor of the page to fetch first
 * @returns {Promise<Array>} resolves with the shared `media` list; never rejects
 *   (a waterfall "error" only means that paging stopped early).
 */
const getAllUrls = (item, totalPage, uid, cursor) => {
    const userName = item.name
    // async.constant feeds the initial arguments into the first fetchData step
    const actions = [async.constant(item, uid, cursor)]
    const limit = totalPage > pageLimit ? pageLimit : totalPage
    for (let i = 0; i < limit; i++) {
        actions.push(fetchData)
    }
    log.info(`${userName} 数据共 ${totalPage} 页`)
    return new Promise(resolve => {
        async.waterfall(actions, () => {
            log.info(`${userName} 的所有帖子数据获取成功,共${media.length}个帖子,视频${videoCount}个,图片${imgCount}个`)
            fetchPageCount = 0
            return resolve(media)
        })
    })
}

/**
 * Fetch one page of posts through the GraphQL API; one step of the
 * async.waterfall built by getAllUrls.
 *
 * On a transport error or a `status: 'fail'` answer the step sleeps one minute
 * and passes the SAME cursor on, so the following waterfall step retries it.
 *
 * @param {{name: string, type: string}} item crawl target
 * @param {string} uid user id or tag name
 * @param {string} offset cursor of the page to fetch
 * @param {Function} next waterfall callback `(err, item, uid, cursor)`; a truthy
 *   `err` (including the "no more pages" message) stops the waterfall.
 */
const fetchData = (item, uid, offset, next) => {
    const userName = item.name
    const type = item.type
    const isUser = type === 'user'

    headers.cookie = isUser ? userCookie : tagCookie
    const options = {
        method: 'GET',
        url: util.format(isUser ? fetchUserUrl : fetchTagUrl, uid, offset),
        headers: headers
    }

    // Hand the unchanged cursor to the next step after `delay` milliseconds.
    const retryLater = delay => setTimeout(() => next(null, item, uid, offset), delay)

    request(options, function (error, response, body) {
        if (error) {
            log.error('fetch data error', error)
            log.info('休息1min~')
            return retryLater(1 * 60 * 1000)
        }

        let data
        try {
            data = JSON.parse(body)
        } catch (parseError) {
            log.error('json序列化失败', parseError)
            return next(null, item, uid, offset)
        }

        if (data.status == 'fail') {
            log.error('返回内容失败', data)
            log.info('休息1min~')
            return retryLater(1 * 60 * 1000)
        }

        let listData
        try {
            listData = isUser
                ? data.data.user.edge_owner_to_timeline_media
                : data.data.hashtag.edge_hashtag_to_media
        } catch (shapeError) {
            log.error('数据获取失败', shapeError)
            // must return: falling through would dereference an undefined listData
            return next(shapeError)
        }
        if (!listData) {
            const missing = new Error('media list missing in response')
            log.error('数据获取失败', missing)
            return next(missing)
        }

        const edges = listData.edges
        edges.forEach(edge => {
            edge.mode = type
            storeMedia(edge)
        })
        const { has_next_page, end_cursor } = listData.page_info

        log.info(`page:${++fetchPageCount} ${userName} 数据获取成功,帖子 ${edges.length} 个, has_next_page: ${has_next_page} ,end_cursor: ${end_cursor}`)

        if (!has_next_page) {
            // a truthy first argument ends the waterfall early
            return next('所有数据获取完毕,无下页')
        }
        // short pause between pages to stay below the rate limit
        setTimeout(function () {
            return next(null, item, uid, end_cursor)
        }, 2000)
    })
}

/**
 * Look up the download address of a video by its shortcode.
 *
 * @param {string} mode 'user' selects `userCookie`, anything else `tagCookie`
 * @param {string} shortcode shortcode of the post
 * @returns {Promise<string>} the video URL, or '' when the request fails or the
 *   answer holds no `video_url`; the promise never rejects.
 */
const fetchVideoUrl = (mode, shortcode) => {
    headers.cookie = mode === 'user' ? userCookie : tagCookie
    const options = {
        method: 'GET',
        url: util.format(getVideoUrl, shortcode),
        headers: headers
    }

    return new Promise(resolve => {
        request(options, function (error, response, body) {
            if (error) {
                log.error(`获取 ${shortcode} 视频地址失败`, error)
                return resolve('')
            }

            let videoUrl = ''
            try {
                // normalise a missing field to '' so callers always get a string
                videoUrl = JSON.parse(body).graphql.shortcode_media.video_url || ''
            } catch (parseError) {
                log.error(`获取 ${shortcode} videoUrl 为空`)
            }
            return resolve(videoUrl)
        })
    })
}

/**
 * Convert one API edge into a media record and append it to the shared `media` list.
 *
 * Videos use `node.video_url` when present, otherwise the address is looked up
 * through fetchVideoUrl; images use `node.display_url`. A record without a
 * usable URL is still stored, just without `type`/`url`.
 *
 * NOTE(review): this function is async and the visible callers do not await it,
 * so a record that needs a video lookup may be pushed after the caller moved on.
 *
 * @param {{node: Object, mode: string}} item API edge, tagged with the crawl mode
 * @returns {Promise<void>}
 */
const storeMedia = async item => {
    const node = item.node
    // the caption container or its first entry may be missing
    const captionEdges = node.edge_media_to_caption ? node.edge_media_to_caption.edges : null
    const firstCaption = captionEdges ? captionEdges[0] : null
    const result = {
        id: node.id,
        desc: firstCaption ? firstCaption.node.text : ''
    }

    if (node.is_video) {
        // use video_url directly when present, otherwise ask the API for it
        let videoUrl = node.video_url
        if (!videoUrl) videoUrl = await fetchVideoUrl(item.mode, node.shortcode)
        if (videoUrl) {
            result.type = 'video'
            result.url = videoUrl
            videoCount++
        }
    } else {
        const imgUrl = node.display_url
        if (imgUrl) {
            result.type = 'img'
            result.url = imgUrl
            imgCount++
        }
    }
    media.push(result)
}

/**
 * Download one video/image to disk; one step of an async.series queue.
 *
 * Already downloaded files and records without a known type are skipped.
 * Failures are logged and the queue continues (`next` never receives an error).
 *
 * @param {string} category user name / tag the media belongs to
 * @param {{id: string, type: string, url: string, desc: string}} media record to download
 * @param {Function} next queue callback, invoked exactly once
 */
const download = (category, media, next) => {
    if (isFileExist(media.id)) return next(null)

    let filePath
    if (media.type === 'video') {
        filePath = `${videoDlPath}/${media.id}.mp4`
    } else if (media.type === 'img') {
        filePath = `${imgDlPath}/${media.id}.jpg`
    } else return next(null)

    // Several events can report the end of one download (finish, stream error,
    // request error); make sure the queue callback fires only once.
    let finished = false
    const done = () => {
        if (finished) return
        finished = true
        return next(null)
    }

    const st = new Date()
    request(media.url)
        .on('response', function (res) {
            // Do not store an error page (e.g. a rate-limit answer) as media:
            // the file would be treated as "already downloaded" on the next run.
            if (res.statusCode !== 200) {
                log.error('download error', `unexpected status ${res.statusCode}`, media.url)
                res.resume()
                return done()
            }

            const fws = fs.createWriteStream(filePath)
            res.pipe(fws)

            // 'finish' fires once all data has been flushed to the file
            fws.on('finish', function () {
                const et = new Date()
                const ut = timeUsed((et - st) / 1000)
                log.info(`${videoDl + imgDl} finish download ${category} ${filePath},用时${ut}`)
                saveJsonData(media.type, {
                    id: media.id,
                    category: category,
                    desc: media.desc
                })
                if (media.type === 'video') videoDl++
                else imgDl++

                return done()
            })
            fws.on('error', err => {
                log.error('download error', err)
                return done()
            })
            res.on('error', err => {
                log.error('download error', err)
                return done()
            })
        })
        .on('error', function (err) {
            log.error('request source failed', media.url, err)
            // recovery takes roughly 3 minutes
            log.info('超频啦!休息1分钟~')
            setTimeout(done, 1 * 60 * 1000)
        })
}

/**
 * Check whether a media id has already been downloaded as video or image.
 *
 * @param {string} id media id
 * @returns {boolean} true when `<videoDlPath>/<id>.mp4` or `<imgDlPath>/<id>.jpg` exists
 */
const isFileExist = id => {
    const videoPath = `${videoDlPath}/${id}.mp4`
    if (fs.existsSync(videoPath)) {
        log.info('video file exist', videoPath)
        return true
    }
    const imgPath = `${imgDlPath}/${id}.jpg`
    if (fs.existsSync(imgPath)) {
        log.info('img file exist', imgPath)
        return true
    }
    return false
}

/**
 * Append one record to the JSON index right after a successful download, so the
 * information survives a crash in the middle of a run.
 *
 * Reads `<videoJsonPath|imgJsonPath>/data.json`, appends `data` and writes the
 * file back. Failures are logged and otherwise ignored (best effort).
 *
 * @param {string} type 'video' selects the video index, anything else the image index
 * @param {Object} data record to append
 */
const saveJsonData = (type, data) => {
    try {
        const jsonFile = `${type == 'video' ? videoJsonPath : imgJsonPath}/data.json`

        // load what has been saved so far
        let jsonData = []
        if (fs.existsSync(jsonFile)) {
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            })
            if (fileData) {
                jsonData = JSON.parse(fileData)
            }
        }

        jsonData.push(data)
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData))
    } catch (error) {
        // include the cause; otherwise a corrupt index is impossible to diagnose
        log.error('写入json文件失败', data, error)
    }
}

/**
 * Reset the per-task state: the collected media list and the four counters
 * (posts found and files downloaded, for videos and images).
 */
const clearData = () => {
    videoCount = imgCount = videoDl = imgDl = 0
    media = []
}

/**
 * Download every collected resource of one user/tag, strictly one after another.
 *
 * @param {string} userName user name / tag, passed to `download` as category
 * @param {Array<Object>} data media records to download
 * @returns {Promise<Array>} resolves with the async.series result once the
 *   queue has finished; never rejects.
 */
const downloadAll = (userName, data) => {
    const dlActions = data.map(item => next => {
        download(userName, item, next)
    })
    return new Promise(resolve => {
        async.series(dlActions, (error, result) => resolve(result))
    })
}

/**
 * Format a duration for log output.
 *
 * @param {number} t duration in seconds
 * @returns {string} `Ns` below a minute (rounded up), `NmNs` below an hour,
 *   `NhNm` below a day, otherwise `Nd Nh`
 */
const timeUsed = t => {
    const MINUTE = 60
    const HOUR = 60 * MINUTE
    const DAY = 24 * HOUR

    // [1s, 1m)
    if (t < MINUTE) return `${Math.ceil(t)}s`
    // [1m, 1h)
    if (t < HOUR) return `${Math.floor(t / MINUTE)}m${Math.floor(t % MINUTE)}s`
    // [1h, 1d)
    if (t < DAY) return `${Math.floor(t / HOUR)}h${Math.floor(t % HOUR / MINUTE)}m`
    // [1d, ~)
    return `${Math.floor(t / DAY)}d ${Math.floor(t % DAY / HOUR)}h`
}

/**
 * Crawl one user/tag: read the first page, collect all pages, download everything.
 *
 * @param {{name: string, type: string}} item crawl target
 * @param {Function} next queue callback; always called with `null` so that one
 *   failing target does not stop the remaining ones
 */
const task = async (item, next) => {
    const userName = item.name

    let pageInfo
    try {
        pageInfo = await getHtml(item)
    } catch (error) {
        log.error('fetch error', error)
        // stop here: there is no paging information to continue with
        return next(null)
    }
    const { totalPage, userId, cursor } = pageInfo

    const data = await getAllUrls(item, totalPage, userId, cursor)

    // `data` keeps the collected list; reset the shared state for the download counters
    clearData()

    const st = new Date()
    await downloadAll(userName, data)
    const et = new Date()
    const ut = timeUsed((et - st) / 1000)
    log.info(`${userName} 所有下载完成, video ${videoDl} 个,img ${imgDl} 个,共用时 ${ut}`)
    clearData()
    return next(null)
}

/**
 * Entry point: run the crawl task of every configured target in sequence and
 * exit the process when all of them are done.
 */
const main = () => {
    const actions = target.map(item => next => {
        task(item, next)
    })
    async.series(actions, (error, result) => {
        log.info(`所有 ${result.length} 个任务完成`, error)
        process.exit(0)
    })
}

main()

完整代码: https://github.com/flute/instagram-crawler

coub.com 内容抓取

抓取说明

1、总共17个分类。

2、数据获取

  • url:https://coub.com/api/v2/timeline/hot/movies/half?per_page=25
  • 说明:movies 为分类。 per_page 为每页返回的数据量[1,25]。首次获取只需传入 page=1 即为第一页的数据。下次请求附带字段 anchor 为上次请求返回的 next 参数即可。

3、每个资源的属性:

  • 唯一标志: id、permalink
  • 资源描述: title

4、下载

coub.com的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用FFmpeg合并。
下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl

5、分类数组

["animals-pets", "mashup", "anime", "movies", "gaming", "cartoons", "art", "music", "sports", "science-technology", "celebrity", "nature-travel", "fashion", "dance", "cars", "nsfw"]

核心代码:

/**
 * Fetch one page of the hot timeline of a category; one step of the
 * async.waterfall built by getMultiVideo. Each request returns up to
 * `per_page` videos, which are appended to the shared `videoList`.
 *
 * @param {string} c category
 * @param {number} page page to request
 * @param {number} anchor `next` value of the previous answer; keeps pages from overlapping
 * @param {Function} next waterfall callback `(err, c, nextPage, nextAnchor)`
 */
function getCoubVideoList(c, page = 1, anchor, next) {
    if (!c) {
        log.error('category empty', c)
        return next(new Error('category empty'), null)
    }
    const options = {
        method: 'GET',
        url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
        qs: {
            page: page,
            per_page: per_page
        }
    }
    if (anchor) options.qs.anchor = anchor

    request(options, function (error, response, body) {
        if (error) return next(error, null)

        let data
        try {
            data = JSON.parse(body)
        } catch (parseError) {
            // a non-JSON answer (e.g. an HTML error page) must not crash the process
            return next(parseError, null)
        }
        if (!data) return next(new Error('empty response'), null)

        if (data.coubs && data.coubs.length) {
            log.info(`获取视频列表成功 page ${page}`, data.next, data.coubs.length)
            videoList = videoList.concat(data.coubs)
        } else {
            log.info(`获取内容为空 page ${page}`)
        }
        return next(null, c, ++data.page, data.next)
    })
}

/**
 * Ask the API how many pages the hot timeline of a category has.
 *
 * @param {string} c category
 * @returns {Promise<number>} resolves with `total_pages`; rejects when the
 *   request fails, the answer is not JSON or the field is missing/zero
 */
const getTotalPage = (c) => {
    const options = {
        method: 'GET',
        url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
        qs: {
            page: 1,
            per_page: per_page
        }
    }
    return new Promise((resolve, reject) => {
        request(options, function (error, response, body) {
            // keep the original error object (and its stack) instead of re-wrapping it
            if (error) return reject(error instanceof Error ? error : new Error(error))

            let data
            try {
                data = JSON.parse(body)
            } catch (parseError) {
                return reject(parseError)
            }

            if (data && data.total_pages) {
                log.info(`获取${c}总页数成功`, data.total_pages)
                return resolve(data.total_pages)
            }
            log.info(`获取${c}总页数失败`)
            return reject(new Error('页数为空'))
        })
    })
}

/**
 * Fetch every page of a category, one request after another.
 *
 * @param {string} c category
 * @returns {Promise<Array>} resolves with the shared `videoList`; rejects when
 *   the page count cannot be read or a page request fails
 */
const getMultiVideo = async c => {
    const totalPage = await getTotalPage(c)
    // one waterfall step per page, seeded with the configured start position
    const actions = [async.constant(c, startPage, startAnchor)]
    for (let i = 1; i <= totalPage; i++) {
        actions.push(getCoubVideoList)
    }
    return new Promise((resolve, reject) => {
        async.waterfall(actions, function (err, result) {
            log.info(`finish crawler ${c} videos`, err, videoList.length)
            if (err) return reject(err instanceof Error ? err : new Error(err))
            return resolve(videoList)
        })
    })
}

/**
 * Download one coub (video merged with audio) by its permalink; one step of
 * the async.series queue built by doDownload.
 *
 * @param {string} c category, stored with the video info
 * @param {{permalink: string, title: string}} video API record of the coub
 * @param {Function} next queue callback `(err, result)`, invoked exactly once:
 *   '' for skipped/unfetchable videos, the file name for existing files, the
 *   write result on success, an error when writing fails
 */
async function downloadFile(c, video, next) {
    if (!video || !video.permalink) return next(null, '')
    const id = video.permalink
    const filename = `${dlPath}/${id}.mp4`

    // file already downloaded
    if (isFileExist(id)) return next(null, filename)

    let coub
    try {
        coub = await Coub.fetch(`http://coub.com/view/${id}`)
    } catch (error) {
        // return right here; continuing would invoke `next` a second time below
        console.log('fetch error', error)
        return next(null, '')
    }
    if (!coub) return next(null, '')

    coub.attachAudio()
    if (fastMode) coub.addOption('-c', 'copy')
    coub.addOption('-shortest')

    const ts = new Date()
    let result
    try {
        result = await coub.write(filename)
    } catch (error) {
        log.error(`download error ${id}.mp4`, error)
        return next(error, '')
    }

    const tu = (new Date() - ts) / 1000
    log.info(`${downloadCount}:finish download ${c} ${id}.mp4`, filename, `用时${tu}s`)
    downloadCount++

    const videoInfo = {
        desc: video.title,
        category: c,
        filename: `${id}.mp4`
    }
    // persist immediately so the info survives a crash
    saveJsonData(videoInfo)
    dlFilesJson.push(videoInfo)
    return next(null, result)
}

/**
 * Check whether a video has already been downloaded into any of the known
 * output directories (relative to this script).
 *
 * @param {string} id video permalink
 * @returns {boolean} true when `<dir>/video/<id>.mp4` exists in one of them
 */
const isFileExist = id => {
    // checked in this order; the first hit wins
    const dirs = ['src', 'downloads', 'weekly', 'monthly', 'quarter', 'half']
    for (const dir of dirs) {
        const candidate = path.resolve(__dirname, `./${dir}/video/${id}.mp4`)
        if (fs.existsSync(candidate)) {
            log.info('file exist', candidate)
            return true
        }
    }
    return false
}

/**
 * Append one video record to `<jsonPath>/all.json` right after a successful
 * download, so the information survives a crash in the middle of a run.
 * Failures are logged and otherwise ignored (best effort).
 *
 * @param {*} data record to append
 */
const saveJsonData = data => {
    try {
        const jsonFile = `${jsonPath}/all.json`

        // load what has been saved so far
        let jsonData = []
        if (fs.existsSync(jsonFile)) {
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            })
            if (fileData) {
                jsonData = JSON.parse(fileData)
            }
        }

        jsonData.push(data)
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData))
    } catch (error) {
        // include the cause; otherwise a corrupt index is impossible to diagnose
        log.error('写入json文件失败', data, error)
    }
}

/**
 * Crawl and download all videos of one category.
 *
 * Merging with `-c copy` (fast mode) is quick but yields many videos with broken
 * audio. Re-encoding is slow and CPU-heavy, and several merges at once can
 * freeze the machine. Downloads therefore run one by one in a queue; the only
 * drawback is the time it takes.
 *
 * @param {string} c category
 * @returns {Promise<Array>} resolves with the per-video queue results after
 *   `<jsonPath>/<c>.json` has been written; rejects when a download reports an error
 */
async function doDownload(c) {
    const result = await getMultiVideo(c)
    videoList = []
    // flatten one level, in case entries are arrays themselves
    const data = result.reduce((all, item) => all.concat(item), [])
    log.info(`要抓取的 ${c} 类型的视频总数为 ${data.length} 个`)

    const actions = data.map(video => next => {
        downloadFile(c, video, next)
    })

    return new Promise((resolve, reject) => {
        const st = new Date()
        async.series(actions, function (err, result) {
            const et = new Date()
            const ut = timeUsed((et - st) / 1000)
            log.info(`finish download ${c} video, 耗时 ${ut}`, err, result.length)

            if (err) return reject(err instanceof Error ? err : new Error(err))
            // per-category json
            fs.writeFileSync(`${jsonPath}/${c}.json`, JSON.stringify(dlFilesJson))
            dlFilesJson = []
            downloadCount = 1
            return resolve(result)
        })
    })
}

/**
 * Download every category, strictly one after another; the first failing
 * category rejects and skips the remaining ones.
 *
 * @returns {Promise<boolean>} resolves with `true` once all categories are done
 */
async function main() {
    const categories = [
        'animals-pets',
        'mashup',
        'anime',
        'movies',
        'gaming',
        'cartoons',
        'art',
        'music',
        'news',
        'sports',
        'science-technology',
        'celebrity',
        'nature-travel',
        'fashion',
        'dance',
        'cars',
        'nsfw'
    ]

    for (const category of categories) {
        await doDownload(category)
    }

    return true
}

/**
 * Format a duration for log output.
 *
 * @param {number} t duration in seconds
 * @returns {string} `Ns` below a minute (rounded), `NmNs` below an hour,
 *   `NhNm` below a day, otherwise `Nd Nh`
 */
const timeUsed = t => {
    const MINUTE = 60
    const HOUR = 60 * MINUTE
    const DAY = 24 * HOUR

    // [1s, 1m)
    if (t < MINUTE) return `${Math.round(t)}s`
    // [1m, 1h)
    if (t < HOUR) return `${Math.floor(t / MINUTE)}m${Math.floor(t % MINUTE)}s`
    // [1h, 1d)
    if (t < DAY) return `${Math.floor(t / HOUR)}h${Math.floor(t % HOUR / MINUTE)}m`
    // [1d, ~)
    return `${Math.floor(t / DAY)}d ${Math.floor(t % DAY / HOUR)}h`
}

// Run the crawler, report the outcome and always exit afterwards.
main()
    .then(result => {
        const endTime = new Date()
        const usedTime = timeUsed((endTime - startTime) / 1000)
        log.info(`all downloads finish,${result} 个视频,共耗时 ${usedTime}`)
    })
    .catch(error => {
        log.error('download error', error)
    })
    .then(() => {
        process.exit(0)
    })

// Last resort: dump the info of the videos downloaded so far before the state is lost.
process.on('uncaughtException', err => {
    log.info(err)
    log.info(JSON.stringify(dlFilesJson))
})

完整代码: https://github.com/flute/coub-crawler

9GAG.com 内容抓取

抓取说明

1、总共52个分类。

2、数据获取

  • url:https://9gag.com/v1/group-posts/group/cute/type/hot?c=10
  • 说明:cute 为分类。首次获取只需传入 c=10 即为前十条数据。下次请求附带上次请求返回的 nextCursor 参数即可。每次请求返回10条数据。

3、每个资源的属性:

  • 唯一标志: id
  • 资源描述: title

4、资源分三种类型,根据images属性下的字段区分

  1. image  属性:image460    image700  
  2. gif  属性:image460    image460sv  image460svwm    image700 说明:image460sv、image460svwm 两个属性下的 hasAudio 字段为0,即无声,为GIF  
  3. video  属性:image460    image460sv  image460svwm   image700 说明:image460sv、image460svwm 两个属性下的 hasAudio 字段为1,即有声,为video  

5、内容字段

image460: {
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
webpUrl: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460swp.webp"
width: 460
}

image460sv: {
duration: 32
h265Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svh265.mp4"
hasAudio: 1
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460sv.mp4"
vp9Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svvp9.webm"
width: 460
}

image460svwm: {
duration: 32
hasAudio: 1
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svwm.webm"
width: 460
}

image700: {
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
width: 460
}

6、分类数组

[
'funny',
'cute',
'anime-manga',
'ask9gag',
'awesome',
'basketball',
'car',
'comic',
'cosplay',
'country',
'classicalartmemes',
'imadedis',
'drawing',
'animefanart',
'food',
'football',
'fortnite',
'gaming',
'gif',
'girl',
'girly',
'guy',
'history',
'horror',
'home',
'kpop',
'leagueoflegends',
'lego',
'movie-tv',
'music',
'overwatch',
'pcmr',
'photography',
'pokemon',
'politics',
'relationship',
'pubg',
'roastme',
'savage',
'starwars',
'satisfying',
'school',
'science',
'superhero',
'surrealmemes',
'sport',
'travel',
'timely',
'video',
'warhammer',
'wallpaper',
'wtf'
]

核心代码:

/**
 * Fetch one batch of posts of a category; one step of the async.waterfall
 * built by getMultiList. Posts are appended to the shared `videoList`.
 *
 * @param {string} category 9GAG group name
 * @param {string|number} offset '' for the first batch, the `nextCursor` of the
 *   previous answer afterwards, or -1 once the previous step found no more data
 * @param {Function} next waterfall callback `(err, category, nextOffset)`;
 *   receives 'complete' as `err` to stop the waterfall when offset is -1
 */
const get9gagList = (category, offset, next) => {
    const options = {
        method: 'GET',
        url: `https://9gag.com/v1/group-posts/group/${category}/type/${type}?`
    }

    if (offset == '') {
        options.url += 'c=10'
    } else if (offset == -1) {
        return next('complete')
    } else {
        options.url += offset
    }

    request(options, function (error, response, body) {
        if (error) return next(error, null)

        let data
        try {
            data = JSON.parse(body)
        } catch (parseError) {
            // a non-JSON answer (e.g. an HTML error page) must not crash the process
            return next(parseError, null)
        }

        if (data && data.data && data.data.posts && data.data.posts.length) {
            log.info(`获取 ${category} 视频列表成功 offset ${offset? offset: 'c=10'}`, data.data.posts.length)
            videoList = videoList.concat(data.data.posts)
            return next(null, category, data.data.nextCursor)
        }
        log.info(`获取 ${category} 内容为空 offset ${offset},所有数据获取完毕 。`)
        return next(null, category, -1)
    })
}

/**
 * Fetch up to `pageCount` batches of a category, one request after another.
 *
 * @param {string} category 9GAG group name
 * @returns {Promise<Array>} resolves with the shared `videoList`; never rejects,
 *   a waterfall error (including the 'complete' marker) is only logged
 */
const getMultiList = async category => {
    // one waterfall step per batch, starting with an empty offset
    const actions = [async.constant(category, '')]
    for (let i = 1; i <= pageCount; i++) {
        actions.push(get9gagList)
    }
    return new Promise(resolve => {
        async.waterfall(actions, function (err, result) {
            log.info(`finish crawler ${category} videos`, err, videoList.length)
            if (err) log.info(err)
            return resolve(videoList)
        })
    })
}

/**
 * Download one video/image to disk; one step of an async.series queue.
 *
 * Already downloaded files and records without a known type are skipped.
 * Failures are logged and the queue continues (`next` never receives an error).
 *
 * @param {string} category 9GAG group the media belongs to
 * @param {{id: string, type: string, url: string, desc: string}} media record to download
 * @param {Function} next queue callback, invoked exactly once
 */
const download = (category, media, next) => {
    if (isFileExist(media.id)) return next(null)

    let filePath
    if (media.type === 'video') {
        filePath = `${videoDlPath}/${media.id}.mp4`
    } else if (media.type === 'img') {
        filePath = `${imgDlPath}/${media.id}.jpg`
    } else return next(null)

    // Several events can report the end of one download; fire the callback once.
    let finished = false
    const done = () => {
        if (finished) return
        finished = true
        return next(null)
    }

    request(media.url)
        .on('response', function (res) {
            // Do not store an error page as media: the file would be treated
            // as "already downloaded" on the next run.
            if (res.statusCode !== 200) {
                log.error('download error', `unexpected status ${res.statusCode}`, media.url)
                res.resume()
                return done()
            }

            const fws = fs.createWriteStream(filePath)
            res.pipe(fws)

            // 'finish' fires once all data has been flushed to the file
            fws.on('finish', function () {
                log.info(`finish download ${category} ${filePath}`)
                saveJsonData(media.type, {
                    id: media.id,
                    category: category,
                    desc: media.desc
                })
                if (media.type === 'video') videoAmount++
                else imgAmount++

                return done()
            })
            fws.on('error', err => {
                log.error('download error', err)
                return done()
            })
            res.on('error', err => {
                log.error('download error', err)
                return done()
            })
        })
        // without this handler a failed connection would never call `next`
        // and the whole queue would stall
        .on('error', err => {
            log.error('download error', err)
            return done()
        })
}

/**
 * Check whether a media id has already been downloaded as video or image.
 *
 * @param {string} id post id
 * @returns {boolean} true when `<videoDlPath>/<id>.mp4` or `<imgDlPath>/<id>.jpg` exists
 */
const isFileExist = id => {
    const videoPath = `${videoDlPath}/${id}.mp4`
    if (fs.existsSync(videoPath)) {
        log.info('video file exist', videoPath)
        return true
    }
    const imgPath = `${imgDlPath}/${id}.jpg`
    if (fs.existsSync(imgPath)) {
        log.info('img file exist', imgPath)
        return true
    }
    return false
}

/**
 * Append one record to the JSON index right after a successful download, so the
 * information survives a crash in the middle of a run.
 *
 * Reads `<videoJsonPath|imgJsonPath>/data.json`, appends `data` and writes the
 * file back. Failures are logged and otherwise ignored (best effort).
 *
 * @param {string} type 'video' selects the video index, anything else the image index
 * @param {Object} data record to append
 */
const saveJsonData = (type, data) => {
    try {
        const jsonFile = `${type == 'video' ? videoJsonPath : imgJsonPath}/data.json`

        // load what has been saved so far
        let jsonData = []
        if (fs.existsSync(jsonFile)) {
            const fileData = fs.readFileSync(jsonFile, {
                encoding: 'utf8'
            })
            if (fileData) {
                jsonData = JSON.parse(fileData)
            }
        }

        jsonData.push(data)
        fs.writeFileSync(jsonFile, JSON.stringify(jsonData))
    } catch (error) {
        // include the cause; otherwise a corrupt index is impossible to diagnose
        log.error('写入json文件失败', data, error)
    }
}

/**
 * Convert a silent MP4 into a GIF with ffmpeg.
 * Demo helper: input and output paths are hard-coded.
 */
const convertVideoToGift = () => {
    const videoPath = './233.mp4'
    const command = ffmpeg(videoPath).format('gif')
    command.save('./233.gif')
}

/**
 * Select the posts worth downloading: videos with sound and plain images.
 * Silent videos (GIF-like posts) and posts without usable image data are dropped.
 *
 * @param {Array<Object>} data posts as returned by the API
 * @returns {{results: Array<Object>, video: number, img: number}} download
 *   records (videos first, then images) and the number of each kind
 */
const mediaFilter = data => {
    const videos = []
    const imgs = []

    for (let i = 0; i < data.length; i++) {
        const post = data[i]
        const images = post && post.images
        // malformed post: nothing to download
        if (!images) continue

        const clip = images.image460sv
        if (clip && clip.hasAudio && clip.url) {
            // video with sound
            videos.push({
                id: post.id,
                type: 'video',
                url: clip.url,
                desc: post.title
            })
        } else if (!clip && images.image460 && images.image460.url) {
            // image
            imgs.push({
                id: post.id,
                type: 'img',
                url: images.image460.url,
                desc: post.title
            })
        }
    }

    return {
        results: videos.concat(imgs),
        video: videos.length,
        img: imgs.length
    }
}

/**
 * Crawl one category: collect its posts, keep videos with sound and images,
 * then download them one after another.
 *
 * @param {string} category 9GAG group name
 * @param {Function} next queue callback `(err)`
 */
const task = async (category, next) => {
    const videoLists = await getMultiList(category)
    // reset the shared list for the next category
    videoList = []
    log.info('数据获取成功', videoLists.length)

    const { results: videos, video, img } = mediaFilter(videoLists)
    log.info(`${videoLists.length} 个内容,有声视频共 ${video} 个,图片共 ${img} 个`)

    const dlActions = videos.map(media => done => {
        return download(category, media, done)
    })

    async.series(dlActions, (err, result) => {
        if (err) {
            // the callback parameter is `err`; there is no `error` in this scope
            log.error(`finish【${category}】all download error`, err)
            return next(err)
        }
        log.info(`finish【${category}】all downloads success`, result.filter(item => item).length)
        return next(null)
    })
}

/**
 * Run the crawl task of every configured category in sequence.
 *
 * @returns {Promise<Array>} resolves with the async.series result; rejects with
 *   the first error a task reports
 */
const main = () => {
    const actions = category.map(item => next => {
        return task(item, next)
    })

    return new Promise((resolve, reject) => {
        async.series(actions, function (err, result) {
            // keep an existing Error (and its stack) instead of re-wrapping it
            if (err) return reject(err instanceof Error ? err : new Error(err))
            return resolve(result)
        })
    })
}

// Run the crawler, report the outcome and always exit afterwards.
main()
    .then(result => {
        log.info(`awsome! all ${result.length} tasks finish success! video: ${videoAmount} 个, img: ${imgAmount} 个`)
    })
    .catch(error => {
        log.info(`all tasks finish error! video: ${videoAmount}, img: ${imgAmount}`, error)
    })
    .then(() => {
        process.exit(0)
    })

完整代码: https://github.com/flute/9gag-crawler

Nodejs 爬虫使用 eventproxy 控制并发

use

target url:https://cnodejs.org/

cd nodejs

mkdir test && cd test

touch node.js

抓取其首页数据,共40篇文章。

  1. 首先爬取首页文章的URL,将得到的40篇文章的URL存入数组articleUrlArr
  2. 然后爬取每篇文章的详细内容。

var eventproxy = require('eventproxy');
var superagent = require('superagent');
var cheerio = require('cheerio');

var articleUrlArr = [],
    baseUrl = 'https://cnodejs.org/';

// Fetch the home page and collect the link of every topic listed on it.
superagent.get(baseUrl).end(function (err, res) {
    if (err) {
        // without a response there is nothing to parse
        console.log(err);
        return;
    }
    var $ = cheerio.load(res.text);
    // collect the links of all topics on this page
    $('#topic_list .topic_title').each(function (idx, element) {
        // hrefs are presumably site-relative (e.g. "/topic/…"); resolving them
        // against baseUrl avoids the doubled slash of plain concatenation
        var href = new URL(element.attribs.href, baseUrl).href;
        articleUrlArr.push(href);
    });
    // print the collected links
    console.log(articleUrlArr);
});

https://ludis-1252396698.cos.ap-beijing.myqcloud.com/ludis/nodejs.png
然后再分别爬取40篇文章的详细内容,即发出40个并发请求,同时爬取数据,调用eventproxy 的 #after API。

// NOTE: articleUrlArr is filled asynchronously by the home-page request, so
// this snippet has to run after that request has completed.

// One eventproxy instance collects the results of all concurrent requests.
var ep = new eventproxy();

// Register the collector before any request can emit 'task'.
ep.after('task', articleUrlArr.length, function (data) {
    // data is an array holding the pair of every ep.emit('task', pair) call
    data = data.map(function (topicPair) {
        // from here on it is plain jQuery-style usage
        var url = topicPair[0];
        var html = topicPair[1];
        var $ = cheerio.load(html);
        return ({
            title: $('.topic_full_title').text().trim(),
            href: url,
            comment1: $('.reply_content').eq(0).text().trim(),
        });
    });

    console.log('final:');
    console.log(data);
});

articleUrlArr.forEach(function (url) {
    superagent.get(url).end(function (err, res) {
        if (err) {
            console.log('fetch ' + url + ' failed', err);
            // still emit, otherwise `after` never reaches its expected count
            return ep.emit('task', [url, '']);
        }
        console.log('fetch ' + url + ' successful');
        ep.emit('task', [url, res.text]);
    });
});