coub.com 内容抓取

抓取说明

1、总共17个分类。

2、数据获取

  • url:https://coub.com/api/v2/timeline/hot/movies/half?per_page=25
  • 说明:movies 为分类。 per_page 为每页返回的数据量[1,25]。首次获取只需传入 page=1 即为第一页的数据。下次请求附带字段 anchor 为上次请求返回的 next 参数即可。

3、每个资源的属性:

  • 唯一标志: id、permalink
  • 资源描述: titile

4、下载

coub.com的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用FFmpeg合并。
下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl

5、分类数组

["animals-pets", "mashup", "anime", "movies", "gaming", "cartoons", "art", "music", "sports", "science-technology", "celebrity", "nature-travel", "fashion", "dance", "cars", "nsfw"]

核心代码:

/**
 * 获取视频列表,每次请求返回10个视频
 * @param {number} page 请求的页数
 * @param {number} anchor 保证数据的不重复性
 */
function getCoubVideoList(c, page = 1, anchor, next) {
    if (!c) {
        log.error('category empty', c)
        return next(new Error('category empty'), null)
    }
    var options = {
        method: 'GET',
        url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
        //url: `https://coub.com/api/v2/timeline/hot/${c}/half`,
        qs: {
            page: page,
            per_page: per_page
        }
    };
    if (anchor) options.qs.anchor = anchor

request(options, function (error, response, body) {
    if (error) {
        next(error, null)
        return
    }
    let data = JSON.parse(body)

    if (data && data.coubs && data.coubs.length) {
        log.info(`获取视频列表成功 page ${page}`, data.next, data.coubs.length)
        //videoList.push(data.data)
        videoList = videoList.concat(data.coubs)
        return next(null, c, ++data.page, data.next)
    } else {
        log.info('获取内容为空 page ${page}')
        return next(null, c, ++data.page, data.next)
    }
});

}

/**

  • 获取指定分类的总页数
  • /
    const getTotalPage = (c) => {
    var options = {
      method: 'GET',
      url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
      //url: `https://coub.com/api/v2/timeline/hot/${c}/half`,
      qs: {
          page: 1,
          per_page: per_page
      }
    
    };
    return new Promise((resolve, reject) => {
      request(options, function (error, response, body) {
          if (error) return reject(new Error(error))
          let data = JSON.parse(body)
          if (data && data.total_pages) {
              log.info(`获取${c}总页数成功`, data.total_pages)
              return resolve(data.total_pages)
          } else {
              log.info(`获取${c}总页数失败`)
              return reject(new Error('页数为空'))
          }
      });
    
    })

}

/**

  • 获取多页的视频
  • /
    const getMultiVideo = async c => {
    // 总页数
    let totalPage = await getTotalPage(c)
    // 每页依次队列获取
    let actions = [async.constant(c, startPage, startAnchor)]
    for (let i = 1; i <= totalPage; i++) {
      actions.push(getCoubVideoList)
    
    }
    return new Promise((resolve, reject) => {
      async.waterfall(actions, function (err, result) {
          log.info(`finish crawler ${c} videos`, err, videoList.length)
          if (err) return reject(new Error(err))
          return resolve(videoList)
      })
    
    })
    }

/**

  • 根据视频的permalink下载视频

  • @param {string} id video permalink

  • /
    async function downloadFile(c, video, next) {
    if (!video || !video.permalink) return next(null, ‘’)
    let id = video.permalink

    let filename = ${dlPath}/${id}.mp4

    let isExist = isFileExist(id)
    // 文件已存在
    if (isExist) {

      return next(null, filename)
    

    }

    // 下载操作
    const coub = await Coub.fetch(http://coub.com/view/${id}).catch(error => {

      console.log('fetch error', error)
      return next(null, '')
    

    })
    if (!coub) return next(null, ‘’)
    coub.attachAudio()
    if (fastMode) coub.addOption(‘-c’, ‘copy’)
    coub.addOption(‘-shortest’)
    let ts = new Date()
    coub.write(filename)

      .then(result =&gt; {
          let te = new Date()
          let tu = (te - ts) / 1000
          log.info(`${downloadCount}:finish download ${c} ${id}.mp4`, filename, `用时${tu}s`)
          downloadCount++
          // 视频信息
          let videoInfo = {
              desc: video.title,
              category: c,
              filename: `${id}.mp4`
          }
          // 实时写入json
          saveJsonData(videoInfo)
    
          dlFilesJson.push(videoInfo)
          return next(null, result)
          //return resolve(result)
      })
      .catch(error =&gt; {
          log.error(`download error ${id}.mp4`, error)
          return next(error, '')
          //return reject(error)
      })
    

    }

/**

  • 视频是否已下载

  • /
    const isFileExist = id => {
    let oldPath = path.resolve(__dirname, ./src/video/${id}.mp4);
    let newPath = path.resolve(__dirname, ./downloads/video/${id}.mp4);
    let weeklyPath = path.resolve(__dirname, ./weekly/video/${id}.mp4);
    let monthlyPath = path.resolve(__dirname, ./monthly/video/${id}.mp4);
    let quarterPath = path.resolve(__dirname, ./quarter/video/${id}.mp4);
    let halfPath = path.resolve(__dirname, ./half/video/${id}.mp4);

    if (fs.existsSync(oldPath)) {

      log.info('file exist', oldPath)
      return true
    

    } else if (fs.existsSync(newPath)) {

      log.info('file exist', newPath)
      return true
    

    } else if(fs.existsSync(weeklyPath)){

      log.info('file exist', weeklyPath)
      return true
    

    } else if(fs.existsSync(monthlyPath)){

      log.info('file exist', monthlyPath)
      return true
    

    } else if(fs.existsSync(quarterPath)){

      log.info('file exist', quarterPath)
      return true
    

    } else if(fs.existsSync(halfPath)){

      log.info('file exist', halfPath)
      return true
    

    } else return false
    }

/**

  • 视频下载成功后,实时更新json数据。防止程序中途奔溃后视频信息未保存

  • @param {*} data

  • /
    const saveJsonData = data => {
    try {

      // 读取已有json信息
      let jsonFile = `${jsonPath}/all.json`
    
      let jsonData = []
      if (fs.existsSync(jsonFile)) {
          fileData = fs.readFileSync(jsonFile, {
              encoding: 'utf8'
          })
          if (fileData) {
              jsonData = JSON.parse(fileData)
          }
      }
      // 写入
      jsonData.push(data)
      fs.writeFileSync(jsonFile, JSON.stringify(jsonData));
    

    } catch (error) {

      log.error('写入json文件失败', data)
    

    }

}

/**

  • 使用-C模式,将视频与音频快速合并,速度快,但问题视频较多,视频声音不正常。

  • 使用非-C模式,速度较慢,且由于合并时占用cpu较大,多个视频合并任务同时进行时,电脑基本会卡死

  • 最终采用非-C模式,保证每个视频的音频正常。同时为保证电脑不死机,以队列模式依次处理。唯一缺陷是耗时。

  • /
    async function doDownload(c) {
    let result = await getMultiVideo(c)
    videoList = []
    let data = []
    result.forEach(item => data = data.concat(item))
    log.info(要抓取的 ${c} 类型的视频总数为 ${data.length} 个)

    let actions = data.map(video => next => {

      downloadFile(c, video, next)
    

    })

    return new Promise((resolve, reject) => {

      let st = new Date()
      async.series(actions, function (err, result) {
          let et = new Date()
          let ut = timeUsed((et - st) / 1000)
          log.info(`finish download ${c} video, 耗时 ${ut}`, err, result.length)
    
          if (err) return reject(new Error(err))
          // 每个分类的json
          fs.writeFileSync(`${jsonPath}/${c}.json`, JSON.stringify(dlFilesJson));
          dlFilesJson = []
          downloadCount = 1
          return resolve(result)
      })
    

    })

}

async function main() {

let animals_pets = await doDownload('animals-pets')
let mashup = await doDownload('mashup')
let anime = await doDownload('anime')
let movies = await doDownload('movies')
let gaming = await doDownload('gaming')
let cartoons = await doDownload('cartoons')
let art = await doDownload('art')
let music = await doDownload('music')
let news  = await doDownload('news')
let sports = await doDownload('sports')
let science_technology = await doDownload('science-technology')
let celebrity = await doDownload('celebrity')
let nature_travel = await doDownload('nature-travel')
let fashion = await doDownload('fashion')
let dance = await doDownload('dance')
let cars = await doDownload('cars')
let nsfw = await doDownload('nsfw')

return true

}

/**

  • 用时显示
  • /
    const timeUsed = t => {
    // [1s, 1m)
    if (t < 60) return ${Math.round(t)}s
    // [1m, 1h)
    else if (t >= 60 && t < 60 * 60) return ${Math.floor(t/60)}m${Math.floor(t%60)}s
    // [1h, 1d)
    else if (t >= 60 * 60 && t < 60 * 60 * 24) return ${Math.floor(t/(60*60))}h${Math.floor(t%(60*60)/60)}m
    // [1d, ~)
    else return ${ Math.floor(t/(24*60*60)) }d ${ Math.floor( t%(24*60*60)/(60*60) ) }h
    }

main()
.then(result => {
let endTime = new Date()
let usedTime = timeUsed((endTime - startTime) / 1000)
log.info(all downloads finish,${result} 个视频,共耗时 ${usedTime}, )
})
.catch(error => {
log.error(‘download error’, error)
})
.then(() => {
process.exit(0)
})

process.on(‘uncaughtException’, err => {
log.info(err)
log.info(JSON.stringify(dlFilesJson))
})

完整代码: https://github.com/flute/coub-crawler

Author

Ludis

Posted on

2018-09-26

Updated on

2019-01-14

Licensed under

Comments