coub.com 内容抓取
抓取说明
1、总共17个分类。
2、数据获取
- url:
https://coub.com/api/v2/timeline/hot/movies/half?per_page=25
- 说明:
`movies` 为分类;`per_page` 为每页返回的数据量,取值范围 [1,25]。首次获取只需传入 `page=1`,即为第一页的数据。之后的请求附带字段 `anchor`,其值为上一次请求返回的 `next` 参数即可。
3、每个资源的属性:
- 唯一标志: id、permalink
- 资源描述: title
4、下载
coub.com的音频和视频是分开的,下载的时候需要将音视频分别下载,然后使用FFmpeg合并。
下载及合并使用开源项目 https://github.com/TeeSeal/coub-dl
5、分类数组
["animals-pets", "mashup", "anime", "movies", "gaming", "cartoons", "art", "music", "news", "sports", "science-technology", "celebrity", "nature-travel", "fashion", "dance", "cars", "nsfw"]
核心代码:
/**
 * Fetch one page of the "hot" timeline for a category and append the returned
 * coubs to the shared `videoList`.
 *
 * The function is shaped as an `async.waterfall` step: on success it calls
 * `next(null, c, nextPage, nextAnchor)` so the following step can request the
 * next page with the same signature.
 *
 * @param {string} c category slug inserted into the request URL
 * @param {number} [page=1] page number to request
 * @param {*} anchor value of `next` from the previous response; sent as the
 *   `anchor` query parameter when truthy
 * @param {Function} next node-style callback `(err, c, nextPage, nextAnchor)`
 */
function getCoubVideoList(c, page = 1, anchor, next) {
  if (!c) {
    log.error('category empty', c);
    return next(new Error('category empty'), null);
  }

  const options = {
    method: 'GET',
    url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
    qs: { page, per_page },
  };
  if (anchor) options.qs.anchor = anchor;

  request(options, (error, response, body) => {
    if (error) return next(error, null);

    // A non-JSON body must reach the callback as an error instead of throwing
    // inside the request callback, where nothing could catch it.
    let data;
    try {
      data = JSON.parse(body);
    } catch (parseError) {
      log.error(`parse video list failed page ${page}`, parseError);
      return next(parseError, null);
    }

    const coubs = data && Array.isArray(data.coubs) ? data.coubs : [];
    // Prefer the page number echoed by the API; fall back to the requested
    // page when the payload is null or carries no numeric `page`.
    const nextPage = data && Number.isFinite(data.page) ? data.page + 1 : page + 1;
    const nextAnchor = data ? data.next : undefined;

    if (coubs.length) {
      log.info(`获取视频列表成功 page ${page}`, nextAnchor, coubs.length);
      videoList = videoList.concat(coubs);
    } else {
      log.info(`获取内容为空 page ${page}`);
    }
    return next(null, c, nextPage, nextAnchor);
  });
}
/**
 * Get the total number of pages for a category.
 *
 * Requests page 1 of the category's "hot" timeline and reads `total_pages`
 * from the JSON response.
 *
 * @param {string} c category slug inserted into the request URL
 * @returns {Promise<number>} resolves with `total_pages`; rejects with an
 *   `Error` when the request fails, the body is not valid JSON, or
 *   `total_pages` is missing or falsy
 */
const getTotalPage = (c) => {
  const options = {
    method: 'GET',
    url: `https://coub.com/api/v2/timeline/hot/${c}/${time}`,
    qs: { page: 1, per_page },
  };

  return new Promise((resolve, reject) => {
    request(options, (error, response, body) => {
      if (error) {
        // Keep the original Error (and its stack) when there is one.
        return reject(error instanceof Error ? error : new Error(error));
      }

      // Parse inside try/catch: a throw here would escape the promise and
      // leave it pending forever.
      let data;
      try {
        data = JSON.parse(body);
      } catch (parseError) {
        log.info(`获取${c}总页数失败`);
        return reject(parseError);
      }

      if (data && data.total_pages) {
        log.info(`获取${c}总页数成功`, data.total_pages);
        return resolve(data.total_pages);
      }
      log.info(`获取${c}总页数失败`);
      return reject(new Error('页数为空'));
    });
  });
};
/**
 * Fetch every page of a category, one page after another.
 *
 * Looks up the total page count, then runs an `async.waterfall` whose first
 * step seeds `(c, startPage, startAnchor)` and whose remaining steps are
 * `getCoubVideoList`, each receiving the page/anchor produced by the previous
 * one.
 *
 * @param {string} c category slug
 * @returns {Promise<Array>} resolves with the shared `videoList` once all
 *   steps have run; rejects with an `Error` if any step fails
 */
const getMultiVideo = async c => {
  // Total number of pages for this category.
  const totalPage = await getTotalPage(c);

  // One waterfall step per page so the requests run strictly in sequence.
  // NOTE(review): totalPage steps are queued regardless of startPage, so a
  // startPage above 1 would request pages past the end — confirm intent.
  const actions = [async.constant(c, startPage, startAnchor)];
  for (let i = 1; i <= totalPage; i++) {
    actions.push(getCoubVideoList);
  }

  return new Promise((resolve, reject) => {
    async.waterfall(actions, (err, result) => {
      log.info(`finish crawler ${c} videos`, err, videoList.length);
      if (err) return reject(err instanceof Error ? err : new Error(err));
      return resolve(videoList);
    });
  });
};

// Next: downloadFile — downloads a single video identified by its permalink.
/**
 * Download one coub (video with audio attached) to `${dlPath}/<permalink>.mp4`.
 *
 * `next` is called exactly once per invocation:
 *  - `next(null, '')` when the video has no permalink or fetching it fails,
 *  - `next(null, filename)` when the file was already downloaded,
 *  - `next(null, result)` with the value resolved by `coub.write` on success,
 *  - `next(error, '')` when attaching audio or writing the file fails.
 *
 * On success the video's info is also persisted via `saveJsonData` and pushed
 * onto `dlFilesJson`, and `downloadCount` is incremented.
 *
 * @param {string} c category the video belongs to (stored with the video info)
 * @param {{permalink: string, title: string}} video entry from the video list
 * @param {Function} next node-style callback `(err, result)`
 */
async function downloadFile(c, video, next) {
  if (!video || !video.permalink) return next(null, '');

  const id = video.permalink;
  const filename = `${dlPath}/${id}.mp4`;

  // Already downloaded: skip the network and the merge entirely.
  if (isFileExist(id)) return next(null, filename);

  // A fetch failure is tolerated: the video is skipped. Using try/catch keeps
  // this to a single next() call — reporting from both a .catch handler and
  // the `!coub` check would invoke the callback twice.
  let coub;
  try {
    coub = await Coub.fetch(`http://coub.com/view/${id}`);
  } catch (error) {
    log.error('fetch error', error);
  }
  if (!coub) return next(null, '');

  let result;
  try {
    coub.attachAudio();
    if (fastMode) coub.addOption('-c', 'copy');
    coub.addOption('-shortest');

    const ts = new Date();
    result = await coub.write(filename);
    const tu = (new Date() - ts) / 1000;
    log.info(`${downloadCount}:finish download ${c} ${id}.mp4`, filename, `用时${tu}s`);
    downloadCount++;

    // Persist the video info right away, then keep it for the per-category file.
    const videoInfo = {
      desc: video.title,
      category: c,
      filename: `${id}.mp4`,
    };
    saveJsonData(videoInfo);
    dlFilesJson.push(videoInfo);
  } catch (error) {
    log.error(`download error ${id}.mp4`, error);
    return next(error, '');
  }

  // Called outside the try block so an exception thrown by the callback
  // itself cannot trigger a second next() from the catch branch.
  return next(null, result);
}
/**
 * Check whether a video has already been downloaded.
 *
 * Looks for `<dir>/video/<id>.mp4` (resolved against `__dirname`) in each of
 * the known output directories, in order, and logs the first match.
 *
 * NOTE(review): `downloadFile` writes to `${dlPath}`; whether `dlPath` is one
 * of the directories checked here cannot be told from this function — confirm.
 *
 * @param {string} id video permalink, used as the file name
 * @returns {boolean} `true` if the file exists in any of the directories
 */
const isFileExist = id => {
  const videoDirs = ['src', 'downloads', 'weekly', 'monthly', 'quarter', 'half'];

  const found = videoDirs
    .map(dir => path.resolve(__dirname, `./${dir}/video/${id}.mp4`))
    .find(candidate => fs.existsSync(candidate));

  if (!found) return false;

  log.info('file exist', found);
  return true;
};

// Next: saveJsonData — after each successful download the JSON data is updated
// immediately so video info is not lost if the program crashes midway.
/**
 * Append one video-info record to `${jsonPath}/all.json`.
 *
 * Intended to be called right after each successful download so the record
 * survives a crash later in the run. Failures are logged and never thrown:
 * a bookkeeping problem must not abort the download queue.
 *
 * @param {*} data record to append (must be JSON-serialisable)
 */
const saveJsonData = data => {
  try {
    const jsonFile = `${jsonPath}/all.json`;

    // Start from the records already on disk, if any.
    let jsonData = [];
    if (fs.existsSync(jsonFile)) {
      const fileData = fs.readFileSync(jsonFile, { encoding: 'utf8' });
      if (fileData) {
        jsonData = JSON.parse(fileData);
      }
    }

    jsonData.push(data);
    fs.writeFileSync(jsonFile, JSON.stringify(jsonData));
  } catch (error) {
    log.error('写入json文件失败', data, error);
  }
};
/**
 * Crawl one category and download all of its videos, one at a time.
 *
 * Background, per the original author's notes: merging audio and video with
 * `-c copy` is fast but leaves many videos with broken audio; merging without
 * it is slower and CPU-heavy, and several concurrent merges can freeze the
 * machine. The non-copy mode was therefore chosen and downloads run strictly
 * in sequence (`async.series`), at the cost of total run time.
 *
 * Side effects: resets the shared `videoList`; on success writes
 * `${jsonPath}/<category>.json` from `dlFilesJson`, then resets `dlFilesJson`
 * to `[]` and `downloadCount` to `1`.
 *
 * @param {string} c category slug
 * @returns {Promise<Array>} resolves with the per-video results collected by
 *   `async.series`; rejects with an `Error` if a download step reports an
 *   error or the category JSON cannot be written
 */
async function doDownload(c) {
  const result = await getMultiVideo(c);
  videoList = [];

  // Entries may themselves be arrays; flatten one level, as concat would.
  const data = result.flat();
  log.info(`要抓取的 ${c} 类型的视频总数为 ${data.length} 个`);

  const actions = data.map(video => next => {
    downloadFile(c, video, next);
  });

  return new Promise((resolve, reject) => {
    const st = new Date();
    async.series(actions, (err, result) => {
      const ut = timeUsed((new Date() - st) / 1000);
      log.info(`finish download ${c} video, 耗时 ${ut}`, err, result ? result.length : 0);
      if (err) return reject(err instanceof Error ? err : new Error(err));

      // Per-category JSON. A write failure must reject the promise rather
      // than throw inside the callback, where nothing would catch it.
      try {
        fs.writeFileSync(`${jsonPath}/${c}.json`, JSON.stringify(dlFilesJson));
      } catch (writeError) {
        return reject(writeError);
      }
      dlFilesJson = [];
      downloadCount = 1;
      return resolve(result);
    });
  });
}
/**
 * Download every category, one after another.
 *
 * Categories are processed sequentially in the order listed; the first
 * rejected `doDownload` aborts the run and rejects the returned promise.
 *
 * @returns {Promise<boolean>} resolves with `true` once all categories finish
 */
async function main() {
  const categories = [
    'animals-pets',
    'mashup',
    'anime',
    'movies',
    'gaming',
    'cartoons',
    'art',
    'music',
    'news',
    'sports',
    'science-technology',
    'celebrity',
    'nature-travel',
    'fashion',
    'dance',
    'cars',
    'nsfw',
  ];

  for (const category of categories) {
    await doDownload(category);
  }
  return true;
}
/**
 * Format a duration in seconds as a short human-readable string.
 *
 *   under one minute  -> "<s>s"       (seconds rounded to nearest)
 *   under one hour    -> "<m>m<s>s"
 *   under one day     -> "<h>h<m>m"
 *   one day or more   -> "<d>d <h>h"
 *
 * @param {number} t elapsed time in seconds
 * @returns {string} formatted duration
 */
const timeUsed = t => {
  const MINUTE = 60;
  const HOUR = 60 * MINUTE;
  const DAY = 24 * HOUR;

  // [0s, 1m)
  if (t < MINUTE) {
    const seconds = Math.round(t);
    // Values in [59.5, 60) round up to 60; report them as a full minute
    // instead of the out-of-range "60s".
    return seconds < MINUTE ? `${seconds}s` : '1m0s';
  }
  // [1m, 1h)
  if (t < HOUR) {
    return `${Math.floor(t / MINUTE)}m${Math.floor(t % MINUTE)}s`;
  }
  // [1h, 1d)
  if (t < DAY) {
    return `${Math.floor(t / HOUR)}h${Math.floor((t % HOUR) / MINUTE)}m`;
  }
  // [1d, ~)
  return `${Math.floor(t / DAY)}d ${Math.floor((t % DAY) / HOUR)}h`;
};

// Script entry point: start the crawl; the promise chain that follows reports
// the outcome.
main()
.then(result => {
let endTime = new Date()
let usedTime = timeUsed((endTime - startTime) / 1000)
log.info(all downloads finish,${result} 个视频,共耗时 ${usedTime}
, )
})
.catch(error => {
log.error(‘download error’, error)
})
.then(() => {
process.exit(0)
})
process.on(‘uncaughtException’, err => {
log.info(err)
log.info(JSON.stringify(dlFilesJson))
})
coub.com 内容抓取