instagram 内容抓取
抓取说明
1、需要登录信息,即抓取时需要附带 `cookie`,同时需要 `user-agent`。
2、数据获取接口及下载均有频率限制,无间隔的请求(几百个资源)会被限制,在被限制后睡眠一定时间继续。
3、内容抓取分为两个入口
- 一个是抓取某个用户发布的所有资源
- 一个是抓取某个tag下的所有资源
两种入口附带的cookie不同,请求的URL不同。
4、抓取步骤:
- 电脑端登录ins,保存 `cookie`、`query_hash`、`user-agent` 信息。后续所有请求附带 `cookie` 及 `user-agent`。
- 模拟请求个人主页/tag主页,通过解析HTML页面,得到userId/tag name。同时拿到第一页的数据及下页cursor。
- 通过API接口,根据 `cursor` 持续获取多页数据。所有数据获取完毕后开始下载。
- 返回的数据中,图片资源可以直接下载。视频资源需要再次请求视频地址获取接口获得视频地址,然后再下载。
5、请求数据接口:
user:
https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D
tag:
https://www.instagram.com/graphql/query/?query_hash=1780c1b186e2c37de9f7da95ce41bb67&variables=%7B%22tag_name%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D
获取视频的地址:
https://www.instagram.com/p/%s/?__a=1
核心代码
/**
 * Fetch the home page of the given user (or the tag page for a tag target)
 * and extract everything needed to start paging.
 *
 * The HTML is scanned for the user id / tag name and for the
 * `window._sharedData` JSON blob. The first page of posts found in that blob
 * is handed to `storeMedia`, and the paging information is returned once all
 * of those posts have been stored.
 *
 * Side effect: overwrites `headers.cookie` on the shared `headers` object with
 * `userCookie` or `tagCookie`, depending on `item.type`.
 *
 * @param {{name: string, type: string}} item - Crawl target; `type` is
 *   `'user'` for a profile, anything else is treated as a tag.
 * @returns {Promise<{totalPage: number, userId: string, cursor: string}>}
 *   `userId` holds the tag name for tag targets. Rejects with the request
 *   error, or with a parsing error when the page lacks the expected data.
 */
const getHtml = item => {
  const userName = item.name
  const type = item.type
  const isUser = type === 'user'

  let url
  if (isUser) {
    url = `${baseUrl}${userName}/`
    headers.cookie = userCookie
  } else {
    url = `${baseUrl}explore/tags/${userName}/`
    headers.cookie = tagCookie
  }
  const options = { method: 'GET', url: url, headers: headers }

  return new Promise((resolve, reject) => {
    request(options, function (error, response, body) {
      if (error) return reject(error)

      let userId
      let firstPage
      try {
        const html = cheerio.load(body).html()

        // Extract the uid / tag name.
        const idMatch = isUser
          ? html.match(/"profilePage_([0-9]+)"/)
          : html.match(/"name":"([a-zA-Z_]+)",/)
        if (!idMatch) throw new Error(`${userName}: id/name not found in page`)
        userId = idMatch[1]
        log.info(`${userName} id/name 获取成功 ${userId}`)

        // Extract the first page of data embedded in the HTML.
        const sharedMatch = html.match(/<script type="text\/javascript">window._sharedData = (.*?);<\/script>/)
        if (!sharedMatch) throw new Error(`${userName}: window._sharedData not found in page`)
        const data = JSON.parse(sharedMatch[1])

        firstPage = isUser
          ? data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media
          : data.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media
        if (!firstPage || !Array.isArray(firstPage.edges) || !firstPage.page_info) {
          throw new Error(`${userName}: first page data has an unexpected shape`)
        }
      } catch (parseError) {
        // An exception thrown inside the request callback cannot be caught by
        // the caller, so parsing failures are turned into a rejection instead.
        return reject(parseError)
      }

      const totalPage = Math.ceil(firstPage.count / purePage)
      const cursor = firstPage.page_info.end_cursor

      // Store the first page. storeMedia may still be resolving video URLs,
      // so wait for every item before reporting the paging information.
      const pending = firstPage.edges.map(edge => {
        edge.mode = type
        return Promise.resolve(storeMedia(edge)).catch(storeError => {
          log.error('store media error', storeError)
        })
      })

      // Return the paging information.
      Promise.all(pending).then(() => resolve({ totalPage: totalPage, userId: userId, cursor: cursor }))
    })
  })
}
/**
 * Fetch all remaining content of this user/tag.
 *
 * Builds an `async.waterfall` chain that starts from `(item, uid, cursor)` and
 * runs `fetchData` once per page, up to `pageLimit` pages. Whatever way the
 * chain ends, the promise resolves with the module-level `media` list, and
 * `fetchPageCount` is reset to 0.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {number} totalPage - Number of pages reported for the target.
 * @param {string} uid - User id (or tag name) to query.
 * @param {string} cursor - Cursor of the page following the first one.
 * @returns {Promise<Array>} The accumulated `media` array.
 */
const getAllUrls = (item, totalPage, uid, cursor) => {
  const userName = item.name
  const actions = [async.constant(item, uid, cursor)]

  // Without a cursor there is no further page, so no request is queued.
  const limit = cursor ? Math.min(totalPage, pageLimit) : 0
  for (let i = 0; i < limit; i++) {
    actions.push(fetchData)
  }
  log.info(`${userName} 数据共 ${totalPage} 页`)

  return new Promise(resolve => {
    // fetchData ends the chain early by passing a value as `error` (also for
    // the normal "no next page" case), so the error is not treated as fatal.
    async.waterfall(actions, (error, result) => {
      log.info(`${userName} 的所有帖子数据获取成功,共${media.length}个帖子,视频${videoCount}个,图片${imgCount}个`)
      fetchPageCount = 0
      return resolve(media)
    })
  })
}
/**
 * Request one page of data and store its posts.
 *
 * Written as an `async.waterfall` step: it receives `(item, uid, offset)` from
 * the previous step and forwards `(item, uid, nextOffset)` to the next one.
 *
 * - Request error or a `status == 'fail'` payload: wait one minute, then pass
 *   the same offset on so the following step requests this page again.
 * - Body that is not JSON: pass the same offset on immediately.
 * - Payload without the expected list: stop the chain with the error.
 * - Last page: stop the chain with the message `'所有数据获取完毕,无下页'`.
 * - Otherwise: wait two seconds and pass the new cursor on.
 *
 * Side effect: overwrites `headers.cookie` on the shared `headers` object and
 * increments `fetchPageCount`.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {string} uid - User id (or tag name) to query.
 * @param {string} offset - Cursor of the page to request.
 * @param {Function} next - Waterfall callback `(error, item, uid, offset)`.
 */
const fetchData = (item, uid, offset, next) => {
  const userName = item.name
  const type = item.type
  const isUser = type === 'user'

  let url
  if (isUser) {
    url = util.format(fetchUserUrl, uid, offset)
    headers.cookie = userCookie
  } else {
    url = util.format(fetchTagUrl, uid, offset)
    headers.cookie = tagCookie
  }
  const options = { method: 'GET', url: url, headers: headers }

  // Pause for a minute, then hand the same offset to the next step.
  const retryAfterPause = () => {
    log.info('休息1min~')
    return setTimeout(function () {
      return next(null, item, uid, offset)
    }, 1 * 60 * 1000)
  }

  request(options, function (error, response, body) {
    if (error) {
      log.error('fetch data error', error)
      return retryAfterPause()
    }

    let data
    try {
      data = JSON.parse(body)
    } catch (parseError) {
      log.error('json序列化失败', parseError)
      return next(null, item, uid, offset)
    }

    if (data && data.status == 'fail') {
      log.error('返回内容失败', data)
      return retryAfterPause()
    }

    let edges
    let pageInfo
    try {
      const listData = isUser
        ? data.data.user.edge_owner_to_timeline_media
        : data.data.hashtag.edge_hashtag_to_media
      edges = listData.edges
      pageInfo = listData.page_info
      if (!Array.isArray(edges) || !pageInfo) throw new Error('unexpected response shape')
    } catch (shapeError) {
      log.error('数据获取失败', shapeError)
      // Must return here: continuing would dereference the missing list and
      // invoke `next` a second time.
      return next(shapeError)
    }

    // storeMedia may still be resolving video URLs; wait for the whole page so
    // no item is added after the caller has collected the results.
    const pending = edges.map(edge => {
      edge.mode = type
      return Promise.resolve(storeMedia(edge)).catch(storeError => {
        log.error('store media error', storeError)
      })
    })

    Promise.all(pending).then(() => {
      const { has_next_page, end_cursor } = pageInfo
      log.info(`page:${++fetchPageCount} ${userName} 数据获取成功,帖子 ${edges.length} 个, has_next_page: ${has_next_page} ,end_cursor: ${end_cursor}`)
      if (!has_next_page) {
        // A non-null first argument is what stops the waterfall early.
        return next('所有数据获取完毕,无下页')
      }
      setTimeout(function () {
        return next(null, item, uid, end_cursor)
      }, 2000)
    })
  })
}
/**
 * Look up the download address of a video from its shortcode.
 *
 * Never rejects: any failure (request error, unparsable body, missing field)
 * is logged where applicable and reported as an empty string.
 *
 * Side effect: overwrites `headers.cookie` on the shared `headers` object with
 * `userCookie` when `mode` is `'user'`, otherwise with `tagCookie`.
 *
 * @param {string} mode - `'user'` or anything else for tag mode.
 * @param {string} shortcode - Shortcode of the post.
 * @returns {Promise<string>} The video URL, or `''` when unavailable.
 */
const fetchVideoUrl = (mode, shortcode) => {
  const url = util.format(getVideoUrl, shortcode)
  if (mode === 'user') {
    headers.cookie = userCookie
  } else {
    headers.cookie = tagCookie
  }
  const options = { method: 'GET', url: url, headers: headers }

  return new Promise(resolve => {
    request(options, function (error, response, body) {
      if (error) {
        log.error(`获取 ${shortcode} 视频地址失败`, error)
        return resolve('')
      }

      let videoUrl = ''
      try {
        const data = JSON.parse(body)
        // Fall back to '' so callers always receive a string.
        videoUrl = data.graphql.shortcode_media.video_url || ''
      } catch (parseError) {
        log.error(`获取 ${shortcode} videoUrl 为空`)
      }
      return resolve(videoUrl)
    })
  })
}
/**
 * Store one post according to its type.
 *
 * Builds a `{id, desc, type, url}` record and appends it to the module-level
 * `media` list, incrementing `videoCount` or `imgCount` when a URL was found.
 * For videos the URL is taken from `node.video_url` when present; otherwise it
 * is looked up through `fetchVideoUrl`. A record without `type`/`url` is still
 * appended when no URL could be determined.
 *
 * @param {{mode: string, node: Object}} item - Post edge; `mode` is forwarded
 *   to `fetchVideoUrl`.
 * @returns {Promise<void>}
 */
const storeMedia = async item => {
  const node = item.node
  // A post may have no caption at all; fall back to an empty description.
  const captionEdges = (node.edge_media_to_caption && node.edge_media_to_caption.edges) || []
  const result = {
    id: node.id,
    desc: captionEdges[0] ? captionEdges[0].node.text : ''
  }

  if (node.is_video) {
    // video: use video_url when present, otherwise ask the API for it.
    let videoUrl = node.video_url
    if (!videoUrl) videoUrl = await fetchVideoUrl(item.mode, node.shortcode)
    if (videoUrl) {
      result.type = 'video'
      result.url = videoUrl
      videoCount++
    }
  } else {
    // img
    const imgUrl = node.display_url
    if (imgUrl) {
      result.type = 'img'
      result.url = imgUrl
      imgCount++
    }
  }

  media.push(result)
}

// Download a video/image.
/**
 * Download one video/image to disk.
 *
 * Skips the item when `isFileExist` reports it as already downloaded or when
 * its `type` is neither `'video'` nor `'img'`. Videos are written to
 * `${videoDlPath}/<id>.mp4`, images to `${imgDlPath}/<id>.jpg`. After a
 * successful download the record is appended through `saveJsonData` and the
 * `videoDl` / `imgDl` counter is incremented.
 *
 * `next` is always called with `null`, so one failed item does not abort the
 * surrounding series; failures are only logged. When the request itself
 * fails, the call waits one minute before continuing.
 *
 * @param {string} category - User name / tag the item belongs to.
 * @param {{id: string, type: string, url: string, desc: string}} media - Item to download.
 * @param {Function} next - Callback invoked exactly once when this item is done.
 */
const download = (category, media, next) => {
  if (isFileExist(media.id)) return next(null)

  let filePath
  if (media.type === 'video') {
    filePath = `${videoDlPath}/${media.id}.mp4`
  } else if (media.type === 'img') {
    filePath = `${imgDlPath}/${media.id}.jpg`
  } else {
    return next(null)
  }

  // Several events below can signal completion; make sure `next` runs once.
  let finished = false
  const done = () => {
    if (finished) return
    finished = true
    return next(null)
  }

  const st = new Date()
  request(media.url)
    .on('response', function (res) {
      // A non-2xx body (e.g. a rate-limit page) must not be saved as media,
      // otherwise the item would be treated as downloaded on the next run.
      if (res.statusCode < 200 || res.statusCode >= 300) {
        log.error('download error', `unexpected status ${res.statusCode}`, media.url)
        res.resume()
        return done()
      }

      // create file write stream
      const fws = fs.createWriteStream(filePath)
      fws.on('error', err => {
        log.error('download error', err)
        return done()
      })
      // setup piping
      res.pipe(fws)

      // finish
      res.on('end', function () {
        if (finished) return
        const et = new Date()
        const ut = timeUsed((et - st) / 1000)
        log.info(`${videoDl + imgDl} finish download ${category} ${filePath},用时${ut}`)
        saveJsonData(media.type, { id: media.id, category: category, desc: media.desc })
        if (media.type === 'video') videoDl++
        else imgDl++
        return done()
      })

      // error handler
      res.on('error', err => {
        log.error('download error', err)
        return done()
      })
    })
    .on('error', function (err) {
      log.error('request source failed', media.url, err)
      // Recovery takes roughly 3 minutes.
      log.info('超频啦!休息1分钟~')
      setTimeout(done, 1 * 60 * 1000)
    })
}
/**
 * Check whether the media item with this id has already been downloaded.
 *
 * Looks for `${videoDlPath}/<id>.mp4` first, then `${imgDlPath}/<id>.jpg`,
 * and logs which of the two was found.
 *
 * @param {string} id - Media id.
 * @returns {boolean} `true` when either file exists.
 */
const isFileExist = id => {
  const videoPath = `${videoDlPath}/${id}.mp4`
  const imgPath = `${imgDlPath}/${id}.jpg`

  if (fs.existsSync(videoPath)) {
    log.info('video file exist', videoPath)
    return true
  }
  if (fs.existsSync(imgPath)) {
    log.info('img file exist', imgPath)
    return true
  }
  return false
}

// After a video has been downloaded, update the JSON data right away, so that
// the video info is not lost if the program crashes midway.
/**
 * Append one record to the JSON index of downloaded media.
 *
 * The index is `${videoJsonPath}/data.json` for `type == 'video'` and
 * `${imgJsonPath}/data.json` otherwise. The file is read, the record is
 * appended to the stored array, and the whole array is written back.
 * Failures are logged and never thrown.
 *
 * @param {string} type - `'video'` selects the video index; anything else the image index.
 * @param {Object} data - Record to append.
 */
const saveJsonData = (type, data) => {
  try {
    // Read the existing JSON data, if any.
    const jsonFile = `${type == 'video' ? videoJsonPath : imgJsonPath}/data.json`
    let jsonData = []
    if (fs.existsSync(jsonFile)) {
      const fileData = fs.readFileSync(jsonFile, { encoding: 'utf8' })
      if (fileData) {
        jsonData = JSON.parse(fileData)
      }
    }
    // Write it back with the new record appended.
    jsonData.push(data)
    fs.writeFileSync(jsonFile, JSON.stringify(jsonData))
  } catch (error) {
    log.error('写入json文件失败', data, error)
  }
}
/**
 * Reset the module-level crawl state: replaces `media` with a new empty
 * array and sets the collected (`videoCount`, `imgCount`) and downloaded
 * (`videoDl`, `imgDl`) counters back to 0.
 */
const clearData = () => {
  media = []
  videoCount = 0
  imgCount = 0
  videoDl = 0
  imgDl = 0
}

// Download all resources collected for a user/tag.
/**
 * Download all resources collected for a user/tag, one after another.
 *
 * Wraps every item in a task that calls `download` and runs the tasks through
 * `async.series`. The promise always resolves (with the series result), even
 * when the series reports an error.
 *
 * @param {string} userName - User name / tag, passed to `download` as category.
 * @param {Array<Object>} data - Media records to download.
 * @returns {Promise<Array>} Result array produced by `async.series`.
 */
const downloadAll = (userName, data) => {
  const dlActions = data.map(item => next => {
    download(userName, item, next)
  })

  return new Promise(resolve => {
    async.series(dlActions, (error, result) => {
      return resolve(result)
    })
  })
}

// Format elapsed time for display.
/**
 * Format an elapsed time for display, using the two largest units.
 *
 * @param {number} t - Elapsed time in seconds.
 * @returns {string} `"<s>s"` below a minute (rounded up), `"<m>m<s>s"` below
 *   an hour, `"<h>h<m>m"` below a day, otherwise `"<d>d <h>h"`.
 */
const timeUsed = t => {
  const MINUTE = 60
  const HOUR = 60 * MINUTE
  const DAY = 24 * HOUR

  // [1s, 1m)
  if (t < MINUTE) return `${Math.ceil(t)}s`
  // [1m, 1h)
  if (t < HOUR) return `${Math.floor(t / MINUTE)}m${Math.floor(t % MINUTE)}s`
  // [1h, 1d)
  if (t < DAY) return `${Math.floor(t / HOUR)}h${Math.floor((t % HOUR) / MINUTE)}m`
  // [1d, ~)
  return `${Math.floor(t / DAY)}d ${Math.floor((t % DAY) / HOUR)}h`
}

// Crawl task for one user/tag.
/**
 * Crawl task for one user/tag: fetch the entry page, collect all pages of
 * posts, then download every collected resource.
 *
 * When the entry page cannot be fetched or parsed, the failure is logged and
 * the task ends early; `next` is still called with `null` so the remaining
 * tasks run.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {Function} next - Callback invoked with `null` when the task is done.
 * @returns {Promise<*>} Settles after `next` has been called.
 */
const task = async (item, next) => {
  const userName = item.name

  let pageInfo
  try {
    pageInfo = await getHtml(item)
  } catch (error) {
    log.error('fetch error', error)
    // Stop here: there is no paging information to continue with.
    return next(null)
  }
  const { totalPage, userId, cursor } = pageInfo

  const data = await getAllUrls(item, totalPage, userId, cursor)
  // `data` keeps the collected list; reset the shared state before the
  // download counters are used.
  clearData()

  const st = new Date()
  await downloadAll(userName, data)
  const et = new Date()
  const ut = timeUsed((et - st) / 1000)
  log.info(`${userName} 所有下载完成, video ${videoDl} 个,img ${imgDl} 个,共用时 ${ut}`)

  clearData()
  return next(null)
}
/**
 * Entry point: runs the crawl task for every entry of `target`, one after
 * another, then logs a summary and exits the process with code 0.
 */
const main = () => {
  const actions = target.map(item => next => {
    // `task` is async; a rejection would otherwise leave the series waiting
    // forever, so log it and move on to the next target.
    task(item, next).catch(error => {
      log.error('task error', error)
      next(null)
    })
  })

  async.series(actions, (error, result) => {
    log.info(`所有 ${(result || []).length} 个任务完成`, error)
    process.exit(0)
  })
}

main()
instagram 内容抓取