instagram 内容抓取
抓取说明
1、需要登录信息,即抓取时需要附带 `cookie`,同时需要 `user-agent`。
2、数据获取接口及下载均有频率限制,无间隔的请求(几百个资源)会被限制,在被限制后睡眠一定时间继续。
3、内容抓取分为两个入口
- 一个是抓取某个用户发布的所有资源
- 一个是抓取某个tag下的所有资源
两种入口附带的cookie不同,请求的URL不同。
4、抓取步骤:
- 电脑端登录 ins,保存 `cookie`、`query_hash`、`user-agent` 信息。后续所有请求附带 `cookie` 及 `user-agent`。
- 模拟请求个人主页/tag主页,通过解析HTML页面,得到userId/tag name。同时拿到第一页的数据及下页cursor。
- 通过API接口,根据 `cursor` 持续获取多页数据。所有数据获取完毕后开始下载。
- 返回的数据中,图片资源可以直接下载。视频资源需要再次请求视频地址获取接口获得视频地址,然后再下载。
5、请求数据接口:
user:
https://www.instagram.com/graphql/query/?query_hash=a5164aed103f24b03e7b7747a2d94e3c&variables=%7B%22id%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D
tag:
https://www.instagram.com/graphql/query/?query_hash=1780c1b186e2c37de9f7da95ce41bb67&variables=%7B%22tag_name%22%3A%22%s%22%2C%22first%22%3A${purePage}%2C%22after%22%3A%22%s%22%7D
获取视频的地址:
https://www.instagram.com/p/%s/?__a=1
核心代码
/**
 * Fetch the landing page of a crawl target (a user profile or a tag page),
 * hand the first page of posts to `storeMedia`, and return the paging
 * information embedded in the page's `window._sharedData` script.
 *
 * Side effect: sets `cookie` on the shared `headers` object.
 *
 * @param {{name: string, type: string}} item - Crawl target. `type === 'user'`
 *   requests the profile page; any other value requests the tag page.
 * @returns {Promise<{totalPage: number, userId: string, cursor: string}>}
 *   Resolves after every post of the first page has been stored. Rejects when
 *   the request fails or the page cannot be parsed (e.g. a login page came back).
 */
const getHtml = item => {
  const userName = item.name
  const type = item.type
  const isUser = type === 'user'

  // The two entry points use different cookies and different URLs.
  headers.cookie = isUser ? userCookie : tagCookie
  const options = {
    method: 'GET',
    url: isUser ? `${baseUrl}${userName}/` : `${baseUrl}explore/tags/${userName}/`,
    headers: headers
  }

  // Parsing runs inside an async function so that every parsing failure
  // becomes a rejection instead of an uncaught exception thrown from the
  // request callback (which would leave the returned promise pending).
  const parseFirstPage = async body => {
    const html = cheerio.load(body).html()

    // uid for a user, tag name for a tag
    const idMatch = isUser
      ? html.match(/"profilePage_([0-9]+)"/)
      : html.match(/"name":"([a-zA-Z_]+)",/)
    if (!idMatch) throw new Error(`${userName}: id/name not found in page`)
    const userId = idMatch[1]
    log.info(`${userName} id/name 获取成功 ${userId}`)

    // First page of posts is embedded as JSON in the page itself.
    const dataMatch = html.match(/<script type="text\/javascript">window._sharedData = (.*?);<\/script>/)
    if (!dataMatch) throw new Error(`${userName}: window._sharedData not found in page`)
    const data = JSON.parse(dataMatch[1])

    const firstPage = isUser
      ? data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media
      : data.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media
    const pageInfo = {
      totalPage: Math.ceil(firstPage.count / purePage),
      userId: userId,
      cursor: firstPage.page_info.end_cursor
    }

    // storeMedia is async (it may have to look up a video URL); wait for all
    // of them so no post is stored after the caller has moved on.
    await Promise.all(firstPage.edges.map(edge => {
      edge.mode = type
      return storeMedia(edge)
    }))

    return pageInfo
  }

  return new Promise((resolve, reject) => {
    request(options, (error, response, body) => {
      if (error) return reject(error)
      parseFirstPage(body).then(resolve, reject)
    })
  })
}
/**
 * Fetch the remaining pages of posts for a target by chaining `fetchData`
 * calls in an `async.waterfall`, then resolve with the collected `media`.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {number} totalPage - Number of pages reported for the target; capped by `pageLimit`.
 * @param {string} uid - User id or tag name passed through to `fetchData`.
 * @param {string} cursor - Cursor of the first page to request.
 * @returns {Promise<Array>} Resolves with the module-level `media` array.
 *   Never rejects: the waterfall's error argument is ignored because
 *   `fetchData` also uses it to signal "no next page".
 */
const getAllUrls = (item, totalPage, uid, cursor) => {
  const userName = item.name
  const limit = totalPage > pageLimit ? pageLimit : totalPage

  // The first step only seeds the arguments received by the first fetchData.
  const actions = [async.constant(item, uid, cursor)]
  for (let i = 0; i < limit; i++) {
    actions.push(fetchData)
  }
  log.info(`${userName} 数据共 ${totalPage} 页`)

  return new Promise(resolve => {
    async.waterfall(actions, () => {
      log.info(`${userName} 的所有帖子数据获取成功,共${media.length}个帖子,视频${videoCount}个,图片${imgCount}个`)
      // Reset the page counter used by fetchData's progress log.
      fetchPageCount = 0
      return resolve(media)
    })
  })
}
/**
 * Fetch one page of posts through the GraphQL endpoint and store its posts.
 * Written as an `async.waterfall` step: it forwards `(item, uid, cursor)` to
 * the next step.
 *
 * Side effect: sets `cookie` on the shared `headers` object.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {string} uid - User id (type `user`) or tag name (otherwise).
 * @param {string} offset - Cursor of the page to request.
 * @param {Function} next - Waterfall callback. Called with
 *   `(null, item, uid, cursor)` to continue (same cursor = retry),
 *   with a string when there is no next page, or with an Error when the
 *   response does not have the expected shape.
 */
const fetchData = (item, uid, offset, next) => {
  const userName = item.name
  const type = item.type
  const isUser = type === 'user'

  headers.cookie = isUser ? userCookie : tagCookie
  const options = {
    method: 'GET',
    url: util.format(isUser ? fetchUserUrl : fetchTagUrl, uid, offset),
    headers: headers
  }

  // Sleep one minute, then let the next waterfall step retry the same cursor.
  const retryAfterPause = () => {
    log.info('休息1min~')
    setTimeout(() => next(null, item, uid, offset), 1 * 60 * 1000)
  }

  request(options, (error, response, body) => {
    if (error) {
      log.error('fetch data error', error)
      return retryAfterPause()
    }

    let data
    try {
      data = JSON.parse(body)
    } catch (parseError) {
      log.error('json序列化失败', parseError)
      return next(null, item, uid, offset)
    }

    if (data && data.status === 'fail') {
      log.error('返回内容失败', data)
      return retryAfterPause()
    }

    let listData
    try {
      listData = isUser
        ? data.data.user.edge_owner_to_timeline_media
        : data.data.hashtag.edge_hashtag_to_media
      if (!listData || !Array.isArray(listData.edges) || !listData.page_info) {
        throw new Error('unexpected response shape')
      }
    } catch (shapeError) {
      log.error('数据获取失败', shapeError)
      // Must return here: without it the code below would dereference an
      // undefined listData and throw out of the request callback.
      return next(shapeError)
    }

    const edges = listData.edges
    const { has_next_page, end_cursor } = listData.page_info

    // storeMedia is async; wait for it so that no post is pushed after the
    // waterfall has finished. A single failing post is logged and skipped.
    const pending = edges.map(edge => {
      edge.mode = type
      return Promise.resolve(storeMedia(edge)).catch(storeError => {
        log.error('store media error', storeError)
      })
    })

    Promise.all(pending).then(() => {
      log.info(`page:${++fetchPageCount} ${userName} 数据获取成功,帖子 ${edges.length} 个, has_next_page: ${has_next_page} ,end_cursor: ${end_cursor}`)
      if (!has_next_page) {
        // A non-null first argument stops the waterfall early.
        return next('所有数据获取完毕,无下页')
      }
      // Small pause between pages to stay under the rate limit.
      setTimeout(() => next(null, item, uid, end_cursor), 2000)
    })
  })
}
/**
 * Look up the download URL of a video post by its shortcode.
 *
 * Side effect: sets `cookie` on the shared `headers` object.
 *
 * @param {string} mode - `'user'` selects the user cookie, anything else the tag cookie.
 * @param {string} shortcode - Shortcode of the post.
 * @returns {Promise<string>} Resolves with the video URL, or with `''` when
 *   the request fails or the response carries no URL. Never rejects.
 */
const fetchVideoUrl = (mode, shortcode) => {
  headers.cookie = mode === 'user' ? userCookie : tagCookie
  const options = {
    method: 'GET',
    url: util.format(getVideoUrl, shortcode),
    headers: headers
  }

  return new Promise(resolve => {
    request(options, (error, response, body) => {
      if (error) {
        log.error(`获取 ${shortcode} 视频地址失败`, error)
        return resolve('')
      }
      let videoUrl = ''
      try {
        const data = JSON.parse(body)
        // Normalise a missing field to '' so callers always get a string.
        videoUrl = data.graphql.shortcode_media.video_url || ''
      } catch (parseError) {
        log.error(`获取 ${shortcode} videoUrl 为空`, parseError)
      }
      return resolve(videoUrl)
    })
  })
}
/**
 * Convert one post edge into a media record and append it to the module-level
 * `media` list, updating `videoCount` / `imgCount`.
 *
 * @param {{mode: string, node: Object}} item - Post edge; `mode` is forwarded
 *   to `fetchVideoUrl` when the video URL has to be looked up.
 * @returns {Promise<void>} Settles once the record has been pushed.
 */
const storeMedia = async item => {
  const node = item.node
  // A post without a caption has no caption edges (or no caption field at all).
  const captionEdges = (node.edge_media_to_caption && node.edge_media_to_caption.edges) || []
  const result = {
    id: node.id,
    desc: captionEdges[0] ? captionEdges[0].node.text : ''
  }

  if (node.is_video) {
    // Use video_url when the post carries it, otherwise ask the post endpoint.
    let videoUrl = node.video_url
    if (!videoUrl) videoUrl = await fetchVideoUrl(item.mode, node.shortcode)
    if (videoUrl) {
      result.type = 'video'
      result.url = videoUrl
      videoCount++
    }
  } else {
    const imgUrl = node.display_url
    if (imgUrl) {
      result.type = 'img'
      result.url = imgUrl
      imgCount++
    }
  }

  // Records without a resolved URL are still pushed; they carry no `type`.
  media.push(result)
}
/**
 * Download one video/image to disk and record it via `saveJsonData`.
 * Written as an `async.series` step; it always reports success so that one
 * failed download does not abort the whole series.
 *
 * @param {string} category - User/tag name the media belongs to (stored with the record).
 * @param {{id: string, type: string, url: string, desc: string}} media - Media record.
 * @param {Function} next - Series callback; called exactly once with `null`.
 */
const download = (category, media, next) => {
  if (isFileExist(media.id)) return next(null)

  let filePath
  if (media.type === 'video') {
    filePath = `${videoDlPath}/${media.id}.mp4`
  } else if (media.type === 'img') {
    filePath = `${imgDlPath}/${media.id}.jpg`
  } else return next(null) // record without a resolved url/type

  // Several stream events can signal the end of a download; make sure the
  // series callback only fires once.
  let finished = false
  const done = () => {
    if (finished) return
    finished = true
    next(null)
  }

  const st = new Date()
  request(media.url)
    .on('response', res => {
      // Do not store an error page (e.g. a rate-limit response) as media.
      if (res.statusCode < 200 || res.statusCode >= 300) {
        log.error('download error', new Error(`unexpected status ${res.statusCode} for ${media.url}`))
        res.resume()
        return done()
      }

      const fws = fs.createWriteStream(filePath)
      res.pipe(fws)

      // 'finish' fires once all data has been flushed to the file.
      fws.on('finish', () => {
        if (finished) return
        const et = new Date()
        const ut = timeUsed((et - st) / 1000)
        log.info(`${videoDl + imgDl} finish download ${category} ${filePath},用时${ut}`)
        saveJsonData(media.type, {
          id: media.id,
          category: category,
          desc: media.desc
        })
        if (media.type === 'video') videoDl++
        else imgDl++
        return done()
      })

      const fail = err => {
        log.error('download error', err)
        // Remove the partial file, otherwise isFileExist would treat it as
        // already downloaded on the next run.
        fs.unlink(filePath, () => {})
        return done()
      }
      res.on('error', fail)
      fws.on('error', fail)
    })
    .on('error', err => {
      log.error('request source failed', media.url, err)
      // Original note: recovery takes roughly 3 minutes.
      log.info('超频啦!休息1分钟~')
      setTimeout(done, 1 * 60 * 1000)
    })
}
/**
 * Check whether a media item has already been downloaded, either as a video
 * (`<videoDlPath>/<id>.mp4`) or as an image (`<imgDlPath>/<id>.jpg`).
 *
 * @param {string} id - Media id.
 * @returns {boolean} `true` when one of the two files exists.
 */
const isFileExist = id => {
  const videoPath = `${videoDlPath}/${id}.mp4`
  const imgPath = `${imgDlPath}/${id}.jpg`

  if (fs.existsSync(videoPath)) {
    log.info('video file exist', videoPath)
    return true
  }
  if (fs.existsSync(imgPath)) {
    log.info('img file exist', imgPath)
    return true
  }
  return false
}
/**
 * Append one record to `<videoJsonPath|imgJsonPath>/data.json` right after a
 * successful download, so the information survives a crash later in the run.
 * Failures are logged and swallowed (best effort).
 *
 * @param {string} type - `'video'` writes under `videoJsonPath`, anything else under `imgJsonPath`.
 * @param {Object} data - Record to append.
 */
const saveJsonData = (type, data) => {
  try {
    const jsonFile = `${type === 'video' ? videoJsonPath : imgJsonPath}/data.json`

    // Load what is already there (missing or empty file = empty list).
    let jsonData = []
    if (fs.existsSync(jsonFile)) {
      const fileData = fs.readFileSync(jsonFile, { encoding: 'utf8' })
      if (fileData) {
        jsonData = JSON.parse(fileData)
      }
    }

    jsonData.push(data)
    fs.writeFileSync(jsonFile, JSON.stringify(jsonData))
  } catch (error) {
    log.error('写入json文件失败', data, error)
  }
}
/**
 * Reset the module-level crawl state: the download counters, the per-type
 * post counters and the list of collected media.
 */
const clearData = () => {
  // download counters
  imgDl = videoDl = 0
  // collected-post counters
  imgCount = videoCount = 0
  // collected posts (a fresh array; earlier references keep the old one)
  media = []
}
/**
 * Download every collected media item of one user/tag, one after another
 * (`async.series`).
 *
 * @param {string} userName - User/tag name, passed to `download` as the category.
 * @param {Array} data - Media records to download.
 * @returns {Promise<Array>} Resolves with the series result once every
 *   download step has called back. Never rejects: the series error is ignored.
 */
const downloadAll = (userName, data) => {
  const dlActions = data.map(item => next => {
    download(userName, item, next)
  })

  return new Promise(resolve => {
    async.series(dlActions, (error, result) => resolve(result))
  })
}
/**
 * Format a duration for display, using the two largest units that apply.
 *
 * @param {number} t - Duration in seconds.
 * @returns {string} `"<n>s"` below one minute (rounded up), `"<m>m<s>s"`
 *   below one hour, `"<h>h<m>m"` below one day, `"<d>d <h>h"` otherwise.
 */
const timeUsed = t => {
  const MINUTE = 60
  const HOUR = 60 * MINUTE
  const DAY = 24 * HOUR

  // [0, 1m)
  if (t < MINUTE) return `${Math.ceil(t)}s`
  // [1m, 1h)
  if (t < HOUR) return `${Math.floor(t / MINUTE)}m${Math.floor(t % MINUTE)}s`
  // [1h, 1d)
  if (t < DAY) return `${Math.floor(t / HOUR)}h${Math.floor(t % HOUR / MINUTE)}m`
  // [1d, ~)
  return `${Math.floor(t / DAY)}d ${Math.floor(t % DAY / HOUR)}h`
}
/**
 * Crawl one user/tag: read the landing page, collect every page of posts,
 * then download all media. Written as an `async.series` step; failures are
 * logged and the step still reports success so the remaining targets run.
 *
 * @param {{name: string, type: string}} item - Crawl target.
 * @param {Function} next - Series callback; called exactly once with `null`.
 * @returns {Promise<*>} Settles after `next` has been called.
 */
const task = async (item, next) => {
  const userName = item.name

  let pageInfo
  try {
    pageInfo = await getHtml(item)
  } catch (error) {
    log.error('fetch error', error)
    // Drop anything collected before the failure and stop here; continuing
    // would destructure an undefined result.
    clearData()
    return next(null)
  }

  try {
    const { totalPage, userId, cursor } = pageInfo
    const data = await getAllUrls(item, totalPage, userId, cursor)
    // `data` keeps the collected list; clearData resets the counters and
    // swaps in a fresh list before the downloads start.
    clearData()

    const st = new Date()
    await downloadAll(userName, data)
    const et = new Date()
    const ut = timeUsed((et - st) / 1000)
    log.info(`${userName} 所有下载完成, video ${videoDl} 个,img ${imgDl} 个,共用时 ${ut}`)
  } catch (error) {
    log.error(`${userName} task failed`, error)
  }

  clearData()
  return next(null)
}
/**
 * Entry point: run the crawl task for every entry of `target`, one after
 * another, then exit the process.
 */
const main = () => {
  const actions = target.map(item => next => {
    // Guard the series callback so it fires once per task, whether the task
    // calls it itself or its promise rejects.
    let called = false
    const once = (...args) => {
      if (called) return
      called = true
      next(...args)
    }
    Promise.resolve(task(item, once)).catch(error => {
      log.error('task error', error)
      once(null)
    })
  })

  async.series(actions, (error, result) => {
    log.info(`所有 ${result.length} 个任务完成`, error)
    process.exit(0)
  })
}
main()
instagram 内容抓取