9GAG.com 内容抓取

抓取说明

1、总共52个分类。

2、数据获取

  • url:https://9gag.com/v1/group-posts/group/cute/type/hot?c=10
  • 说明:cute 为分类。首次获取只需传入 c=10 即为前十条数据。下次请求附带上次请求返回的 nextCursor 参数即可。每次请求返回10条数据。

3、每个资源的属性:

  • 唯一标志: id
  • 资源描述: title

4、资源分三种类型,根据images属性下的字段区分

  1. image  属性:image460    image700  
  2. gif  属性:image460    image460sv  image460svwm    image700 说明:image460sv image460svwm 两个属性下的 hasAudio 字段为0,即无声,视为GIF  
  3. video  属性:image460    image460sv  image460svwm   image700 说明:image460sv image460svwm 两个属性下的 hasAudio 字段为1,即有声,视为video  

5、内容字段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
image460: {
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
webpUrl: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460swp.webp"
width: 460
}

image460sv: {
duration: 32
h265Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svh265.mp4"
hasAudio: 1
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460sv.mp4"
vp9Url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svvp9.webm"
width: 460
}

image460svwm: {
duration: 32
hasAudio: 1
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460svwm.webm"
width: 460
}

image700: {
height: 258
url: "https://img-9gag-fun.9cache.com/photo/aq73Yrj_460s.jpg"
width: 460
}

6、分类数组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
// Section slugs to crawl. Each slug takes the place of the category segment
// in the group-posts URL (e.g. `cute`).
// NOTE(review): presumably this literal is what the script binds to the
// `category` variable that `main` iterates over - confirm against the full source.
[
'funny',
'cute',
'anime-manga',
'ask9gag',
'awesome',
'basketball',
'car',
'comic',
'cosplay',
'country',
'classicalartmemes',
'imadedis',
'drawing',
'animefanart',
'food',
'football',
'fortnite',
'gaming',
'gif',
'girl',
'girly',
'guy',
'history',
'horror',
'home',
'kpop',
'leagueoflegends',
'lego',
'movie-tv',
'music',
'overwatch',
'pcmr',
'photography',
'pokemon',
'politics',
'relationship',
'pubg',
'roastme',
'savage',
'starwars',
'satisfying',
'school',
'science',
'superhero',
'surrealmemes',
'sport',
'travel',
'timely',
'video',
'warhammer',
'wallpaper',
'wtf'
]

核心代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/**
 * Fetch one page of posts for a category and append them to the shared
 * `videoList` array.
 *
 * Shaped as an `async.waterfall` step: it receives whatever the previous step
 * passed on and hands `(category, nextOffset)` to the following one.
 *
 * @param {string} category - Section slug used in the request URL, e.g. 'cute'.
 * @param {string|number} offset - '' for the first page, the `nextCursor`
 *   value returned with the previous page, or -1 once no pages are left.
 * @param {Function} next - Node-style callback `(err, category, nextOffset)`.
 */
const get9gagList = (category, offset, next) => {
  // -1 is the sentinel handed over by a page that had nothing (more) to offer.
  // Passing a non-null first argument makes the waterfall skip the remaining steps.
  if (offset === -1) {
    return next('complete');
  }

  const options = {
    method: 'GET',
    // `type` is not defined in this function; it has to come from the enclosing scope.
    url: `https://9gag.com/v1/group-posts/group/${category}/type/${type}?`
  };
  // The first page only needs the page size; later pages append the cursor as-is.
  options.url += offset === '' ? 'c=10' : offset;

  request(options, function (error, response, body) {
    if (error) {
      return next(error, null);
    }

    // A non-JSON body (error page, truncated response) must not throw out of
    // this callback - report it through `next` like any other failure.
    let data;
    try {
      data = JSON.parse(body);
    } catch (parseError) {
      log.error(`parse ${category} list response failed offset ${offset}`, parseError);
      return next(parseError, null);
    }

    if (data && data.data && data.data.posts && data.data.posts.length) {
      log.info(`获取 ${category} 视频列表成功 offset ${offset ? offset : 'c=10'}`, data.data.posts.length);
      videoList = videoList.concat(data.data.posts);
      // Without a cursor there is no way to ask for a following page, so
      // signal completion instead of forwarding `undefined` into the next URL.
      const cursor = data.data.nextCursor;
      return next(null, category, cursor ? cursor : -1);
    }

    log.info(`获取 ${category} 内容为空 offset ${offset},所有数据获取完毕 。`);
    return next(null, category, -1);
  });
};

/**
 * Fetch up to `pageCount` consecutive pages of one category, one after another.
 *
 * The pages are chained through `async.waterfall` so that each request can use
 * the cursor produced by the previous one.
 *
 * @param {string} category - Section slug to crawl.
 * @returns {Promise<Array>} Resolves with the shared `videoList`. The promise
 *   resolves even when the waterfall reports an error, so a failed or
 *   shortened crawl still yields whatever was collected so far.
 */
const getMultiList = async category => {
  // Seed the chain with (category, '') so the first step asks for page one.
  const actions = [async.constant(category, '')];
  for (let i = 1; i <= pageCount; i++) {
    actions.push(get9gagList);
  }

  return new Promise(resolve => {
    async.waterfall(actions, err => {
      log.info(`finish crawler ${category} videos`, err, videoList.length);
      // `err` is also how get9gagList signals 'complete'; it is only logged here.
      if (err) log.info(err);
      return resolve(videoList);
    });
  });
};

/**
 * Download one video or image to disk and record its metadata.
 *
 * Failures are logged and then reported as success (`next(null)`), so a single
 * bad item does not abort the surrounding series of downloads.
 *
 * @param {string} category - Section slug the item belongs to (stored in the JSON record).
 * @param {{id: string, type: string, url: string, desc: string}} media - Item to fetch;
 *   `type` is 'video' or 'img', anything else is skipped.
 * @param {Function} next - Callback invoked exactly once, always with `null`.
 */
const download = (category, media, next) => {
  // Nothing to do for items already on disk.
  if (isFileExist(media.id)) return next(null);

  let filePath;
  if (media.type === 'video') {
    filePath = `${videoDlPath}/${media.id}.mp4`;
  } else if (media.type === 'img') {
    filePath = `${imgDlPath}/${media.id}.jpg`;
  } else {
    return next(null);
  }

  // Several events can end a transfer (request error, response error, write
  // error, finish); make sure `next` is only ever called for the first one.
  let settled = false;

  const fail = err => {
    if (settled) return;
    settled = true;
    log.error('download error', err);
    // Remove a partial file, otherwise isFileExist would treat the item as
    // downloaded on the next run. A missing file is fine, hence the ignored error.
    fs.unlink(filePath, () => next(null));
  };

  const succeed = () => {
    if (settled) return;
    settled = true;
    log.info(`finish download ${category} ${filePath}`);
    saveJsonData(media.type, {
      id: media.id,
      category: category,
      desc: media.desc
    });
    if (media.type === 'video') videoAmount++;
    else imgAmount++;
    return next(null);
  };

  request(media.url)
    // Connection-level failures are emitted on the request itself, before any response exists.
    .on('error', fail)
    .on('response', function (res) {
      if (res.statusCode !== 200) {
        // Do not store an error page under a media file name.
        res.resume();
        return fail(new Error(`unexpected status ${res.statusCode} for ${media.url}`));
      }

      const fws = fs.createWriteStream(filePath);
      fws.on('error', fail);
      // 'finish' fires once everything has been flushed to the file, which the
      // response's 'end' event does not guarantee.
      fws.on('finish', succeed);
      res.on('error', err => {
        fws.destroy();
        fail(err);
      });
      res.pipe(fws);
    });
};

/**
 * Check whether an item has already been downloaded, as either a video or an image.
 *
 * @param {string} id - Post id, used as the file name without extension.
 * @returns {boolean} true when `<videoDlPath>/<id>.mp4` or `<imgDlPath>/<id>.jpg` exists.
 */
const isFileExist = id => {
  const videoPath = `${videoDlPath}/${id}.mp4`;
  const imgPath = `${imgDlPath}/${id}.jpg`;

  if (fs.existsSync(videoPath)) {
    log.info('video file exist', videoPath);
    return true;
  }
  if (fs.existsSync(imgPath)) {
    log.info('img file exist', imgPath);
    return true;
  }
  return false;
};

/**
 * Append one record to the `data.json` file of the given media type.
 *
 * Called after every successful download so that the metadata is already on
 * disk if the program crashes part-way through a run. Failures are logged and
 * not propagated.
 *
 * @param {string} type - 'video' selects `videoJsonPath`; anything else selects `imgJsonPath`.
 * @param {Object} data - Record to append (must be JSON-serialisable).
 */
const saveJsonData = (type, data) => {
  try {
    const jsonDir = type === 'video' ? videoJsonPath : imgJsonPath;
    const jsonFile = `${jsonDir}/data.json`;

    // Start from the records already stored, if any.
    let jsonData = [];
    if (fs.existsSync(jsonFile)) {
      // Declared locally: the original assigned to an undeclared `fileData`,
      // which throws in strict mode and leaks a global otherwise.
      const fileData = fs.readFileSync(jsonFile, {
        encoding: 'utf8'
      });
      if (fileData) {
        jsonData = JSON.parse(fileData);
      }
    }

    jsonData.push(data);
    fs.writeFileSync(jsonFile, JSON.stringify(jsonData));
  } catch (error) {
    // Include the cause; the record alone does not say what went wrong.
    log.error('写入json文件失败', data, error);
  }
};

/**
 * Convert a (silent) MP4 file into a GIF.
 *
 * Builds an `ffmpeg` command for the input, sets the output format to 'gif'
 * and saves it to the target path. Nothing is returned.
 * NOTE(review): `ffmpeg` is presumably fluent-ffmpeg - confirm against the file's requires.
 *
 * @param {string} [videoPath='./233.mp4'] - Source video file.
 * @param {string} [gifPath='./233.gif'] - Destination GIF file.
 */
const convertVideoToGift = (videoPath = './233.mp4', gifPath = './233.gif') => {
  const command = ffmpeg(videoPath).format('gif');
  command.save(gifPath);
};

/**
 * Pick the downloadable items out of a list of posts.
 *
 * - A post with an `image460sv` rendition that has audio and a URL is a video.
 * - A post with an `image460sv` rendition but no audio (a GIF) is skipped.
 * - A post without `image460sv` but with an `image460` URL is an image.
 * - Posts lacking the needed fields are skipped.
 *
 * @param {Array<Object>} data - Posts as returned by the list API.
 * @returns {{results: Array<{id: *, type: string, url: string, desc: *}>, video: number, img: number}}
 *   `results` lists all videos followed by all images; `video` and `img` are the counts.
 */
const mediaFilter = data => {
  const videos = [];
  const imgs = [];

  for (const post of data) {
    // Tolerate posts without an `images` object instead of throwing on them.
    const images = (post && post.images) || {};
    const clip = images.image460sv;
    const still = images.image460;

    if (clip) {
      // Only clips with sound count as videos.
      if (clip.hasAudio && clip.url) {
        videos.push({
          id: post.id,
          type: 'video',
          url: clip.url,
          desc: post.title
        });
      }
    } else if (still && still.url) {
      imgs.push({
        id: post.id,
        type: 'img',
        url: still.url,
        desc: post.title
      });
    }
  }

  return {
    results: videos.concat(imgs),
    video: videos.length,
    img: imgs.length
  };
};

/**
 * Crawl one category: collect its posts, filter them, then download the
 * selected items one at a time.
 *
 * @param {string} category - Section slug to crawl.
 * @param {Function} next - Node-style callback, called once with an error or `null`.
 */
const task = async (category, next) => {
  let dlActions;
  try {
    const videoLists = await getMultiList(category);
    // Empty the shared accumulator so the next category starts from scratch.
    videoList = [];
    log.info('数据获取成功', videoLists.length);

    const { results, video, img } = mediaFilter(videoLists);
    log.info(`${videoLists.length} 个内容,有声视频共 ${video} 个,图片共 ${img} 个`);

    dlActions = results.map(media => done => download(category, media, done));
  } catch (err) {
    // Without this the rejection would go unhandled and `next` would never
    // be called, leaving the caller's series waiting forever.
    log.error(`crawl【${category}】failed before download`, err);
    return next(err);
  }

  async.series(dlActions, (err, result) => {
    if (err) {
      // The original referenced an undefined `error` here.
      log.error(`finish【${category}】all download error`, err);
      return next(err);
    }
    log.info(`finish【${category}】all downloads success`, result.filter(item => item).length);
    return next(null);
  });
};

/**
 * Run the crawl task for every entry of `category`, one after another.
 *
 * @returns {Promise<Array>} Resolves with the results collected by
 *   `async.series`; rejects with an Error as soon as one task reports a failure.
 */
const main = () => {
  const actions = category.map(item => next => task(item, next));

  return new Promise((resolve, reject) => {
    async.series(actions, (err, result) => {
      if (err) {
        // Keep real Error objects (and their stack) intact; wrap anything else.
        return reject(err instanceof Error ? err : new Error(err));
      }
      return resolve(result);
    });
  });
};

// Entry point: run every category, log a summary for success or failure,
// then terminate the process explicitly once the chain has settled.
main()
  .then(result => {
    log.info(`awsome! all ${result.length} tasks finish success! video: ${videoAmount} 个, img: ${imgAmount} 个`);
  })
  .catch(error => {
    log.info(`all tasks finish error! video: ${videoAmount}, img: ${imgAmount}`, error);
  })
  .then(() => {
    process.exit(0);
  });

完整代码: https://github.com/flute/9gag-crawler

Author

Ludis

Posted on

2018-09-25

Updated on

2019-01-14

Licensed under

Comments