feat: 实现天涯论坛的抓取 (#3143)

This commit is contained in:
a14907
2019-09-27 10:28:52 +08:00
committed by DIYgod
parent 7b5f82f326
commit 84ded422f1
5 changed files with 125 additions and 0 deletions

View File

@@ -160,3 +160,17 @@ pageClass: routes
### 回帖
<Route author="LogicJake" example="/zhibo8/post/2601615" path="/zhibo8/post/:id" :paramsDesc="['帖子 id可在帖子 URL 找到']"/>
## 天涯论坛
### 子版块
<Route author="a14907" example="/tianya/index/lookout" path="/tianya/index/:type" :paramsDesc="['板块类型 type，可在板块 URL 中找到。例如天涯杂谈板块的地址是 http://bbs.tianya.cn/list-free-1.shtml，这个板块的 type 就是 free；同理，我的大学板块的地址为 http://bbs.tianya.cn/list-university-1.shtml，type 就是 university']"/>
### 用户帖子
<Route author="a14907" example="/tianya/user/11488997" path="/tianya/user/:userid" :paramsDesc="['用户 id userid，可在用户主页 URL 中找到。例如用户苕木匠的地址是 http://www.tianya.cn/11488997/bbs，苕木匠的 userid 就是 11488997']"/>
### 用户的回帖
<Route author="a14907" example="/tianya/comments/11488997" path="/tianya/comments/:userid" :paramsDesc="['用户 id userid，可在用户主页 URL 中找到。例如用户苕木匠的地址是 http://www.tianya.cn/11488997/bbs，苕木匠的 userid 就是 11488997']"/>

View File

@@ -1649,6 +1649,11 @@ router.get('/soul/:id', require('./routes/soul'));
// Owspace (单向空间)
router.get('/owspace/read/:type?', require('./routes/owspace/read'));
// Tianya Forum (天涯论坛): board listing, a user's posts, a user's replies
router.get('/tianya/index/:type', require('./routes/tianya/index'));
router.get('/tianya/user/:userid', require('./routes/tianya/user'));
router.get('/tianya/comments/:userid', require('./routes/tianya/comments'));
// eleme
router.get('/eleme/open/announce', require('./routes/eleme/open/announce'));
router.get('/eleme/open-be/announce', require('./routes/eleme/open-be/announce'));

View File

@@ -0,0 +1,36 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
module.exports = async (ctx) => {
const { userid } = ctx.params;
const url = 'http://www.tianya.cn/' + userid + '/bbs?t=post';
const responseraw = await got(url, { headers: { Referer: 'http://bbs.tianya.cn' } });
const $ = cheerio.load(responseraw.data);
const username = $('div.portrait h2 a')
.first()
.text();
const turl = `http://www.tianya.cn/api/bbsuser?method=userinfo.ice.getUserTotalReplyList&params.userId=${userid}&params.pageSize=20&params.bMore=true`;
const response = await got(turl, { headers: { Referer: 'http://bbs.tianya.cn' } });
const json = response.data;
const items = json.data.rows.map((ele) => {
const title = ele.title;
const clicknum = ` 点击数:${ele.click_counter},回复数:${ele.reply_counter}`;
const link = `http://bbs.tianya.cn/go_reply_position.jsp?item=${ele.item}&id=${ele.art_id}&rid=${ele.reply_id}`;
const date = ele.reply_time;
const pubDate = new Date(date).toUTCString();
return {
title,
description: title + clicknum,
link,
pubDate,
};
});
ctx.state.data = {
title: username + '的天涯回帖',
description: username,
link: url,
item: items,
};
};

View File

@@ -0,0 +1,34 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
module.exports = async (ctx) => {
const { type } = ctx.params;
const url = 'http://bbs.tianya.cn/list-' + type + '-1.shtml';
const response = await got(url, { headers: { Referer: 'http://bbs.tianya.cn' } });
const $ = cheerio.load(response.data);
const typeTitle = $('div.location div.text strong').text();
const items = $('table > tbody ~ tbody > tr')
.map((_, ele) => {
const $item = cheerio.load(ele);
const title = $item('td.td-title a').text();
const link = $item('td.td-title a').attr('href');
const date = $item('td')
.last()
.attr('title');
const pubDate = new Date(date).toUTCString();
return {
title,
description: title,
link,
pubDate,
};
})
.get();
ctx.state.data = {
title: typeTitle,
description: typeTitle,
link: url,
item: items,
};
};

36
lib/routes/tianya/user.js Normal file
View File

@@ -0,0 +1,36 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
module.exports = async (ctx) => {
const { userid } = ctx.params;
const url = 'http://www.tianya.cn/' + userid + '/bbs?t=post';
const responseraw = await got(url, { headers: { Referer: 'http://bbs.tianya.cn' } });
const $ = cheerio.load(responseraw.data);
const username = $('div.portrait h2 a')
.first()
.text();
const turl = `http://www.tianya.cn/api/bbsuser?method=userinfo.ice.getUserTotalArticleList&params.userId=${userid}&params.pageSize=20&params.bMore=true`;
const response = await got(turl, { headers: { Referer: 'http://bbs.tianya.cn' } });
const json = response.data;
const items = json.data.rows.map((ele) => {
const title = ele.title;
const clicknum = ` 点击数:${ele.click_counter},回复数:${ele.reply_counter}`;
const link = `http://bbs.tianya.cn/post-${ele.item}-${ele.art_id}-1.shtml`;
const date = ele.compose_time;
const pubDate = new Date(date).toUTCString();
return {
title,
description: title + clicknum,
link,
pubDate,
};
});
ctx.state.data = {
title: username + '的天涯帖子',
description: username,
link: url,
item: items,
};
};