对电影天堂增加全文抓取 (#1697)

let list = $('.co_content8 table tr').get();
list.splice(0, 1);
原因如下：
页面内含有两个 .co_content8 table，
仅第一个 table 内的第一个 tr 元素是广告。

const list = $('.co_content8 table tr:not(:first-child)').get();
上面这种写法会丢失第二个 table 的第一个 tr 元素。
This commit is contained in:
junfengP
2019-03-08 11:23:38 +08:00
committed by DIYgod
parent 41f94cd2a5
commit a538a9c0d6

View File

@@ -1,6 +1,24 @@
const axios = require('../../utils/axios');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');
/**
 * Resolve the full-text HTML of one detail page, consulting the cache first.
 *
 * On a cache hit the cached value is returned and no request is made.
 * On a miss the page is downloaded as raw bytes, decoded as gb2312, the inner
 * HTML of `div#Zoom` is extracted, stored in the cache and returned.
 *
 * @param {string} link - URL of the detail page; also used as the cache key.
 * @param {object} ctx - Request context; only `ctx.cache.get` and `ctx.cache.set` are used here.
 * @returns {Promise<*>} The cached value, or the result of `.html()` on `div#Zoom`.
 */
async function load(link, ctx) {
    const cached = await ctx.cache.get(link);
    if (cached) {
        return cached;
    }

    // Request raw bytes so that the body can be decoded explicitly below.
    const { data: rawBody } = await axios.get(link, { responseType: 'arraybuffer' });
    const html = iconv.decode(rawBody, 'gb2312');

    const description = cheerio.load(html)('div#Zoom').html();

    // NOTE(review): presumably a TTL in seconds (one day) — confirm against the cache implementation.
    const cacheTtl = 24 * 60 * 60;
    await ctx.cache.set(link, description, cacheTtl);

    return description;
}
module.exports = async (ctx) => {
const response = await axios.get('http://www.dytt8.net', {
responseType: 'arraybuffer',
@@ -9,25 +27,34 @@ module.exports = async (ctx) => {
const $ = cheerio.load(response.data);
const list = $('.co_content8 table tr').get();
// 页面含有2个.co_content8 table
// 仅第一个table内第一个tr元素是广告连接
// 去除该广告连接
list.splice(0, 1);
// const list = $('.co_content8 table tr:not(:first-child)').get();
const process = await Promise.all(
list.map(async (item) => {
const link = $(item).find('a:nth-of-type(2)');
const itemUrl = 'http://www.dytt8.net' + link.attr('href');
const other = await load(itemUrl, ctx);
return {
title: link.text(),
description: other,
pubDate: new Date(
$(item)
.find('font')
.text()
).toUTCString(),
link: itemUrl,
};
})
);
const data = {
title: '电影天堂',
link: 'http://www.dytt8.net',
description: '电影天堂RSS',
item: list
.map((item) => {
const link = $(item).find('a:nth-of-type(2)');
return {
title: link.text(),
description: link.text(),
pubDate: new Date(
$(item)
.find('font')
.text()
).toUTCString(),
link: 'http://www.dytt8.net' + link.attr('href'),
};
})
.slice(1),
item: process,
};
ctx.state.data = data;