const got = require('@/utils/got');
const cheerio = require('cheerio');
const { parseDate } = require('@/utils/parse-date');
const timezone = require('@/utils/timezone');
const { finishArticleItem } = require('@/utils/wechat-mp');
const { RequestInProgressError } = require('@/errors');
const wait = require('@/utils/wait');
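
// How this route works (a summary of the code below): fetch the category or MP
// list page, keep the first `limit` items, then resolve each item's short-lived
// jump link into its permanent mp.weixin.qq.com URL one request at a time,
// guarded by a cache lock so concurrent requests cannot trip the site's
// crawler detection.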
const parsePage = ($item, hyperlinkSelector, timeSelector) => {
    const hyperlink = $item.find(hyperlinkSelector);
    const title = hyperlink.text();
    const link = hyperlink.attr('href');
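    // the site prints publish times in China Standard Time (UTC+8); shift the naively parsed date by that offset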
    const pubDate = timezone(parseDate($item.find(timeSelector).text(), 'YYYY-MM-DD HH:mm'), 8);
    return {
        title,
        link,
        pubDate,
    };
};
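
// e.g. for a category-page <li> shaped like (illustrative markup, inferred from the selectors)
//   <li><h2><a href="/wx?id=...&t=...">Some title</a></h2><div class="fly-list-info"><span>2023-01-01 12:00</span></div></li>
// parsePage($(li), 'h2 a', '.fly-list-info span') returns { title, link, pubDate }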

module.exports = async (ctx) => {
    // !!! a lock is required here to stop concurrent requests from breaking the anti-anti-crawler workarounds !!!
    if ((await ctx.cache.get('data258:lock', false)) === '1') {
        throw new RequestInProgressError('Another request is in progress, please try again later.');
    }
    // !!! no need to acquire the lock yet: the MP/category list page has no crawler detection !!!
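    // lock protocol used below: 'data258:lock' is set to '1' with a 60 s TTL while
    // jump pages are being resolved, and reset to '0' with a 1 s TTL on release,
    // so a crashed request cannot hold the lock forever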

    const id = ctx.params.id;

    const limit = ctx.query.limit ? parseInt(ctx.query.limit, 10) : 5;
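    // keep the default small: each item below costs a >= 1 s fetch of a jump page and counts against the site's daily view quota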

    const rootUrl = 'https://mp.data258.com';
    const pageUrl = id ? `${rootUrl}/article/category/${id}` : rootUrl;

    const response = await got(pageUrl);
    const $ = cheerio.load(response.data);

    const title = $('head title').text();
    // title = title.endsWith('-微阅读') ? title.slice(0, title.length - 4) : title;
    const description = $('meta[name="description"]').attr('content');

    const categoryPage = $('ul.fly-list');

    let items;
    if (categoryPage.length) {
        // got a category page
        items = categoryPage
            .find('li')
            .map((_, item) => parsePage($(item), 'h2 a', '.fly-list-info span'))
            .get();
    } else {
        // got an MP page
        items = $('ul.jie-row li')
            .map((_, item) => parsePage($(item), 'a.jie-title', '.layui-hide-xs'))
            .get();
    }

    items = items.slice(0, limit); // keep only `limit` items to avoid being anti-crawled

    // !!! double-check the lock before acquiring it !!!
    if ((await ctx.cache.get('data258:lock', false)) === '1') {
        throw new RequestInProgressError('Another request is in progress, please try again later.');
    } else {
        // !!! acquire the lock here because the jump pages do have crawler detection !!!
        await ctx.cache.set('data258:lock', '1', 60);
    }

    // !!! a plain for-loop is required here so the concurrency stays at 1 !!!
    // !!! do note that increasing the concurrency will get your IP banned for a long time !!!

    let err; // !!! lets RSSHub throw an anti-crawler prompt if the route ends up empty !!!

    /* eslint-disable no-await-in-loop */
    for (const item of items) {
        // links look like https://mp.data258.com/wx?id={id}&t={token}: id is a permanent hex string, t is a temporary base64 token
        const cacheId = item.link.match(/id=([\da-f]+)/)[1];
        item.link = item.link.startsWith('http') ? item.link : `${rootUrl}${item.link}`;
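        // keying the cache on the permanent id means each article's real link is resolved at most once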
        const realLink = await ctx.cache.tryGet(`data258:${cacheId}`, async () => {
            try {
                // !!! sleep 1 s here to avoid being anti-crawled !!!
                // !!! do note that an interval shorter than 1 s will get your IP banned for a long time !!!
                await wait(1000);

                const response = await got.get(item.link, {
                    headers: {
                        Referer: pageUrl, // essential
                    },
                });
                if (response.data.includes('今日浏览次数已达上限')) {
                    // the marker string means "today's view count has reached its limit"
                    // !!! as long as the cache hits, the link will not be re-crawled and will not consume the quota !!!
                    // !!! so this is not a big problem if the RSSHub instance is self-hosted !!!
                    err = new got.RequestError(response.data, {}, response.request);
                    return null;
                }
                const $ = cheerio.load(response.data);
                const jmpJS = $('script')
                    .filter((_, e) => $(e).html().includes('location.href'))
                    .html();
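                // the matched script body is shaped like location.href='https://mp.weixin.qq.com/s?...' (illustrative, inferred from the regex below)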
                return jmpJS.match(/location\.href='([^']+)'/)[1];
            } catch (e) {
                err = e;
                return null;
            }
        });
        if (realLink) {
            item.link = realLink;
        } else {
            break; // being anti-crawled; immediately cancel the remaining requests
        }
    }
    /* eslint-enable no-await-in-loop */

    // !!! release the lock; let it expire almost immediately since there is no need to keep it in the cache !!!
    await ctx.cache.set('data258:lock', '0', 1);

    // jump links are only valid for a short time, so drop the items that were never resolved
    // (resolved links start with http(s)://mp.weixin.qq.com/s)
    items = items.filter((item) => item.link.match(/^https?:\/\/mp\.weixin\.qq\.com\/s/));

    if (items.length === 0 && err) {
        // !!! if every request was anti-crawled, the filtered items array will be empty !!!
        // !!! let RSSHub throw an anti-crawler prompt !!!
        throw err;
    }
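
    // finishArticleItem (from @/utils/wechat-mp) completes each item with the full article content fetched from its resolved link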
    await Promise.all(items.map((item) => finishArticleItem(ctx, item, categoryPage.length > 0)));

    ctx.state.data = {
        title,
        link: pageUrl,
        description,
        item: items,
    };
};

// TODO: login? cookies seem to stay valid only briefly, and abusing an account would probably get it banned...
// TODO: fetch full articles via the official RSS feed? not testable unless some VIP user contributes their feed...