Files
RSSHub/lib/v2/gzh360/utils.js
Rongrong a79cc20ec1 feat(utils): add utils for WeChat MP (#9487)
Motivation:
There are multiple routes that need to fetch articles from WeChat MP.
However, letting each route fetch articles on its own could potentially
lead to cache key collisions. Even if cache key collisions do not occur,
un-normalized URLs could potentially lead to duplicated requests.
What's more, articles from WeChat MP have weird formats and need to be
fixed. Creating a universal function to do this work makes things
easier for new route contributors.

Note:
In order to keep this PR as atomic as possible, I did not touch
those broken routes. Once this PR is merged, I will try to fix them.

Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
2022-04-07 21:46:15 +08:00

63 lines
2.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { parseDate } = require('@/utils/parse-date');
const { fixArticleContent } = require('@/utils/wechat-mp');
// Single shared RangeError instance thrown by mpIdDecode whenever an id fails validation.
// It is created once at module load, so its stack trace points here rather than at the throw site.
const invalidIdError = new RangeError('Invalid id');
/**
 * Encode a WeChat MP (official account) name into an id by base64-encoding it twice.
 *
 * @param {string} name - The account name to encode.
 * @returns {string} The two-pass base64 id.
 */
const mpIdEncode = (name) => {
    // Each pass uses the standard base64 alphabet.
    const toBase64 = (text) => Buffer.from(text).toString('base64');
    // Encoding the already-encoded text a second time always yields a url-safe string:
    // every input byte of the second pass is ASCII from the base64 alphabet, so the
    // characters `7`, `f`, `+` and `/` can never be produced.
    return toBase64(toBase64(name));
};
/**
 * Decode a two-pass base64 id back into a WeChat MP (official account) name,
 * validating the input at every stage.
 *
 * @param {string} id - The id, expected to be the base64 of a base64 string.
 * @returns {string} The decoded account name.
 * @throws {RangeError} The shared `invalidIdError` when the id fails any validation step.
 */
const mpIdDecode = (id) => {
    // Verify that the decoded name can be a valid WeChat MP name: https://kf.qq.com/faq/120911VrYVrA141110r2MRJV.html
    // "A name may be 4-30 characters long, one Chinese character counting as 2 characters":
    // 4 ASCII characters => id length 12, 15 Chinese characters => id length 80.
    // The id must also be a valid two-pass base64 string (`7f+/` never appear).
    if (id.length < 12 || id.length > 80 || id.length % 4 !== 0 || !/^[a-eg-zA-Z0-68-9]+={0,2}$/.test(id)) {
        throw invalidIdError;
    }
    const deSecondPassed = Buffer.from(id, 'base64').toString();
    // Verify that the intermediate value is a valid base64 string using the standard alphabet.
    if (deSecondPassed.length % 4 !== 0 || !/^[a-zA-Z0-9+/]+={0,2}$/.test(deSecondPassed)) {
        throw invalidIdError;
    }
    const deFirstPassed = Buffer.from(deSecondPassed, 'base64').toString();
    // "A space may not be the first or last character, and spaces may not be consecutive."
    // Only a run of two (or more) spaces is rejected; a single interior space is a legal part
    // of a name. The regex form keeps the "two spaces" intent explicit.
    const hasConsecutiveSpaces = / {2}/.test(deFirstPassed);
    if (deFirstPassed.length < 2 || deFirstPassed.startsWith(' ') || deFirstPassed.endsWith(' ') || hasConsecutiveSpaces) {
        throw invalidIdError;
    }
    return deFirstPassed;
};
/**
 * Fill in the missing fields of a feed item from its article page.
 *
 * The page is fetched through `ctx.cache.tryGet`, keyed by `item.link`. A failed download is
 * turned into `null`, in which case the item is returned without modification.
 *
 * @param {object} ctx - Request context; only `ctx.cache.tryGet(key, fetcher)` is used here.
 * @param {object} item - Feed item carrying a `link`; it is mutated in place.
 * @param {boolean} [skipAuthor=false] - When true, `item.author` is left untouched.
 * @returns {Promise<object>} The same `item` object that was passed in.
 */
const finishArticleItem = async (ctx, item, skipAuthor = false) => {
    // The website is slow and unstable, so download errors are ignored to avoid breaking the whole route.
    // The web page is cached rather than the finished item, because a failed request has to be retried later.
    const downloadPage = async () => {
        const pending = got(item.link);
        try {
            const response = await pending;
            return response.data;
        } catch {
            // Per the original author's note, returning a falsy value is safe inside tryGet
            // because it always leads to a cache miss.
            return null;
        }
    };

    const html = await ctx.cache.tryGet(item.link, downloadPage);
    if (!html) {
        return item;
    }

    const $ = cheerio.load(html);
    // Values already present on the item take precedence over the scraped ones.
    if (!skipAuthor && !item.author) {
        item.author = $('[id=webbdgzh]+a').text();
    }
    if (!item.title) {
        item.title = $('div.desc > h1').text();
    }
    if (!item.pubDate) {
        const timestamp = $('div.desc span[data-timestamp]').attr('data-timestamp');
        item.pubDate = parseDate(timestamp);
    }
    // The description is always overwritten; sometimes it is an empty string due to the website's fault.
    const content = $('div.rich_media_content');
    item.description = fixArticleContent(content, true);
    return item;
};
// Public surface of this module: the two-pass base64 id codec and the article post-processor.
module.exports = {
mpIdEncode,
mpIdDecode,
finishArticleItem,
};