feat(utils): add utils for WeChat MP (#9487)

Motivation:
There are multiple routes that need to fetch articles from WeChat MP.
However, letting them fetch articles by themselves could potentially
lead to cache key collisions. Even if cache key collisions do not occur,
un-normalized URLs could potentially lead to duplicated requests.
What's more, articles from WeChat MP have weird formats and need to be
fixed. Creating a universal function to do this work makes things
easier for new route contributors.

Note:
In order to make this PR as atomic as possible, I did not touch
those broken routes. Once this PR is merged, I will try to fix them.

Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
This commit is contained in:
Rongrong
2022-04-07 21:46:15 +08:00
committed by GitHub
parent e0b7ca676d
commit a79cc20ec1
12 changed files with 305 additions and 158 deletions

View File

@@ -3344,7 +3344,7 @@ column 为 third 时可选的 category:
### 公众号 (CareerEngine 来源)
<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号,通过 URL 中找到对应的公众号 id']"/>
<Route author="HenryQW" example="/wechat/ce/595a5b14d7164e53908f1606" path="/wechat/ce/:id" :paramsDesc="['公众号 id, 在 [CareerEngine](https://search.careerengine.us/) 搜索公众号,通过 URL 中找到对应的公众号 id']" anticrawler="1"/>
### 公众号 (Telegram 频道来源)
@@ -3380,7 +3380,7 @@ column 为 third 时可选的 category:
### 公众号 (wxnmh.com 来源)
<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']"/>
<Route author="laampui" example="/wechat/wxnmh/51798" path="/wechat/wxnmh/:id" :paramsDesc="['公众号 id, 打开 wxnmh.com, 在 URL 中找到 id']" anticrawler="1"/>
### 公众号 (wechat-feeds 来源)

View File

@@ -0,0 +1 @@
Make sure you read lib/utils/wechat-mp.js before adding a new route.

View File

@@ -1,42 +1,18 @@
const parser = require('@/utils/rss-parser');
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { finishArticleItem } = require('@/utils/wechat-mp');
module.exports = async (ctx) => {
const { id } = ctx.params;
const link = `https://github.com/hellodword/wechat-feeds/raw/feeds/${id}.xml`;
const feed = await parser.parseURL(link);
const items = await Promise.all(
feed.items.map(async (item) => {
const cache = await ctx.cache.get(item.link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
const response = await got.get(item.link);
const $ = cheerio.load(response.data);
const post = $('#js_content');
post.find('img').each((_, img) => {
const dataSrc = $(img).attr('data-src');
if (dataSrc) {
$(img).attr('src', dataSrc);
}
});
const single = {
const items = feed.items.map((item) => ({
title: item.title,
description: post.html(),
pubDate: new Date(item.pubDate),
link: item.link,
};
ctx.cache.set(item.link, JSON.stringify(single));
return Promise.resolve(single);
})
);
guid: item.link,
}));
await Promise.all(items.map(async (item) => await finishArticleItem(ctx, item)));
ctx.state.data = {
title: feed.title,

View File

@@ -1,6 +1,8 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const dayjs = require('dayjs');
const { finishArticleItem } = require('@/utils/wechat-mp');
module.exports = async (ctx) => {
const { biz, hid, cid } = ctx.params;
let cidurl = '';
@@ -26,32 +28,11 @@ module.exports = async (ctx) => {
const mptitle = $('div.articles_header').find('a').text() + `|` + $('div.articles_header > h2.rich_media_title').text();
const articledata = await Promise.all(
list.map(async (item) => {
const link = item.link.replace('http://', 'https://');
const cache = await ctx.cache.get(link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
const response2 = await got({
method: 'get',
url: link,
});
const articleHtml = response2.data;
const $2 = cheerio.load(articleHtml);
$2('img').removeAttr('src');
$2('div#js_profile_qrcode').remove();
const content = $2('div#js_content.rich_media_content')
.html()
.replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
.replace('<iframe ', '<iframe width="640" height="360"')
.replace(/data-src/g, 'src');
const author = $2('div#meta_content:not(:last-child)').text();
const single = {
content,
author,
link: item.link,
guid: item.link,
};
ctx.cache.set(link, JSON.stringify(single));
return Promise.resolve(single);
return await finishArticleItem(ctx, single);
})
);
ctx.state.data = {
@@ -67,9 +48,10 @@ module.exports = async (ctx) => {
src="${item.cover}"
><br>
<br>
${articledata[index].content}
${articledata[index].description}
`,
link: item.link,
link: articledata[index].link,
guid: articledata[index].guid,
author: articledata[index].author,
pubDate: dayjs.unix(item.sendtime).format(),
})),

View File

@@ -1,6 +1,8 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const dayjs = require('dayjs');
const { finishArticleItem } = require('@/utils/wechat-mp');
module.exports = async (ctx) => {
const { biz, aid } = ctx.params;
const aidurl = `&album_id=${aid}`;
@@ -16,33 +18,12 @@ module.exports = async (ctx) => {
list.map(async (item) => {
const link = $(item).attr('data-link').replace('http://', 'https://');
const title = $(item).attr('data-title');
const cache = await ctx.cache.get(link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
const response2 = await got({
method: 'get',
url: link,
});
const articleHtml = response2.data;
const $2 = cheerio.load(articleHtml);
$2('img').removeAttr('src');
$2('div#js_profile_qrcode').remove();
const content = $2('div#js_content.rich_media_content')
.html()
.replace('iframe/preview.html?width=500&amp;height=375&amp;', 'txp/iframe/player.html?')
.replace('<iframe ', '<iframe width="640" height="360"')
.replace(/data-src/g, 'src');
const author = $2('div#meta_content:not(:last-child)').text();
const single = {
content,
author,
link,
title,
link,
guid: link,
};
ctx.cache.set(link, JSON.stringify(single));
return Promise.resolve(single);
return await finishArticleItem(ctx, single);
})
);
ctx.state.data = {
@@ -50,8 +31,9 @@ module.exports = async (ctx) => {
link: `https://mp.weixin.qq.com/mp/appmsgalbum?__biz=${biz}&action=getalbum${aidurl}`,
item: list.map((item, index) => ({
title: articledata[index].title,
description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].content}`,
description: $(item).find('.album__item-img').html() + `<br><br>${articledata[index].description}`,
link: articledata[index].link,
guid: articledata[index].guid,
author: articledata[index].author,
pubDate: dayjs.unix($(item).find('.js_article_create_time').text()).format(),
})),

View File

@@ -1,5 +1,6 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { finishArticleItem } = require('@/utils/wechat-mp');
module.exports = async (ctx) => {
const id = ctx.params.id;
@@ -120,37 +121,16 @@ module.exports = async (ctx) => {
title,
pubDate,
link,
author,
guid: link,
};
if (link !== undefined) {
const value = await ctx.cache.get(link);
if (value) {
single.description = value;
} else {
try {
const reponse = await got.get(link);
const $ = cheerio.load(reponse.data);
single.description = $('.rich_media_content')
.html()
.replace(/data-src/g, 'src');
ctx.cache.set(link, single.description, 12 * 60 * 60);
return await finishArticleItem(ctx, single);
} catch (err) {
single.description = item.find('.tgme_widget_message_text').html();
}
}
}
// 修复文字格式错误
single.description = single.description
.replace(/(<strong.*?>)(.*?)(<\/strong>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
.replace(/<section(.*?)>(.*?)<\/section>/g, '<p $1>$2</p>')
.replace(/(<p.*?>)(.*?)(<\/p>)/g, '$1<span style="font-size: 16px; line-height: 16px;">$2</span>$3')
.replace(/<p.*?data-encc.*?>.*?<\/p>/g, '')
.replace(/<h\d(.*?)>(.*?)<\/h\d>/g, '<p $1>$2</p>')
.replace(/<br.*?>/g, '');
return single;
})
.get()

164
lib/utils/wechat-mp.js Normal file
View File

@@ -0,0 +1,164 @@
/**
* Author: @Rongronggg9
*
* There are at least three folders which are relevant to WeChat MP (Official Account Platform / Media Platform):
* lib/route/tencent/wechat
* lib/v2/wechat
* lib/v2/gzh360
*
* If your new route is not in the above folders, please add it to the list.
*
* If your route needs to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
* However, if your route needs to determine some metadata by itself, you MAY use `fetchArticle`.
* If you find more metadata on the webpage, consider modifying `fetchArticle` to include them.
* NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision.
* NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you.
*
* If your route fetches MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
* If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them.
*
* For more details of these functions, please refer to the jsDoc in the source code.
*/
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { parseDate } = require('@/utils/parse-date');
/**
 * Swap an element for a freshly created element with a different tag name.
 *
 * All attributes of the old element are copied to the new one, the old
 * element's child nodes are moved across, and finally the old element is
 * replaced in the tree by the new one.
 *
 * @param {Function} $ - The cheerio instance owning the element.
 * @param {*} oldTag - The element to replace (anything `$()` accepts).
 * @param {string} newTagName - Tag name of the replacement, e.g. `p` or `div`.
 */
const replaceTag = ($, oldTag, newTagName) => {
    const source = $(oldTag);
    const replacement = $(`<${newTagName} />`);

    // copy the attributes first, then move the children, then swap the nodes
    const attributes = source.attr() || {};
    Object.keys(attributes).forEach((name) => {
        replacement.attr(name, attributes[name]);
    });
    replacement.append(source.contents());
    source.replaceWith(replacement);
};
/**
 * Clean up the unusual markup found in WeChat MP articles.
 *
 * Routes SHOULD call this even when the content was not fetched directly from
 * WeChat MP; it is safe in most situations. The following fixes are applied:
 * - lazy-loaded images get their `data-src` promoted to `src` (unless `skipImg`)
 * - every `<section>` becomes a `<p>` when it contains no `p`/`div`/`section`
 *   descendants, otherwise a `<div>`
 * - all `<script>` elements are removed
 *
 * Example usage: item.description = fixArticleContent($('div#js_content.rich_media_content'));
 * @param {*} html - The html to be fixed, a string or a cheerio object.
 * @param {boolean} skipImg - Whether to skip fixing images.
 * @return {string} - The fixed html, a string ('' for empty/missing input).
 */
const fixArticleContent = (html, skipImg = false) => {
    // serialize a cheerio selection first so the caller's tree is left untouched
    const markup = html && html.html ? html.html() : html;
    if (!markup) {
        return '';
    }

    // third argument `false`: parse as a fragment, not as a full document
    const $ = cheerio.load(markup, undefined, false);

    if (!skipImg) {
        // WeChat lazy-loads images: the real address lives in `data-src`
        $('img[data-src]').each((_, element) => {
            const image = $(element);
            const lazySrc = image.attr('data-src');
            if (lazySrc) {
                image.attr('src', lazySrc).removeAttr('data-src');
            }
        });
    }

    $('section').each((_, element) => {
        const section = $(element);
        // a section holding block-level descendants becomes a div, a leaf one becomes a p
        const hasBlockDescendant = section.find('p, div, section').length > 0;
        replaceTag($, section, hasBlockDescendant ? 'div' : 'p');
    });

    $('script').remove();

    return $.html();
};
/**
 * Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
 *
 * The URL is normalized before use (scheme forced to https, fragment removed)
 * and the normalized URL is the cache key, so different spellings of the same
 * article share one cache entry and trigger at most one request.
 *
 * If you use this function, no need to call `fixArticleContent`
 * @param {object} ctx - The context object; `ctx.cache.tryGet` is used for caching.
 * @param {string} url - The url of the article.
 * @param {boolean} bypassHostCheck - Whether to bypass host check.
 * @return {Promise<object>} - An object containing the article and its metadata:
 *   `{ title, author, description, summary, pubDate, mpName, link }`, where `link` is the normalized url.
 * @throws {Error} If the normalized url is not under https://mp.weixin.qq.com/ and `bypassHostCheck` is false.
 */
const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
    const oriUrl = url;
    // Normalize the url. The fragment is never sent to the server, so drop it
    // entirely (not only when it consists of word characters); otherwise
    // e.g. `#wechat_redirect&from=x` would produce a second cache key.
    const normalizedUrl = url.replace(/^http:\/\//, 'https://').replace(/#.*$/, '');
    if (!bypassHostCheck && !normalizedUrl.startsWith('https://mp.weixin.qq.com/')) {
        throw new Error(`wechat-mp: URL must start with https://mp.weixin.qq.com/ or http://mp.weixin.qq.com/, but got ${oriUrl}`);
    }

    return await ctx.cache.tryGet(normalizedUrl, async () => {
        const response = await got(normalizedUrl);
        const $ = cheerio.load(response.data);

        const title = $('meta[property="og:title"]').attr('content');
        const author = $('meta[name=author]').attr('content');
        // a description identical to the title carries no information, so treat it as empty
        const rawSummary = $('meta[name=description]').attr('content');
        const summary = rawSummary !== title ? rawSummary : '';
        const description = fixArticleContent($('div#js_content.rich_media_content'));

        // the publish time is only available as a 10-digit unix timestamp (`var ct`) inside an inline script
        let pubDate;
        const publishTimeScript = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html();
        const publishTimeMatch = publishTimeScript && publishTimeScript.match(/var ct *= *"?(\d{10})"?/);
        const publishTimestamp = publishTimeMatch && publishTimeMatch[1];
        if (publishTimestamp) {
            pubDate = parseDate(Number.parseInt(publishTimestamp, 10) * 1000);
        }

        // name of the Official Account that published the article
        const mpName = $('.profile_nickname').first().text().trim();

        return { title, author, description, summary, pubDate, mpName, link: normalizedUrl };
    });
};
/**
 * Fetch an article and its metadata from WeChat MP (mp.weixin.qq.com) and merge the result into `item`.
 *
 * Values found on the article page take precedence; whatever is already on
 * `item` is kept as a fallback when the fetched value is empty. The `item`
 * object is modified in place and also returned.
 *
 * If you use this function, no need to call `fetchArticle` or `fixArticleContent`
 *
 * A new route SHOULD use this function instead of manually calling the above functions
 *
 * An existing route adopting this function SHOULD either:
 * - set `skipLink` to true (not recommended)
 * - set `item.guid` to `item.link` BEFORE calling this function
 * @param {object} ctx - The context object.
 * @param {object} item - The item object to be filled; `item.link` must be the article url.
 * @param {boolean} setMpNameAsAuthor - If `true`, `author` will be the MP itself, otherwise the real author of the article.
 * @param {boolean} skipLink - Whether to skip overriding `item.link` with the normalized url.
 * @return {Promise<object>} - The incoming `item` object, with the article and its metadata filled in.
 */
const finishArticleItem = async (ctx, item, setMpNameAsAuthor = false, skipLink = false) => {
    const article = await fetchArticle(ctx, item.link);

    // fetched value wins, the pre-existing one is the fallback
    for (const field of ['title', 'description', 'summary', 'pubDate']) {
        item[field] = article[field] || item[field];
    }

    // mpName: the Official Account itself - useful when a route mixes articles from different accounts
    // author: the real author of the article - useful when a route covers a single account
    const preferredAuthor = setMpNameAsAuthor ? article.mpName : article.author;
    item.author = preferredAuthor || item.author;

    if (!skipLink) {
        item.link = article.link || item.link;
    }

    return item;
};
// Public API of this module: the content fixer, the cached article fetcher,
// and the helper that fills a feed item from a fetched article.
module.exports = {
fixArticleContent,
fetchArticle,
finishArticleItem, // a new route SHOULD use this function instead of manually calling the above functions
};

1
lib/v2/gzh360/_README Normal file
View File

@@ -0,0 +1 @@
Make sure you read lib/utils/wechat-mp.js before adding a new route.

View File

@@ -1,6 +1,7 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { parseDate } = require('@/utils/parse-date');
const { fixArticleContent } = require('@/utils/wechat-mp');
const invalidIdError = new RangeError('Invalid id');
@@ -48,7 +49,7 @@ const finishArticleItem = async (ctx, item, skipAuthor = false) => {
}
item.title = item.title || $('div.desc > h1').text();
item.pubDate = item.pubDate || parseDate($('div.desc span[data-timestamp]').attr('data-timestamp'));
item.description = $('div.rich_media_content').html(); // sometimes it is an empty string due to the website's fault
item.description = fixArticleContent($('div.rich_media_content'), true); // sometimes it is an empty string due to the website's fault
}
return item;

1
lib/v2/wechat/_README Normal file
View File

@@ -0,0 +1 @@
Make sure you read lib/utils/wechat-mp.js before adding a new route.

View File

@@ -1,8 +1,6 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const { parseDate } = require('@/utils/parse-date');
const { art } = require('@/utils/render');
const path = require('path');
const { finishArticleItem } = require('@/utils/wechat-mp');
module.exports = async (ctx) => {
const id = ctx.params.id;
@@ -14,54 +12,13 @@ module.exports = async (ctx) => {
let items = response.data.items.map((item) => ({
title: item.title,
// the date is when the article was grabbed, not published, `finishArticleItem` will fix this
pubDate: parseDate(item.date_modified),
link: item.url,
guid: item.id,
}));
items = await Promise.all(
items.map((item) =>
ctx.cache.tryGet(item.link, async () => {
const detailResponse = await got({
url: item.link,
headers: {
Referer: 'https://mp.weixin.qq.com',
},
});
const $ = cheerio.load(detailResponse.data);
// fix lazyload image
$('img').each((_, e) => {
e = $(e);
e.after(
art(path.join(__dirname, 'templates/image.art'), {
src: e.attr('data-src') ?? e.attr('src'),
})
);
e.remove();
});
item.description = art(path.join(__dirname, 'templates/description.art'), {
desc: $('#js_content').html(),
});
item.author = $('meta[name=author]').attr('content');
// another way to get publish timestamp:
// const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').html();
// const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"(\d{10})"/);
const publish_time_script = $('script[nonce][type="text/javascript"]:contains("publish_time")').html();
const publish_time_match = publish_time_script && publish_time_script.match(/var.*[ ,]n *= *"(\d{10})"/);
const publish_timestamp = publish_time_match && publish_time_match[1];
if (publish_timestamp) {
item.pubDate = parseDate(publish_timestamp * 1000);
}
return item;
})
)
);
items = await Promise.all(items.map(async (item) => await finishArticleItem(ctx, item)));
ctx.state.data = {
title: response.data.title,

102
test/utils/wechat-mp.js Normal file
View File

@@ -0,0 +1,102 @@
// Set a short request timeout for this suite. It is assigned before the app is
// required (presumably so the app's config picks it up at load time - TODO confirm).
process.env.REQUEST_TIMEOUT = '500';
const cheerio = require('cheerio');
const wechatMp = require('../../lib/utils/wechat-mp');
const nock = require('nock');
// the app's context object is passed as `ctx` to the functions under test
const ctx = require('../../lib/app').context;
// Remove the env override once every test in this file has finished.
afterAll(() => {
delete process.env.REQUEST_TIMEOUT;
});
/**
 * Tell whether two dates denote the same instant.
 *
 * A date read back from the cache will be an ISO8601 string, so each argument
 * may be either a `Date` or a string; strings are converted with `new Date()`.
 *
 * @param {Date|string} date1 - First date.
 * @param {Date|string} date2 - Second date.
 * @return {boolean} - `true` when both have the same millisecond timestamp.
 */
const compareDate = (date1, date2) => {
    const [first, second] = [date1, date2].map((value) => (typeof value === 'string' ? new Date(value) : value));
    return first.getTime() === second.getTime();
};
// Test suite for lib/utils/wechat-mp: content fixing, article fetching and item filling.
describe('wechat-mp', () => {
// fixArticleContent must give the same result for a raw html string and for a cheerio selection.
it('fixArticleContent', () => {
const divHeader = '<div class="rich_media_content " id="js_content">';
const divFooter = '</div>';
// input covering: leaf sections, sections with block children, nested sections,
// sections inside a div, untouched p/div elements, and a script to be removed
const htmlSection =
'<section>test</section>' +
'<section><p>test</p></section>' +
'<section><div>test</div></section>' +
'<section><section><section>test</section></section></section>' +
'<div><section><p>test</p></section></div>' +
'<p>test</p>' +
'<div><p>test</p></div>' +
'<script>const test = "test"</script>';
// leaf sections become <p>, sections with p/div/section descendants become <div>, the script is gone
const expectedHtmlSection = '<p>test</p>' + '<div><p>test</p></div>' + '<div><div>test</div></div>' + '<div><div><p>test</p></div></div>' + '<div><div><p>test</p></div></div>' + '<p>test</p>' + '<div><p>test</p></div>';
let $ = cheerio.load(divHeader + htmlSection + divFooter);
expect(wechatMp.fixArticleContent(htmlSection)).toBe(expectedHtmlSection);
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlSection);
// images: with both data-src and src, with data-src only, and with src only
const htmlImg = '<img alt="test" data-src="http://rsshub.test/test.jpg" src="http://rsshub.test/test.jpg">' + '<img alt="test" data-src="http://rsshub.test/test.jpg">' + '<img alt="test" src="http://rsshub.test/test.jpg">';
// all three variants are expected to end up as the same plain <img src> tag
const expectedHtmlImg = new Array(3 + 1).join('<img alt="test" src="http://rsshub.test/test.jpg">');
$ = cheerio.load(divHeader + htmlImg + divFooter);
expect(wechatMp.fixArticleContent(htmlImg)).toBe(expectedHtmlImg);
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlImg);
// with skipImg = true the images must come back unchanged
expect(wechatMp.fixArticleContent(htmlImg, true)).toBe(htmlImg);
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'), true)).toBe(htmlImg);
// empty, missing or non-matching input yields an empty string
expect(wechatMp.fixArticleContent('')).toBe('');
expect(wechatMp.fixArticleContent(null)).toBe('');
expect(wechatMp.fixArticleContent(undefined)).toBe('');
expect(wechatMp.fixArticleContent($('div#something_not_in.the_document_tree'))).toBe('');
});
// fetchArticle and finishArticleItem against a mocked article page.
it('fetchArticle_&_finishArticleItem', async () => {
// publish timestamp (unix seconds) embedded in the fake page as `var ct`
const ct = 1636626300;
// minimal page carrying every piece of metadata that fetchArticle extracts
const exampleMpArticlePage =
'\n' +
'<meta name="description" content="summary" />\n' +
'<meta name="author" content="author" />\n' +
'<meta property="og:title" content="title" />\n' +
'<meta property="twitter:card" content="summary" />\n' +
'<div class="rich_media_content" id="js_content" style="visibility: hidden;">description</div>\n' +
'<div class="profile_inner"><strong class="profile_nickname">mpName</strong></div>\n' +
'<script type="text/javascript" nonce="000000000">\n' +
'var appmsg_type = "9";\n' +
`var ct = "${ct}";\n` +
'</script>';
// only the https host is mocked; the calls below pass the http url
nock('https://mp.weixin.qq.com')
.get('/rsshub_test/wechatMp_fetchArticle')
.reply(() => [200, exampleMpArticlePage]);
const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle';
const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://');
// a url outside mp.weixin.qq.com must be rejected, so `_ret` stays undefined
let _ret;
try {
_ret = await wechatMp.fetchArticle(ctx, 'https://im.not.wechat.mp/and/an/error/is/expected');
} catch (e) {
expect(e.name).toBe('Error');
}
expect(_ret).toBeUndefined();
// `link` is expected to be the https form even though the http url is passed in
const expectedItem = {
title: 'title',
summary: 'summary',
author: 'author',
description: 'description',
mpName: 'mpName',
link: httpsUrl,
};
const expectedDate = new Date(ct * 1000);
const fetchArticleItem = await wechatMp.fetchArticle(ctx, httpUrl);
// pubDate is compared separately via compareDate, then removed before the deep comparison
expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
delete fetchArticleItem.pubDate;
expect(fetchArticleItem).toEqual(expectedItem);
// finishArticleItem does not copy mpName onto the item, so drop it from the expectation
delete expectedItem.mpName;
// NOTE(review): only one nock interceptor is registered above, so this second call
// is presumably served from the cache filled by the fetchArticle call - confirm
const finishArticleItem = await wechatMp.finishArticleItem(ctx, { link: httpUrl });
expect(compareDate(finishArticleItem.pubDate, expectedDate)).toBe(true);
delete finishArticleItem.pubDate;
expect(finishArticleItem).toEqual(expectedItem);
});
});