diff --git a/lib/utils/wechat-mp.js b/lib/utils/wechat-mp.js
index e24b64db07..ba948c7aad 100644
--- a/lib/utils/wechat-mp.js
+++ b/lib/utils/wechat-mp.js
@@ -38,16 +38,29 @@ const replaceTag = ($, oldTag, newTagName) => {
const detectOriginalArticleUrl = ($) => {
// No article content get, try the original url
+ // example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw (#js_content has no text, so the data-url attribute of #js_share_source is returned)
if (!$('#js_content').text()) {
return $('#js_share_source').attr('data-url');
}
// Article content is too short, try the first link
+ // example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg (#js_content text is shorter than 80 characters, so the href of the first link inside it is returned)
if ($('#js_content').text().length < 80) {
return $('#js_content a').attr('href');
}
return null;
};
+/**
+ * Find the value assigned to `msg_source_url` in the page's HTML.
+ *
+ * @param {object} $ - cheerio instance loaded with the article page
+ * @returns {string|null} the text between the single quotes of the first
+ *     non-empty `msg_source_url = '...';` assignment, or null when there is none
+ */
+const detectSourceUrl = ($) => {
+    // `[^']+` keeps the capture inside the first quoted value. A greedy `.+` would run on
+    // to the last `';` of the line when several statements share one line.
+    const matches = ($.root().html() || '').match(/msg_source_url = '([^']+)';/);
+
+    return matches ? matches[1] : null;
+};
+
/**
* Articles from WeChat MP have weird formats, this function is used to fix them.
*
@@ -90,6 +103,18 @@ const fixArticleContent = (html, skipImg = false) => {
replaceTag($, section, 'div');
}
});
+
+ // Fix single-picture articles: an inline script assigns the picture description to
+ // #js_image_desc, so swap that script for the description text before scripts are removed.
+ // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
+ $('script').each((_, element) => {
+     const script = $(element);
+     const matches = (script.html() || '').match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
+
+     if (matches) {
+         // Line feeds (raw or written as the literal escape `\x0a`) become <br>;
+         // carriage returns (raw or written as `\x0d`) are dropped.
+         const imageDesc = matches[1].replace(/\r/g, '').replace(/\n/g, '<br>').replace(/\\x0d/g, '').replace(/\\x0a/g, '<br>');
+         script.replaceWith(imageDesc);
+     }
+ });
+
// clean scripts
$('script').remove();
return $.html();
@@ -156,11 +181,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
const response = await got(url);
const $ = cheerio.load(response.data);
- const title = $('meta[property="og:title"]').attr('content');
+ const title = $('meta[property="og:title"]').attr('content').replace(/\\r/g, '').replace(/\\n/g, ' ');
const author = $('meta[name=author]').attr('content');
let summary = $('meta[name=description]').attr('content');
summary = summary !== title ? summary : '';
- let description = fixArticleContent($('div#js_content.rich_media_content'));
+ let description = fixArticleContent($('#js_content'));
// No article get or article is too short, try the original url
const originalUrl = detectOriginalArticleUrl($);
@@ -171,6 +196,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
description += fixArticleContent(original$('#js_content'));
}
+ // Append a "阅读原文" (read the original) link that points at the detected source URL;
+ // without the anchor the detected URL would never reach the output.
+ const sourceUrl = detectSourceUrl($);
+ if (sourceUrl) {
+     description += `<a href="${sourceUrl}">阅读原文</a>`;
+ }
+
let pubDate;
const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html();
const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);