fix(utils): 支持微信公众号单图片文章抓取; 增加输出阅读原文链接 (#9557)

* fix(utils): 支持微信公众号单图片文章抓取

* fix(utils): 支持输出微信公众号转载文章阅读原文链接
This commit is contained in:
任平生
2022-04-19 01:29:45 +08:00
committed by GitHub
parent 864016c598
commit f3e069d399

View File

@@ -38,16 +38,29 @@ const replaceTag = ($, oldTag, newTagName) => {
/**
 * Look for a fallback URL to fetch when the page carries little or no
 * article text inside `#js_content`.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page
 * @returns {string|null|undefined} the fallback URL; `null` when the content
 *     is at least 80 characters long; `undefined` when the attribute that
 *     would hold the fallback URL is absent
 */
const detectOriginalArticleUrl = ($) => {
    const bodyText = $('#js_content').text();

    // Enough text is present: no fallback is needed.
    if (bodyText && bodyText.length >= 80) {
        return null;
    }

    // Short content -> use the first link inside the content
    //   example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg
    // Empty content -> use the URL stored on the share-source element
    //   example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw
    return bodyText ? $('#js_content a').attr('href') : $('#js_share_source').attr('data-url');
};
/**
 * Extract the value assigned to `msg_source_url` in the page's inline
 * script, i.e. the text between the single quotes of
 * `msg_source_url = '...';`.
 *
 * @param {object} $ - cheerio-style document exposing `root().html()`
 * @returns {string|null} the captured URL, or `null` when the assignment is
 *     absent or its value is empty
 */
const detectSourceUrl = ($) => {
    // html() may yield null for an empty document; fall back to an empty string.
    const pageHtml = $.root().html() || '';
    // Capture only up to the closing quote. A greedy `.+` would run on to the
    // last `';` on the same line when several statements share that line.
    const match = pageHtml.match(/msg_source_url = '([^']+)';/);
    return match ? match[1] : null;
};
/**
* Articles from WeChat MP have weird formats, this function is used to fix them.
*
@@ -90,6 +103,18 @@ const fixArticleContent = (html, skipImg = false) => {
replaceTag($, section, 'div');
}
});
// fix single picture article
// example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
$('script').each((_, script) => {
script = $(script);
const matchs = script.html().match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
if (matchs) {
script.replaceWith(matchs[1].replace(/\r/g, '').replace(/\n/g, '<br>').replace(/\\x0d/g, '').replace(/\\x0a/g, '<br>'));
}
});
// clean scripts
$('script').remove();
return $.html();
@@ -156,11 +181,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
const response = await got(url);
const $ = cheerio.load(response.data);
const title = $('meta[property="og:title"]').attr('content');
const title = $('meta[property="og:title"]').attr('content').replace(/\\r/g, '').replace(/\\n/g, ' ');
const author = $('meta[name=author]').attr('content');
let summary = $('meta[name=description]').attr('content');
summary = summary !== title ? summary : '';
let description = fixArticleContent($('div#js_content.rich_media_content'));
let description = fixArticleContent($('#js_content'));
// No article content was found, or the article is too short; try the original URL
const originalUrl = detectOriginalArticleUrl($);
@@ -171,6 +196,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
description += fixArticleContent(original$('#js_content'));
}
const sourceUrl = detectSourceUrl($);
if (sourceUrl) {
description += `<a href="${sourceUrl}">阅读原文</a>`;
}
let pubDate;
const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html();
const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);