mirror of
https://github.com/DIYgod/RSSHub.git
synced 2025-12-04 11:07:54 +08:00
fix(utils): 支持微信公众号单图片文章抓取; 增加输出阅读原文链接 (#9557)
* fix(utils): 支持微信公众号单图片文章抓取 * fix(utils): 支持输出微信公众号转载文章阅读原文链接
This commit is contained in:
@@ -38,16 +38,29 @@ const replaceTag = ($, oldTag, newTagName) => {
|
||||
|
||||
/**
 * Pick a fallback URL to fetch when the loaded page carries little or no article body.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @returns {string|null} The `data-url` of `#js_share_source` when `#js_content` has no text,
 *     the `href` of the first link inside `#js_content` when its text is shorter than
 *     80 characters, otherwise `null`. A missing attribute also yields `null`.
 */
const detectOriginalArticleUrl = ($) => {
    // Read the text once; both checks below work on the same value.
    const contentText = $('#js_content').text();

    // No article content at all, try the original url
    // example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw
    if (!contentText) {
        // Normalize a missing attribute to null so every "nothing found" path returns the same value.
        return $('#js_share_source').attr('data-url') || null;
    }

    // Article content is too short, try the first link
    // example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg
    if (contentText.length < 80) {
        return $('#js_content a').attr('href') || null;
    }

    return null;
};
|
||||
|
||||
/**
 * Extract the value assigned to `msg_source_url` in the page's inline script.
 *
 * The whole serialized document is scanned for a `msg_source_url = '...';` assignment.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @returns {string|null} The non-empty quoted value, or `null` when the assignment is
 *     absent, empty, or the document serializes to nothing.
 */
const detectSourceUrl = ($) => {
    // `.html()` may yield null for an empty document; fall back to an empty string.
    const pageHtml = $.root().html() || '';

    // `[^']+` stops at the closing quote, so other statements on the same line
    // (e.g. minified scripts) are never swallowed into the captured URL.
    const matched = pageHtml.match(/msg_source_url = '([^']+)';/);

    return matched ? matched[1] : null;
};
|
||||
|
||||
/**
|
||||
* Articles from WeChat MP have weird formats, this function is used to fix them.
|
||||
*
|
||||
@@ -90,6 +103,18 @@ const fixArticleContent = (html, skipImg = false) => {
|
||||
replaceTag($, section, 'div');
|
||||
}
|
||||
});
|
||||
|
||||
// fix single picture article
|
||||
// example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
|
||||
$('script').each((_, script) => {
|
||||
script = $(script);
|
||||
const matchs = script.html().match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
|
||||
|
||||
if (matchs) {
|
||||
script.replaceWith(matchs[1].replace(/\r/g, '').replace(/\n/g, '<br>').replace(/\\x0d/g, '').replace(/\\x0a/g, '<br>'));
|
||||
}
|
||||
});
|
||||
|
||||
// clean scripts
|
||||
$('script').remove();
|
||||
return $.html();
|
||||
@@ -156,11 +181,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
|
||||
const response = await got(url);
|
||||
const $ = cheerio.load(response.data);
|
||||
|
||||
const title = $('meta[property="og:title"]').attr('content');
|
||||
const title = $('meta[property="og:title"]').attr('content').replace(/\\r/g, '').replace(/\\n/g, ' ');
|
||||
const author = $('meta[name=author]').attr('content');
|
||||
let summary = $('meta[name=description]').attr('content');
|
||||
summary = summary !== title ? summary : '';
|
||||
let description = fixArticleContent($('div#js_content.rich_media_content'));
|
||||
let description = fixArticleContent($('#js_content'));
|
||||
|
||||
// No article get or article is too short, try the original url
|
||||
const originalUrl = detectOriginalArticleUrl($);
|
||||
@@ -171,6 +196,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
|
||||
description += fixArticleContent(original$('#js_content'));
|
||||
}
|
||||
|
||||
const sourceUrl = detectSourceUrl($);
|
||||
if (sourceUrl) {
|
||||
description += `<a href="${sourceUrl}">阅读原文</a>`;
|
||||
}
|
||||
|
||||
let pubDate;
|
||||
const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html();
|
||||
const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);
|
||||
|
||||
Reference in New Issue
Block a user