diff --git a/lib/utils/wechat-mp.js b/lib/utils/wechat-mp.js index e24b64db07..ba948c7aad 100644 --- a/lib/utils/wechat-mp.js +++ b/lib/utils/wechat-mp.js @@ -38,16 +38,29 @@ const replaceTag = ($, oldTag, newTagName) => { const detectOriginalArticleUrl = ($) => { // No article content get, try the original url + // example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw if (!$('#js_content').text()) { return $('#js_share_source').attr('data-url'); } // Article content is too short, try the first link + // example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg if ($('#js_content').text().length < 80) { return $('#js_content a').attr('href'); } return null; }; +const detectSourceUrl = ($) => { + const matchs = $.root() + .html() + .match(/msg_source_url = '(.+)';/); + + if (matchs) { + return matchs[1]; + } + return null; +}; + /** * Articles from WeChat MP have weird formats, this function is used to fix them. * @@ -90,6 +103,18 @@ const fixArticleContent = (html, skipImg = false) => { replaceTag($, section, 'div'); } }); + + // fix single picture article + // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ + $('script').each((_, script) => { + script = $(script); + const matchs = script.html().match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/); + + if (matchs) { + script.replaceWith(matchs[1].replace(/\r/g, '').replace(/\n/g, '<br>').replace(/\\x0d/g, '').replace(/\\x0a/g, '<br>')); + } + }); + // clean scripts $('script').remove(); return $.html(); @@ -156,11 +181,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => { const response = await got(url); const $ = cheerio.load(response.data); - const title = $('meta[property="og:title"]').attr('content'); + const title = $('meta[property="og:title"]').attr('content').replace(/\\r/g, '').replace(/\\n/g, ' '); const author = $('meta[name=author]').attr('content'); let summary = $('meta[name=description]').attr('content'); summary = summary !== title ? summary : ''; - let description = fixArticleContent($('div#js_content.rich_media_content')); + let description = fixArticleContent($('#js_content')); // No article get or article is too short, try the original url const originalUrl = detectOriginalArticleUrl($); @@ -171,6 +196,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => { description += fixArticleContent(original$('#js_content')); } + const sourceUrl = detectSourceUrl($); + if (sourceUrl) { + description += `<a href="${sourceUrl}">阅读原文</a>`; + } + let pubDate; const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html(); const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);