fix(utils): 支持微信公众号单图片文章抓取; 增加输出阅读原文链接 (#9557)

* fix(utils): 支持微信公众号单图片文章抓取
* fix(utils): 支持输出微信公众号转载文章阅读原文链接
2025-12-04 11:07:54 +08:00 · 2022-04-19 01:29:45 +08:00
parent 864016c598
commit f3e069d399
1 changed file with 32 additions and 2 deletions
--- a/lib/utils/wechat-mp.js
+++ b/lib/utils/wechat-mp.js
@@ -38,16 +38,29 @@ const replaceTag = ($, oldTag, newTagName) => {

 const detectOriginalArticleUrl = ($) => {
    // No article content get, try the original url
+    // example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw (case handled below: #js_content has no text, so the data-url attribute of #js_share_source is returned)
    if (!$('#js_content').text()) {
        return $('#js_share_source').attr('data-url');
    }
    // Article content is too short, try the first link
+    // example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg (case handled below: #js_content text is shorter than 80 characters, so the href of the first link inside it is returned)
    if ($('#js_content').text().length < 80) {
        return $('#js_content a').attr('href');
    }
    return null;
 };

+const detectSourceUrl = ($) => { // Returns the value assigned to msg_source_url in the page markup, or null when no such assignment is found.
+    const matchs = $.root() // $ is the loaded document handle passed in by the caller
+        .html() // search the serialised markup of the whole document; presumably the assignment sits in an inline script (confirm against a real page)
+        .match(/msg_source_url = '(.+)';/);
+
+    if (matchs) { // match() yields null when the pattern is absent
+        return matchs[1]; // capture group 1: the text between the single quotes. NOTE(review): (.+) is greedy, so on a line holding several quoted assignments it extends to the last quote-semicolon pair
+    }
+    return null;
+};
+
 /**
 * Articles from WeChat MP have weird formats, this function is used to fix them.
 *
@@ -90,6 +103,18 @@ const fixArticleContent = (html, skipImg = false) => {
            replaceTag($, section, 'div');
        }
    });
+
+    // fix single picture article
+    // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
+    $('script').each((_, script) => {
+        script = $(script);
+        const matchs = script.html().match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
+
+        if (matchs) {
+            script.replaceWith(matchs[1].replace(/\r/g, '').replace(/\n/g, '<br>').replace(/\\x0d/g, '').replace(/\\x0a/g, '<br>'));
+        }
+    });
+
    // clean scripts
    $('script').remove();
    return $.html();
@@ -156,11 +181,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
        const response = await got(url);
        const $ = cheerio.load(response.data);

-        const title = $('meta[property="og:title"]').attr('content');
+        const title = $('meta[property="og:title"]').attr('content').replace(/\\r/g, '').replace(/\\n/g, ' ');
        const author = $('meta[name=author]').attr('content');
        let summary = $('meta[name=description]').attr('content');
        summary = summary !== title ? summary : '';
-        let description = fixArticleContent($('div#js_content.rich_media_content'));
+        let description = fixArticleContent($('#js_content'));

        // No article get or article is too short, try the original url
        const originalUrl = detectOriginalArticleUrl($);
@@ -171,6 +196,11 @@ const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
            description += fixArticleContent(original$('#js_content'));
        }

+        const sourceUrl = detectSourceUrl($);
+        if (sourceUrl) {
+            description += `<a href="${sourceUrl}">阅读原文</a>`;
+        }
+
        let pubDate;
        const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').first().html();
        const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);