diff --git a/lib/utils/wechat-mp.js b/lib/utils/wechat-mp.js index 0b467c449d..8e32780198 100644 --- a/lib/utils/wechat-mp.js +++ b/lib/utils/wechat-mp.js @@ -8,13 +8,13 @@ * * If your new route is not in the above folders, please add it to the list. * - * If your route need to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`. + * If your route needs to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`. * However, if your route need to determine some metadata by itself, you MAY use `fetchArticle`. * If you find more metadata on the webpage, consider modifying `fetchArticle` to include them. * NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision. * NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you. * - * If your route fetch MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format. + * If your route fetches MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format. * If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them. * * For more details of these functions, please refer to the jsDoc in the source code. 
@@ -82,6 +82,52 @@ const fixArticleContent = (html, skipImg = false) => { return $.html(); }; +// Ref: +// https://soaked.in/2020/08/wechat-platform-url/ +// Known params (permanent long link): +// __biz (essential), mid (essential), idx (essential), sn (essential), chksm, mpshare, scene, ascene, subscene, srcid, +// lang, sharer_sharetime, sharer_shareid, version, exportkey, pass_ticket, clicktime, enterid, devicetype, nettype, +// abtest_cookie, wx_header +// Known params (temporary link): +// src, timestamp, ver, signature, new (unessential) +const normalizeUrl = (url, bypassHostCheck = false) => { + const oriUrl = url; + const urlObj = new URL(url); + if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') { + throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl); + } + urlObj.protocol = 'https:'; + urlObj.hash = ''; // remove hash + if (urlObj.pathname.match(/^\/s\/.+/)) { + // a short link, just remove all the params + urlObj.search = ''; + } else if (urlObj.pathname.match(/^\/s$/)) { + const biz = urlObj.searchParams.get('__biz'); + const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid'); + const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx'); + const sn = urlObj.searchParams.get('sn') || urlObj.searchParams.get('sign'); + if (biz && mid && idx && sn) { + // a permanent long link, remove all unessential params + // no need to escape anything so no need to use `new URLSearchParams({...}).toString()` + urlObj.search = `?__biz=${biz}&mid=${mid}&idx=${idx}&sn=${sn}`; + } else { + const src = urlObj.searchParams.get('src'); + const timestamp = urlObj.searchParams.get('timestamp'); + const ver = urlObj.searchParams.get('ver'); + const signature = urlObj.searchParams.get('signature'); + if (src && timestamp && ver && signature) { + // a temporary link, remove all unessential params + urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`; + } else { + 
// unknown link, just let it go + } + } + } else { + // IDK what it is, just let it go + } + return urlObj.href; +}; + /** * Fetch article and its metadata from WeChat MP (mp.weixin.qq.com). * @@ -92,11 +138,7 @@ const fixArticleContent = (html, skipImg = false) => { * @return {Promise} - An object containing the article and its metadata. */ const fetchArticle = async (ctx, url, bypassHostCheck = false) => { - const oriUrl = url; - url = url.replace(/^http:\/\//, 'https://').replace(/#\w*$/, ''); // normalize url - if (!bypassHostCheck && !url.startsWith('https://mp.weixin.qq.com/')) { - throw new Error('wechat-mp: URL must start with https://mp.weixin.qq.com/ or http://mp.weixin.qq.com/, but got ' + oriUrl); - } + url = normalizeUrl(url, bypassHostCheck); return await ctx.cache.tryGet(url, async () => { const response = await got(url); const $ = cheerio.load(response.data); @@ -161,4 +203,7 @@ module.exports = { fixArticleContent, fetchArticle, finishArticleItem, // a new route SHOULD use this function instead of manually calling the above functions + _internal: { + normalizeUrl, // for internal use only, exported for testing + }, }; diff --git a/test/utils/wechat-mp.js b/test/utils/wechat-mp.js index d9bbe80659..2be41f35e1 100644 --- a/test/utils/wechat-mp.js +++ b/test/utils/wechat-mp.js @@ -1,6 +1,11 @@ process.env.REQUEST_TIMEOUT = '500'; const cheerio = require('cheerio'); -const wechatMp = require('../../lib/utils/wechat-mp'); +const { + _internal: { normalizeUrl }, + fetchArticle, + finishArticleItem, + fixArticleContent, +} = require('../../lib/utils/wechat-mp'); const nock = require('nock'); const ctx = require('../../lib/app').context; @@ -31,21 +36,47 @@ describe('wechat-mp', () => { ''; const expectedHtmlSection = '

test

' + '

test

' + '
test
' + '

test

' + '

test

' + '

test

' + '

test

'; let $ = cheerio.load(divHeader + htmlSection + divFooter); - expect(wechatMp.fixArticleContent(htmlSection)).toBe(expectedHtmlSection); - expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlSection); + expect(fixArticleContent(htmlSection)).toBe(expectedHtmlSection); + expect(fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlSection); const htmlImg = 'test' + 'test' + 'test'; const expectedHtmlImg = new Array(3 + 1).join('test'); $ = cheerio.load(divHeader + htmlImg + divFooter); - expect(wechatMp.fixArticleContent(htmlImg)).toBe(expectedHtmlImg); - expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlImg); - expect(wechatMp.fixArticleContent(htmlImg, true)).toBe(htmlImg); - expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'), true)).toBe(htmlImg); + expect(fixArticleContent(htmlImg)).toBe(expectedHtmlImg); + expect(fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlImg); + expect(fixArticleContent(htmlImg, true)).toBe(htmlImg); + expect(fixArticleContent($('div#js_content.rich_media_content'), true)).toBe(htmlImg); - expect(wechatMp.fixArticleContent('')).toBe(''); - expect(wechatMp.fixArticleContent(null)).toBe(''); - expect(wechatMp.fixArticleContent(undefined)).toBe(''); - expect(wechatMp.fixArticleContent($('div#something_not_in.the_document_tree'))).toBe(''); + expect(fixArticleContent('')).toBe(''); + expect(fixArticleContent(null)).toBe(''); + expect(fixArticleContent(undefined)).toBe(''); + expect(fixArticleContent($('div#something_not_in.the_document_tree'))).toBe(''); + }); + + it('normalizeUrl', () => { + const mpRoot = 'https://mp.weixin.qq.com'; + const mpArticleRoot = mpRoot + '/s'; + + const shortUrl = mpArticleRoot + '/-rwvHhqYbKGCVFeXRNknYQ'; + const shortUrlWithQueryAndHash = shortUrl + '?foo=bar#baz'; + expect(normalizeUrl(shortUrlWithQueryAndHash)).toBe(shortUrl); + + const longUrlShortened = 
mpArticleRoot + '?__biz=MzA4MjQxNjQzMA==' + '&mid=2768628484' + '&idx=1' + '&sn=93dcc54ce807f7793739ee2fd2377056'; + const longUrl = longUrlShortened + '&chksm=bf774d458800c453c94cae866093680e6cac6a1f02cab7e82683f82f35f7f487e2daa1dcde20' + '&scene=75' + '#wechat_redirect'; + expect(normalizeUrl(longUrl)).toBe(longUrlShortened); + + const temporaryUrlShortened = + mpArticleRoot + '?src=11' + '&timestamp=1620536401' + '&ver=3057' + '&signature=vCDI8FQcumnNGv4ScvFP-swQRlirdQSqTfjS8m-oFzgHMkqlNM3ljzjSevcjXLC-z-n0RzzMkNt-lwKMUaskfaqFFrpYZNq4ZCKkFFGj8L*KvH780aEUBJFvWTGmMGLC'; + const temporaryUrl = temporaryUrlShortened + '&new=1#foo'; + expect(normalizeUrl(temporaryUrl)).toBe(temporaryUrlShortened); + + const somethingElse = mpRoot + '/something/else?__biz=foo&mid=bar&idx=baz&sn=qux'; + const somethingElseWithHash = somethingElse + '#foo'; + expect(normalizeUrl(somethingElseWithHash.replace('https://', 'http://'))).toBe(somethingElse); + + const notWechatMp = 'https://im.not.wechat.mp/and/an/error/is/expected'; + expect(() => normalizeUrl(notWechatMp)).toThrow(); + expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp); }); it('fetchArticle_&_finishArticleItem', async () => { @@ -69,15 +100,6 @@ describe('wechat-mp', () => { const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle'; const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://'); - let _ret; - try { - _ret = await wechatMp.fetchArticle(ctx, 'https://im.not.wechat.mp/and/an/error/is/expected'); - } catch (e) { - expect(e.name).toBe('Error'); - } - - expect(_ret).toBeUndefined(); - const expectedItem = { title: 'title', summary: 'summary', @@ -88,15 +110,15 @@ describe('wechat-mp', () => { }; const expectedDate = new Date(ct * 1000); - const fetchArticleItem = await wechatMp.fetchArticle(ctx, httpUrl); + const fetchArticleItem = await fetchArticle(ctx, httpUrl); expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true); delete fetchArticleItem.pubDate; 
expect(fetchArticleItem).toEqual(expectedItem); delete expectedItem.mpName; - const finishArticleItem = await wechatMp.finishArticleItem(ctx, { link: httpUrl }); - expect(compareDate(finishArticleItem.pubDate, expectedDate)).toBe(true); - delete finishArticleItem.pubDate; - expect(finishArticleItem).toEqual(expectedItem); + const finishedArticleItem = await finishArticleItem(ctx, { link: httpUrl }); + expect(compareDate(finishedArticleItem.pubDate, expectedDate)).toBe(true); + delete finishedArticleItem.pubDate; + expect(finishedArticleItem).toEqual(expectedItem); }); });