mirror of
https://github.com/DIYgod/RSSHub.git
synced 2025-12-04 11:07:54 +08:00
feat(core)(utils/wechat-mp): normalize URL (#9497)
Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
This commit is contained in:
@@ -8,13 +8,13 @@
|
|||||||
*
|
*
|
||||||
* If your new route is not in the above folders, please add it to the list.
|
* If your new route is not in the above folders, please add it to the list.
|
||||||
*
|
*
|
||||||
* If your route need to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
|
* If your route needs to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
|
||||||
* However, if your route need to determine some metadata by itself, you MAY use `fetchArticle`.
|
* However, if your route needs to determine some metadata by itself, you MAY use `fetchArticle`.
|
||||||
* If you find more metadata on the webpage, consider modifying `fetchArticle` to include them.
|
* If you find more metadata on the webpage, consider modifying `fetchArticle` to include them.
|
||||||
* NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision.
|
* NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision.
|
||||||
* NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you.
|
* NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you.
|
||||||
*
|
*
|
||||||
* If your route fetch MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
|
* If your route fetches MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
|
||||||
* If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them.
|
* If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them.
|
||||||
*
|
*
|
||||||
* For more details of these functions, please refer to the jsDoc in the source code.
|
* For more details of these functions, please refer to the jsDoc in the source code.
|
||||||
@@ -82,6 +82,52 @@ const fixArticleContent = (html, skipImg = false) => {
|
|||||||
return $.html();
|
return $.html();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Ref:
|
||||||
|
// https://soaked.in/2020/08/wechat-platform-url/
|
||||||
|
// Known params (permanent long link):
|
||||||
|
// __biz (essential), mid (essential), idx (essential), sn (essential), chksm, mpshare, scene, ascene, subscene, srcid,
|
||||||
|
// lang, sharer_sharetime, sharer_shareid, version, exportkey, pass_ticket, clicktime, enterid, devicetype, nettype,
|
||||||
|
// abtest_cookie, wx_header
|
||||||
|
// Known params (temporary link):
|
||||||
|
// src, timestamp, ver, signature, new (unessential)
|
||||||
|
/**
 * Normalize a WeChat MP article URL so that equivalent links map to the same string.
 *
 * The scheme is forced to `https:` and the hash is dropped. Then, depending on the path:
 * - `/s/<id>` (short link): the whole query string is removed.
 * - `/s` with `__biz`, `mid` (or `appmsgid`), `idx` (or `itemidx`) and `sn` (or `sign`)
 *   (permanent long link): only those four params are kept, under their canonical names.
 * - `/s` with `src`, `timestamp`, `ver` and `signature` (temporary link): only those
 *   four params are kept.
 * - anything else: the query string is left untouched.
 *
 * @param {string} url - The URL to normalize; must be parsable by the `URL` constructor.
 * @param {boolean} [bypassHostCheck=false] - If true, accept hosts other than `mp.weixin.qq.com`.
 * @return {string} The normalized URL.
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor.
 * @throws {Error} If the host is not `mp.weixin.qq.com` and `bypassHostCheck` is false.
 */
const normalizeUrl = (url, bypassHostCheck = false) => {
    const urlObj = new URL(url);
    if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
        throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + url);
    }
    urlObj.protocol = 'https:';
    urlObj.hash = ''; // remove hash

    const { pathname, searchParams } = urlObj;
    if (/^\/s\/.+/.test(pathname)) {
        // a short link, just remove all the params
        urlObj.search = '';
    } else if (pathname === '/s') {
        const biz = searchParams.get('__biz');
        const mid = searchParams.get('mid') || searchParams.get('appmsgid');
        const idx = searchParams.get('idx') || searchParams.get('itemidx');
        const sn = searchParams.get('sn') || searchParams.get('sign');
        if (biz && mid && idx && sn) {
            // a permanent long link, remove all unessential params
            // the values are written back verbatim (not via `URLSearchParams#toString()`)
            // so that e.g. the `==` padding of `__biz` is not percent-encoded
            urlObj.search = `?__biz=${biz}&mid=${mid}&idx=${idx}&sn=${sn}`;
        } else {
            const src = searchParams.get('src');
            const timestamp = searchParams.get('timestamp');
            const ver = searchParams.get('ver');
            const signature = searchParams.get('signature');
            if (src && timestamp && ver && signature) {
                // a temporary link, remove all unessential params
                urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
            }
            // otherwise: unknown link, leave the query string as it is
        }
    }
    // any other path: unknown kind of page, leave the query string as it is

    return urlObj.href;
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
|
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
|
||||||
*
|
*
|
||||||
@@ -92,11 +138,7 @@ const fixArticleContent = (html, skipImg = false) => {
|
|||||||
* @return {Promise<object>} - An object containing the article and its metadata.
|
* @return {Promise<object>} - An object containing the article and its metadata.
|
||||||
*/
|
*/
|
||||||
const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
|
const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
|
||||||
const oriUrl = url;
|
url = normalizeUrl(url, bypassHostCheck);
|
||||||
url = url.replace(/^http:\/\//, 'https://').replace(/#\w*$/, ''); // normalize url
|
|
||||||
if (!bypassHostCheck && !url.startsWith('https://mp.weixin.qq.com/')) {
|
|
||||||
throw new Error('wechat-mp: URL must start with https://mp.weixin.qq.com/ or http://mp.weixin.qq.com/, but got ' + oriUrl);
|
|
||||||
}
|
|
||||||
return await ctx.cache.tryGet(url, async () => {
|
return await ctx.cache.tryGet(url, async () => {
|
||||||
const response = await got(url);
|
const response = await got(url);
|
||||||
const $ = cheerio.load(response.data);
|
const $ = cheerio.load(response.data);
|
||||||
@@ -161,4 +203,7 @@ module.exports = {
|
|||||||
fixArticleContent,
|
fixArticleContent,
|
||||||
fetchArticle,
|
fetchArticle,
|
||||||
finishArticleItem, // a new route SHOULD use this function instead of manually calling the above functions
|
finishArticleItem, // a new route SHOULD use this function instead of manually calling the above functions
|
||||||
|
_internal: {
|
||||||
|
normalizeUrl, // for internal use only, exported for testing
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
process.env.REQUEST_TIMEOUT = '500';
|
process.env.REQUEST_TIMEOUT = '500';
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const wechatMp = require('../../lib/utils/wechat-mp');
|
const {
|
||||||
|
_internal: { normalizeUrl },
|
||||||
|
fetchArticle,
|
||||||
|
finishArticleItem,
|
||||||
|
fixArticleContent,
|
||||||
|
} = require('../../lib/utils/wechat-mp');
|
||||||
const nock = require('nock');
|
const nock = require('nock');
|
||||||
const ctx = require('../../lib/app').context;
|
const ctx = require('../../lib/app').context;
|
||||||
|
|
||||||
@@ -31,21 +36,47 @@ describe('wechat-mp', () => {
|
|||||||
'<script>const test = "test"</script>';
|
'<script>const test = "test"</script>';
|
||||||
const expectedHtmlSection = '<p>test</p>' + '<div><p>test</p></div>' + '<div><div>test</div></div>' + '<div><div><p>test</p></div></div>' + '<div><div><p>test</p></div></div>' + '<p>test</p>' + '<div><p>test</p></div>';
|
const expectedHtmlSection = '<p>test</p>' + '<div><p>test</p></div>' + '<div><div>test</div></div>' + '<div><div><p>test</p></div></div>' + '<div><div><p>test</p></div></div>' + '<p>test</p>' + '<div><p>test</p></div>';
|
||||||
let $ = cheerio.load(divHeader + htmlSection + divFooter);
|
let $ = cheerio.load(divHeader + htmlSection + divFooter);
|
||||||
expect(wechatMp.fixArticleContent(htmlSection)).toBe(expectedHtmlSection);
|
expect(fixArticleContent(htmlSection)).toBe(expectedHtmlSection);
|
||||||
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlSection);
|
expect(fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlSection);
|
||||||
|
|
||||||
const htmlImg = '<img alt="test" data-src="http://rsshub.test/test.jpg" src="http://rsshub.test/test.jpg">' + '<img alt="test" data-src="http://rsshub.test/test.jpg">' + '<img alt="test" src="http://rsshub.test/test.jpg">';
|
const htmlImg = '<img alt="test" data-src="http://rsshub.test/test.jpg" src="http://rsshub.test/test.jpg">' + '<img alt="test" data-src="http://rsshub.test/test.jpg">' + '<img alt="test" src="http://rsshub.test/test.jpg">';
|
||||||
const expectedHtmlImg = new Array(3 + 1).join('<img alt="test" src="http://rsshub.test/test.jpg">');
|
const expectedHtmlImg = new Array(3 + 1).join('<img alt="test" src="http://rsshub.test/test.jpg">');
|
||||||
$ = cheerio.load(divHeader + htmlImg + divFooter);
|
$ = cheerio.load(divHeader + htmlImg + divFooter);
|
||||||
expect(wechatMp.fixArticleContent(htmlImg)).toBe(expectedHtmlImg);
|
expect(fixArticleContent(htmlImg)).toBe(expectedHtmlImg);
|
||||||
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlImg);
|
expect(fixArticleContent($('div#js_content.rich_media_content'))).toBe(expectedHtmlImg);
|
||||||
expect(wechatMp.fixArticleContent(htmlImg, true)).toBe(htmlImg);
|
expect(fixArticleContent(htmlImg, true)).toBe(htmlImg);
|
||||||
expect(wechatMp.fixArticleContent($('div#js_content.rich_media_content'), true)).toBe(htmlImg);
|
expect(fixArticleContent($('div#js_content.rich_media_content'), true)).toBe(htmlImg);
|
||||||
|
|
||||||
expect(wechatMp.fixArticleContent('')).toBe('');
|
expect(fixArticleContent('')).toBe('');
|
||||||
expect(wechatMp.fixArticleContent(null)).toBe('');
|
expect(fixArticleContent(null)).toBe('');
|
||||||
expect(wechatMp.fixArticleContent(undefined)).toBe('');
|
expect(fixArticleContent(undefined)).toBe('');
|
||||||
expect(wechatMp.fixArticleContent($('div#something_not_in.the_document_tree'))).toBe('');
|
expect(fixArticleContent($('div#something_not_in.the_document_tree'))).toBe('');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('normalizeUrl', () => {
|
||||||
|
const mpRoot = 'https://mp.weixin.qq.com';
|
||||||
|
const mpArticleRoot = mpRoot + '/s';
|
||||||
|
|
||||||
|
const shortUrl = mpArticleRoot + '/-rwvHhqYbKGCVFeXRNknYQ';
|
||||||
|
const shortUrlWithQueryAndHash = shortUrl + '?foo=bar#baz';
|
||||||
|
expect(normalizeUrl(shortUrlWithQueryAndHash)).toBe(shortUrl);
|
||||||
|
|
||||||
|
const longUrlShortened = mpArticleRoot + '?__biz=MzA4MjQxNjQzMA==' + '&mid=2768628484' + '&idx=1' + '&sn=93dcc54ce807f7793739ee2fd2377056';
|
||||||
|
const longUrl = longUrlShortened + '&chksm=bf774d458800c453c94cae866093680e6cac6a1f02cab7e82683f82f35f7f487e2daa1dcde20' + '&scene=75' + '#wechat_redirect';
|
||||||
|
expect(normalizeUrl(longUrl)).toBe(longUrlShortened);
|
||||||
|
|
||||||
|
const temporaryUrlShortened =
|
||||||
|
mpArticleRoot + '?src=11' + '&timestamp=1620536401' + '&ver=3057' + '&signature=vCDI8FQcumnNGv4ScvFP-swQRlirdQSqTfjS8m-oFzgHMkqlNM3ljzjSevcjXLC-z-n0RzzMkNt-lwKMUaskfaqFFrpYZNq4ZCKkFFGj8L*KvH780aEUBJFvWTGmMGLC';
|
||||||
|
const temporaryUrl = temporaryUrlShortened + '&new=1#foo';
|
||||||
|
expect(normalizeUrl(temporaryUrl)).toBe(temporaryUrlShortened);
|
||||||
|
|
||||||
|
const somethingElse = mpRoot + '/something/else?__biz=foo&mid=bar&idx=baz&sn=qux';
|
||||||
|
const somethingElseWithHash = somethingElse + '#foo';
|
||||||
|
expect(normalizeUrl(somethingElseWithHash.replace('https://', 'http://'))).toBe(somethingElse);
|
||||||
|
|
||||||
|
const notWechatMp = 'https://im.not.wechat.mp/and/an/error/is/expected';
|
||||||
|
expect(() => normalizeUrl(notWechatMp)).toThrow();
|
||||||
|
expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('fetchArticle_&_finishArticleItem', async () => {
|
it('fetchArticle_&_finishArticleItem', async () => {
|
||||||
@@ -69,15 +100,6 @@ describe('wechat-mp', () => {
|
|||||||
const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle';
|
const httpsUrl = 'https://mp.weixin.qq.com/rsshub_test/wechatMp_fetchArticle';
|
||||||
const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://');
|
const httpUrl = httpsUrl.replace(/^https:\/\//, 'http://');
|
||||||
|
|
||||||
let _ret;
|
|
||||||
try {
|
|
||||||
_ret = await wechatMp.fetchArticle(ctx, 'https://im.not.wechat.mp/and/an/error/is/expected');
|
|
||||||
} catch (e) {
|
|
||||||
expect(e.name).toBe('Error');
|
|
||||||
}
|
|
||||||
|
|
||||||
expect(_ret).toBeUndefined();
|
|
||||||
|
|
||||||
const expectedItem = {
|
const expectedItem = {
|
||||||
title: 'title',
|
title: 'title',
|
||||||
summary: 'summary',
|
summary: 'summary',
|
||||||
@@ -88,15 +110,15 @@ describe('wechat-mp', () => {
|
|||||||
};
|
};
|
||||||
const expectedDate = new Date(ct * 1000);
|
const expectedDate = new Date(ct * 1000);
|
||||||
|
|
||||||
const fetchArticleItem = await wechatMp.fetchArticle(ctx, httpUrl);
|
const fetchArticleItem = await fetchArticle(ctx, httpUrl);
|
||||||
expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
|
expect(compareDate(fetchArticleItem.pubDate, expectedDate)).toBe(true);
|
||||||
delete fetchArticleItem.pubDate;
|
delete fetchArticleItem.pubDate;
|
||||||
expect(fetchArticleItem).toEqual(expectedItem);
|
expect(fetchArticleItem).toEqual(expectedItem);
|
||||||
|
|
||||||
delete expectedItem.mpName;
|
delete expectedItem.mpName;
|
||||||
const finishArticleItem = await wechatMp.finishArticleItem(ctx, { link: httpUrl });
|
const finishedArticleItem = await finishArticleItem(ctx, { link: httpUrl });
|
||||||
expect(compareDate(finishArticleItem.pubDate, expectedDate)).toBe(true);
|
expect(compareDate(finishedArticleItem.pubDate, expectedDate)).toBe(true);
|
||||||
delete finishArticleItem.pubDate;
|
delete finishedArticleItem.pubDate;
|
||||||
expect(finishArticleItem).toEqual(expectedItem);
|
expect(finishedArticleItem).toEqual(expectedItem);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user