mirror of
https://github.com/DIYgod/RSSHub.git
synced 2025-12-04 02:58:08 +08:00
feat(core)(utils/wechat-mp): normalize URL (#9497)
Signed-off-by: Rongrong <15956627+Rongronggg9@users.noreply.github.com>
This commit is contained in:
@@ -8,13 +8,13 @@
|
||||
*
|
||||
* If your new route is not in the above folders, please add it to the list.
|
||||
*
|
||||
* If your route need to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
|
||||
* If your route needs to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
|
||||
* However, if your route needs to determine some metadata by itself, you MAY use `fetchArticle`.
|
||||
* If you find more metadata on the webpage, consider modifying `fetchArticle` to include them.
|
||||
* NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision.
|
||||
* NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you.
|
||||
*
|
||||
* If your route fetch MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
|
||||
* If your route fetches MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
|
||||
* If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them.
|
||||
*
|
||||
* For more details of these functions, please refer to the jsDoc in the source code.
|
||||
@@ -82,6 +82,52 @@ const fixArticleContent = (html, skipImg = false) => {
|
||||
return $.html();
|
||||
};
|
||||
|
||||
// Ref:
|
||||
// https://soaked.in/2020/08/wechat-platform-url/
|
||||
// Known params (permanent long link):
|
||||
// __biz (essential), mid (essential), idx (essential), sn (essential), chksm, mpshare, scene, ascene, subscene, srcid,
|
||||
// lang, sharer_sharetime, sharer_shareid, version, exportkey, pass_ticket, clicktime, enterid, devicetype, nettype,
|
||||
// abtest_cookie, wx_header
|
||||
// Known params (temporary link):
|
||||
// src, timestamp, ver, signature, new (unessential)
|
||||
/**
 * Normalize a WeChat MP article URL so that equivalent links map to the same string.
 *
 * The protocol is forced to `https:` and the hash is removed. Then, depending on the shape of the URL:
 * - `/s/<id>` (short link): every query parameter is dropped;
 * - `/s` with `__biz`, `mid` (or `appmsgid`), `idx` (or `itemidx`) and `sn` (or `sign`)
 *   (permanent long link): only those four parameters are kept, under their canonical names;
 * - `/s` with `src`, `timestamp`, `ver` and `signature` (temporary link): only those four are kept;
 * - anything else: the query string is left untouched.
 *
 * @param {string} url - The URL to normalize; must be parseable by `new URL()`.
 * @param {boolean} [bypassHostCheck=false] - When true, hosts other than `mp.weixin.qq.com` are accepted.
 * @return {string} The normalized URL (`href` of the rewritten URL object).
 * @throws {Error} If the host is not `mp.weixin.qq.com` and `bypassHostCheck` is false.
 * @throws {TypeError} If `url` cannot be parsed by `new URL()`.
 */
const normalizeUrl = (url, bypassHostCheck = false) => {
    const urlObj = new URL(url);
    if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
        throw new Error(`wechat-mp: URL host must be "mp.weixin.qq.com", but got ${url}`);
    }
    urlObj.protocol = 'https:';
    urlObj.hash = ''; // remove hash

    const { pathname, searchParams } = urlObj;

    if (/^\/s\/.+/.test(pathname)) {
        // a short link, just remove all the params
        urlObj.search = '';
        return urlObj.href;
    }

    if (pathname !== '/s') {
        // not a recognized article path, leave the query string as is
        return urlObj.href;
    }

    const biz = searchParams.get('__biz');
    const mid = searchParams.get('mid') || searchParams.get('appmsgid');
    const idx = searchParams.get('idx') || searchParams.get('itemidx');
    const sn = searchParams.get('sn') || searchParams.get('sign');
    if (biz && mid && idx && sn) {
        // a permanent long link, remove all unessential params
        // the values are interpolated as-is on purpose: nothing needs escaping here, and
        // `new URLSearchParams({...}).toString()` would percent-encode the `=` padding of `__biz`
        urlObj.search = `?__biz=${biz}&mid=${mid}&idx=${idx}&sn=${sn}`;
        return urlObj.href;
    }

    const src = searchParams.get('src');
    const timestamp = searchParams.get('timestamp');
    const ver = searchParams.get('ver');
    const signature = searchParams.get('signature');
    if (src && timestamp && ver && signature) {
        // a temporary link, remove all unessential params
        urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
    }
    // otherwise it is an unknown kind of link: keep its query string untouched

    return urlObj.href;
};
|
||||
|
||||
/**
|
||||
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
|
||||
*
|
||||
@@ -92,11 +138,7 @@ const fixArticleContent = (html, skipImg = false) => {
|
||||
* @return {Promise<object>} - An object containing the article and its metadata.
|
||||
*/
|
||||
const fetchArticle = async (ctx, url, bypassHostCheck = false) => {
|
||||
const oriUrl = url;
|
||||
url = url.replace(/^http:\/\//, 'https://').replace(/#\w*$/, ''); // normalize url
|
||||
if (!bypassHostCheck && !url.startsWith('https://mp.weixin.qq.com/')) {
|
||||
throw new Error('wechat-mp: URL must start with https://mp.weixin.qq.com/ or http://mp.weixin.qq.com/, but got ' + oriUrl);
|
||||
}
|
||||
url = normalizeUrl(url, bypassHostCheck);
|
||||
return await ctx.cache.tryGet(url, async () => {
|
||||
const response = await got(url);
|
||||
const $ = cheerio.load(response.data);
|
||||
@@ -161,4 +203,7 @@ module.exports = {
|
||||
fixArticleContent,
|
||||
fetchArticle,
|
||||
finishArticleItem, // a new route SHOULD use this function instead of manually calling the above functions
|
||||
_internal: {
|
||||
normalizeUrl, // for internal use only, exported for testing
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user