mirror of
https://github.com/DIYgod/RSSHub.git
synced 2025-12-04 02:58:08 +08:00
277 lines
11 KiB
TypeScript
277 lines
11 KiB
TypeScript
/**
|
|
* Author: @Rongronggg9
|
|
*
|
|
* There are several folders that are relevant to WeChat MP (Official Account Platform / Media Platform):
|
|
* lib/routes/wechat
|
|
* lib/routes/gov/npma
|
|
* lib/routes/gzh360
|
|
* lib/routes/pku/nsd/gd
|
|
* lib/routes/sdu/cs
|
|
* lib/routes/nua/utils
|
|
* lib/routes/hrbeu
|
|
* lib/routes/freewechat
|
|
*
|
|
* If your new route is not in the above folders, please add it to the list.
|
|
*
|
|
* If your route needs to fetch MP articles from mp.weixin.qq.com, you SHOULD use `finishArticleItem`.
|
|
* However, if your route needs to determine some metadata by itself, you MAY use `fetchArticle`.
|
|
* If you find more metadata on the webpage, consider modifying `fetchArticle` to include them.
|
|
* NEVER fetch MP articles from mp.weixin.qq.com in your route in order to avoid cache key collision.
|
|
* NO NEED TO use cache if you are using `finishArticleItem` or `fetchArticle`, they will handle cache for you.
|
|
*
|
|
* If your route fetches MP articles from other websites, you SHOULD use `fixArticleContent` to fix the content format.
|
|
* If you find more fixes that should be applied, consider modifying `fixArticleContent` to include them.
|
|
*
|
|
* For more details of these functions, please refer to the jsDoc in the source code.
|
|
*/
|
|
|
|
import got from '@/utils/got';
|
|
import { load, type Cheerio, type Element } from 'cheerio';
|
|
import { parseDate } from '@/utils/parse-date';
|
|
import cache from '@/utils/cache';
|
|
|
|
/**
 * Replace an element with a freshly created element of another tag name,
 * carrying over every attribute and all child nodes of the original.
 *
 * @param $ - The cheerio instance the element belongs to.
 * @param oldTag - The element (or anything `$()` accepts) to be replaced.
 * @param newTagName - Tag name of the replacement element, e.g. `p` or `div`.
 */
const replaceTag = ($, oldTag, newTagName) => {
    const $source = $(oldTag);
    const created = $(`<${newTagName} />`);
    const $replacement = $(created);

    // copy the attributes one by one onto the new element
    const attributes = $source.attr();
    for (const name in attributes) {
        $replacement.attr(name, attributes[name]);
    }

    // move the children over, then swap the elements in the document
    $replacement.append($source.contents());
    $source.replaceWith($replacement);
};
|
|
|
|
/**
 * Look for a link to the "original" article when the page itself carries
 * little or no content.
 *
 * @param $ - The cheerio instance of the fetched article page.
 * @returns The candidate url (may be `undefined` when the attribute is absent),
 *          or `null` when the page has enough content of its own.
 */
const detectOriginalArticleUrl = ($) => {
    const contentText = $('#js_content').text();

    // The page has no article content at all: fall back to the share source.
    // example: https://mp.weixin.qq.com/s/f6sKObaZZhADTYU2Jl5Bnw
    if (!contentText) {
        return $('#js_share_source').attr('data-url');
    }

    // The content is suspiciously short (< 80 chars): follow its first link.
    // example: https://mp.weixin.qq.com/s/9saVB4KaolRyJfpajzeFRg
    return contentText.length < 80 ? $('#js_content a').attr('href') : null;
};
|
|
|
|
/**
 * Extract the value assigned to `msg_source_url` in the page's inline script.
 *
 * @param $ - The cheerio instance of the fetched article page.
 * @returns The captured url, or `null` when the page does not contain the assignment.
 */
const detectSourceUrl = ($) => {
    const pageHtml = $.root().html();
    const found = pageHtml.match(/msg_source_url = '(.+)';/);
    return found ? found[1] : null;
};
|
|
|
|
/**
 * Clean up the unusual markup found in WeChat MP articles.
 *
 * You SHOULD call this function even when the content was not fetched from WeChat MP directly.
 * Calling it is safe in most situations.
 *
 * Example usage: item.description = fixArticleContent($('div#js_content.rich_media_content'));
 * @param html - The html to be fixed, either a string or a cheerio object.
 * @param skipImg - Whether to leave lazy-loaded images (`data-src`) untouched.
 * @returns The fixed html as a string; an empty string when there is nothing to fix.
 */
const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) => {
    let rawHtml = '';
    if (typeof html === 'string') {
        rawHtml = html;
    } else if (html?.html) {
        rawHtml = html.html() || '';
    }
    if (!rawHtml) {
        return '';
    }

    const $ = load(rawHtml, undefined, false);

    if (!skipImg) {
        // lazy-loaded images keep their real address in `data-src`; promote it to `src`
        for (const img of $('img[data-src]').toArray()) {
            const $img = $(img);
            const realSrc = $img.attr('data-src');
            if (realSrc) {
                $img.attr('src', realSrc);
                $img.removeAttr('data-src');
            }
        }
    }

    // turn every <section> into a <p> (no p/div/section inside) or a <div> (otherwise)
    for (const section of $('section').toArray()) {
        const hasBlockChildren = $(section).find('p').length + $(section).find('div').length + $(section).find('section').length !== 0;
        replaceTag($, section, hasBlockChildren ? 'div' : 'p');
    }

    // put a line break after each <code> element
    for (const code of $('code').toArray()) {
        $('<br>').insertAfter(code);
    }

    // drop the line index tags of code snippets
    $('.code-snippet__line-index').remove();

    // single picture articles keep their text inside a script; inline that text
    // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
    const imageDescPattern = /document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/;
    for (const script of $('script').toArray()) {
        const $script = $(script);
        const captured = $script.html()?.match(imageDescPattern);
        if (!captured) {
            continue;
        }
        const imageDesc = captured[1].replaceAll('\r', '').replaceAll('\n', '<br>').replaceAll('\\x0d', '').replaceAll('\\x0a', '<br>');
        $script.replaceWith(imageDesc);
    }

    // whatever scripts remain are of no use in a feed
    $('script').remove();
    return $.html();
};
|
|
|
|
// Ref:
// https://soaked.in/2020/08/wechat-platform-url/
// Known params (permanent long link):
// __biz (essential), mid (essential), idx (essential), sn (essential), chksm, mpshare, scene, ascene, subscene, srcid,
// lang, sharer_sharetime, sharer_shareid, version, exportkey, pass_ticket, clicktime, enterid, devicetype, nettype,
// abtest_cookie, wx_header
// Known params (temporary link):
// src, timestamp, ver, signature, new (unessential)
/**
 * Normalize the url of a WeChat MP article: force https, drop the hash, and
 * strip the query params that are not needed to identify the article.
 *
 * @param url - The url to normalize.
 * @param bypassHostCheck - If `true`, hosts other than `mp.weixin.qq.com` are accepted.
 * @returns The normalized url.
 * @throws {Error} If the host is not `mp.weixin.qq.com` and `bypassHostCheck` is falsy.
 *                 `new URL()` itself also throws on an unparsable url.
 */
const normalizeUrl = (url, bypassHostCheck = false): string => {
    const urlObj = new URL(url);
    if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
        throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + url);
    }
    urlObj.protocol = 'https:';
    urlObj.hash = ''; // remove hash

    const params = urlObj.searchParams;
    if (/^\/s\/.+/.test(urlObj.pathname)) {
        // a short link, just remove all the params
        urlObj.search = '';
    } else if (urlObj.pathname === '/s') {
        const biz = params.get('__biz');
        const mid = params.get('mid') || params.get('appmsgid');
        const idx = params.get('idx') || params.get('itemidx');
        const sn = params.get('sn') || params.get('sign');
        if (biz && mid && idx && sn) {
            // a permanent long link, remove all unessential params
            // no need to escape anything so no need to use `new URLSearchParams({...}).toString()`
            urlObj.search = `?__biz=${biz}&mid=${mid}&idx=${idx}&sn=${sn}`;
        } else {
            const src = params.get('src');
            const timestamp = params.get('timestamp');
            const ver = params.get('ver');
            const signature = params.get('signature');
            if (src && timestamp && ver && signature) {
                // a temporary link, remove all unessential params
                urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
            }
            // otherwise: unknown link, just let it go
        }
    }
    // any other pathname: IDK what it is, just let it go
    return urlObj.href;
};
|
|
|
|
/**
 * Fetch an article and its metadata from WeChat MP (mp.weixin.qq.com).
 *
 * The returned `description` has already gone through `fixArticleContent`, so there is no need to call it again.
 * The result is cached via `cache.tryGet`, keyed by the normalized url.
 * @param url - The url of the article.
 * @param bypassHostCheck - Whether to bypass the host check performed by `normalizeUrl`.
 * @returns A promise of an object containing the article and its metadata.
 */
const fetchArticle = (url, bypassHostCheck = false) => {
    const normalizedUrl = normalizeUrl(url, bypassHostCheck);

    const fetchAndParse = async () => {
        const response = await got(normalizedUrl);
        // @ts-expect-error custom field
        const $ = load(response.data);

        const title = ($('meta[property="og:title"]').attr('content') || '').replaceAll('\\r', '').replaceAll('\\n', ' ');
        const author = $('meta[name=author]').attr('content');
        // a summary that merely repeats the title carries no information
        const metaDescription = $('meta[name=description]').attr('content');
        const summary = metaDescription === title ? '' : metaDescription;

        let description = fixArticleContent($('#js_content'));
        // the page has no (or too little) content of its own: append the content of the original article
        const originalUrl = detectOriginalArticleUrl($);
        if (originalUrl) {
            const originalResponse = await got(normalizeUrl(originalUrl, bypassHostCheck));
            // @ts-expect-error custom field
            const original$ = load(originalResponse.data);
            description += fixArticleContent(original$('#js_content'));
        }

        // append a link to the source url, if the page declares one
        const sourceUrl = detectSourceUrl($);
        if (sourceUrl) {
            description += `<a href="${sourceUrl}">阅读原文</a>`;
        }

        // the publish time is a 10-digit timestamp (multiplied by 1000 below) assigned to `ct` in an inline script
        const ctScript = $('script[nonce][type="text/javascript"]:contains("var ct")').text();
        const ctMatch = ctScript ? ctScript.match(/var ct *= *"?(\d{10})"?/) : null;
        const pubDate = ctMatch ? parseDate(Number.parseInt(ctMatch[1]) * 1000) : undefined;

        const mpName = $('.profile_nickname').first().text().trim();

        return { title, author, description, summary, pubDate, mpName, link: response.url };
    };

    return cache.tryGet(normalizedUrl, fetchAndParse) as Promise<{
        title: string;
        author: string;
        description: string;
        summary: string;
        pubDate?: Date;
        mpName?: string;
        link: string;
    }>;
};
|
|
|
|
/**
 * Fetch an article and its metadata from WeChat MP (mp.weixin.qq.com) and fill them into `item`.
 *
 * When this function is used, neither `fetchArticle` nor `fixArticleContent` needs to be called.
 *
 * A new route SHOULD use this function instead of manually calling the above functions.
 *
 * An existing route adopting this function SHOULD either:
 * - set `skipLink` to true (not recommended)
 * - set `item.guid` to `item.link` BEFORE calling this function
 * @param item - The item object to be filled; `item.link` is the url of the article to fetch.
 * @param setMpNameAsAuthor - If `true`, `author` will be the MP itself, otherwise the real author of the article.
 * @param skipLink - Whether to skip overriding `item.link` with the link returned by `fetchArticle`.
 * @returns The incoming `item` object, with the article and its metadata filled in.
 */
const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = false) => {
    const article = await fetchArticle(item.link);

    // every fetched field falls back to the value the item already has
    item.title = article.title || item.title;
    item.description = article.description || item.description;
    item.summary = article.summary || item.summary;
    item.pubDate = article.pubDate || item.pubDate;

    // mpName: the Official Account itself, useful when a route returns articles from different accounts
    // author: the real author of the article, useful when a route returns articles from a certain account
    const preferredAuthor = setMpNameAsAuthor ? article.mpName : article.author;
    item.author = preferredAuthor || item.author;

    if (skipLink) {
        return item;
    }
    item.link = article.link || item.link;
    return item;
};
|
|
|
|
export { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
|