// Mirror of https://github.com/DIYgod/RSSHub.git (synced 2025-12-17 20:28:21 +08:00)
// 381 lines · 16 KiB · TypeScript
import * as entities from 'entities';
|
||
import { load, type CheerioAPI, type Element } from 'cheerio';
|
||
import { simplecc } from 'simplecc-wasm';
|
||
import got from '@/utils/got';
|
||
import { config } from '@/config';
|
||
import { RE2JS } from 're2js';
|
||
import markdownit from 'markdown-it'
|
||
import htmlToText from 'html-to-text'
|
||
import { MiddlewareHandler } from 'hono';
|
||
import cache from '@/utils/cache';
|
||
import Parser from '@postlight/parser';
|
||
import { Data, DataItem } from '@/types';
|
||
|
||
// Module-wide markdown-it renderer, created with the `html` option switched on.
// The middleware below uses it to render the Markdown summary returned by
// `summarizeArticle` into HTML.
const md = markdownit({ html: true });
|
||
|
||
/**
 * Rewrite one URL-carrying attribute of an element so that it holds an absolute URL.
 *
 * The attribute is only touched when a base URL is given, the attribute is
 * present and non-empty, and `new URL(value, baseUrl)` succeeds. In every other
 * case the element is left exactly as it was.
 *
 * @param $ - Cheerio instance that owns `elem`.
 * @param elem - Element whose attribute should be resolved.
 * @param attr - Attribute name, e.g. `href`, `src` or `poster`.
 * @param baseUrl - URL that relative values are resolved against.
 */
const resolveRelativeLink = ($: CheerioAPI, elem: Element, attr: string, baseUrl?: string) => {
    const wrapped = $(elem);
    if (!baseUrl) {
        return;
    }

    try {
        const current = wrapped.attr(attr);
        // An element without the attribute stays unchanged, e.g. the <video> in
        // <video><source src="https://example.com"></video>.
        if (!current) {
            return;
        }
        const absolute = new URL(current, baseUrl).href;
        wrapped.attr(attr, absolute);
    } catch {
        // The value or the base could not be parsed as a URL: keep the attribute as is.
    }
};
|
||
|
||
/**
 * Ask the chat-completions endpoint configured under `config.openai` for a
 * summary of an article.
 *
 * The configured prompt is sent as the `system` message and the article text as
 * the `user` message; the request is authorised with the configured API key as
 * a Bearer token.
 *
 * @param articleText - Plain text of the article to summarise.
 * @returns The `message.content` of the first entry in the response's `choices`.
 *          Any rejection from the request propagates to the caller, as does the
 *          error raised when the response carries no choice.
 */
const summarizeArticle = async (articleText: string) => {
    const { endpoint, model, maxTokens, prompt, temperature, apiKey } = config.openai;

    const payload = {
        model,
        max_tokens: maxTokens,
        messages: [
            { role: 'system', content: prompt },
            { role: 'user', content: articleText },
        ],
        temperature,
    };

    const { data: completion } = await got.post(`${endpoint}/chat/completions`, {
        json: payload,
        headers: { Authorization: `Bearer ${apiKey}` },
    });

    return completion.choices[0].message.content;
};
|
||
|
||
/** A compiled filter expression from either supported regex engine. */
type FilterRegex = RegExp | RE2JS;

/** Attribute-value fragments used to spot an image URL on an `<img>` that has no `src`. */
const LAZY_IMAGE_SUFFIXES = ['.gif', '.png', '.jpg', '.webp'];

/** Give a base URL without an `http(s)://` prefix an `http` scheme; anything else is returned unchanged. */
const normalizeBaseUrl = (url: string | undefined): string | undefined => {
    if (!url || /^https?:\/\//.test(url)) {
        return url;
    }
    return url.startsWith('//') ? 'http:' + url : 'http://' + url;
};

/** Report whether `regex` matches anywhere inside `text`, whichever engine compiled it. */
const regexFinds = (regex: FilterRegex, text: string): boolean => (regex instanceof RE2JS ? regex.matcher(text).find() : regex.test(text));

/**
 * Post-processing middleware: after the downstream handler has run, it reads the
 * feed stored under the `data` context key, normalises it and applies the
 * query-string parameters handled here, then stores the result back.
 *
 * Steps, in order:
 *  1. decode XML entities in titles/description, sort by `pubDate` (unless `sorted=false`);
 *  2. per item: normalise `pubDate`, absolutise `link`, clean up the HTML
 *     description (scripts, lazy-loaded images, relative URLs, referrer policy),
 *     normalise `category`;
 *  3. `filter`, `filter_*`, `filterout`, `filterout_*`, `filter_time`, `limit`;
 *  4. `tgiv`, `mode=fulltext`, `chatgpt`, `scihub`, `opencc`, `brief`.
 *
 * When the context carries no `data`, the middleware does nothing.
 *
 * @throws Error when the feed has no items and `allowEmpty` is not set.
 * @throws Error when a filter is requested with an unknown regex engine configured.
 * @throws Error when `brief` does not contain a number of at least three digits.
 * @throws TypeError (from `new URL`) when an item link cannot be resolved.
 */
const middleware: MiddlewareHandler = async (ctx, next) => {
    await next();

    const data = ctx.get('data') as Data;
    if (!data) {
        // throw new Error('wrong path');
        return;
    }

    if ((!data.item || data.item.length === 0) && !data.allowEmpty) {
        throw new Error('this route is empty, please check the original site or <a href="https://github.com/DIYgod/RSSHub/issues/new/choose">create an issue</a>');
    }

    const query = (name: string) => ctx.req.query(name);

    // fix allowEmpty
    data.item = data.item || [];

    // decode HTML entities
    if (data.title) {
        data.title = entities.decodeXML(data.title + '');
    }
    if (data.description) {
        data.description = entities.decodeXML(data.description + '');
    }

    // sort items, newest first
    if (query('sorted') !== 'false') {
        data.item = data.item.sort((a: DataItem, b: DataItem) => +new Date(b.pubDate || 0) - +new Date(a.pubDate || 0));
    }

    const handleItem = (item: DataItem) => {
        if (item.title) {
            item.title = entities.decodeXML(item.title + '');
        }

        // handle pubDate
        if (item.pubDate) {
            item.pubDate = new Date(item.pubDate).toUTCString();
        }

        // handle link: resolve against the feed link (throws if it cannot be resolved)
        if (item.link) {
            item.link = new URL(item.link, normalizeBaseUrl(data.link)).href;
        }

        // handle description
        if (item.description) {
            const $ = load(item.description);
            const baseUrl = normalizeBaseUrl(item.link || data.link);

            $('script').remove();

            $('img').each((_, ele) => {
                const $ele = $(ele);

                // fix lazyload: recover `src` from the usual data attributes, or from
                // the first attribute whose value looks like an image URL
                if (!$ele.attr('src')) {
                    const lazySrc = $ele.attr('data-src') || $ele.attr('data-original');
                    if (lazySrc) {
                        $ele.attr('src', lazySrc);
                    } else {
                        const candidate = Object.values(ele.attribs)
                            .map((value) => value.trim())
                            .find((value) => LAZY_IMAGE_SUFFIXES.some((suffix) => value.includes(suffix)));
                        if (candidate) {
                            $ele.attr('src', candidate);
                        }
                    }
                }

                // redundant attributes
                for (const e of ['onclick', 'onerror', 'onload']) {
                    $ele.removeAttr(e);
                }
            });

            // resolve relative link & fix referrer policy
            // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
            // https://www.w3schools.com/tags/att_href.asp
            $('a, area').each((_, elem) => {
                resolveRelativeLink($, elem, 'href', baseUrl);
                // $(elem).attr('rel', 'noreferrer'); // currently no such a need
            });
            // https://www.w3schools.com/tags/att_src.asp
            $('img, video, audio, source, iframe, embed, track').each((_, elem) => {
                resolveRelativeLink($, elem, 'src', baseUrl);
            });
            $('video[poster]').each((_, elem) => {
                resolveRelativeLink($, elem, 'poster', baseUrl);
            });
            $('img, iframe').each((_, elem) => {
                $(elem).attr('referrerpolicy', 'no-referrer');
            });

            item.description = $('body').html() + '' + (config.suffix || '');

            // copy the processed quote markup onto every extra link
            if (item._extra?.links && $('.rsshub-quote').length) {
                const quoteHtml = $.html($('.rsshub-quote'));
                for (const link of item._extra.links) {
                    link.content_html = quoteHtml;
                }
            }
        }

        // handle category
        if (item.category) {
            // convert single string to array, and filter only string type category
            const categories: unknown[] = Array.isArray(item.category) ? item.category : [item.category];
            item.category = categories.filter((e): e is string => typeof e === 'string');
        }

        return item;
    };

    data.item = data.item.map((item) => handleItem(item));

    // filter
    const engine = config.feature.filter_regex_engine;
    // default: case_sensitive = true
    const insensitive = query('filter_case_sensitive') === 'false';
    const makeRegex = (str: string): FilterRegex => {
        switch (engine) {
            case 'regexp':
                return new RegExp(str, insensitive ? 'i' : '');
            case 're2':
                return RE2JS.compile(str, insensitive ? RE2JS.CASE_INSENSITIVE : 0);
            default:
                throw new Error(`Invalid Engine Value: ${engine}, please check your config.`);
        }
    };
    const compile = (pattern: string | undefined) => (pattern ? makeRegex(pattern) : undefined);

    const filter = query('filter');
    if (filter) {
        const regex = makeRegex(filter);

        data.item = data.item.filter((item) => {
            const title = item.title || '';
            const description = item.description || title;
            const author = item.author || '';
            const category = item.category || [];

            return regexFinds(regex, title) || regexFinds(regex, description) || regexFinds(regex, author) || category.some((c) => regexFinds(regex, c));
        });
    }

    // when the `filter` parameter is present, filter_title/description/author/category are ignored
    const filterTitle = query('filter_title');
    const filterDescription = query('filter_description');
    const filterAuthor = query('filter_author');
    const filterCategory = query('filter_category');
    if (!filter && (filterTitle || filterDescription || filterAuthor || filterCategory)) {
        // Compile once instead of once per item.
        const titleRegex = compile(filterTitle);
        const descriptionRegex = compile(filterDescription);
        const authorRegex = compile(filterAuthor);
        const categoryRegex = compile(filterCategory);

        data.item = data.item.filter((item) => {
            const title = item.title || '';
            const description = item.description || title;
            const author = item.author || '';
            const category = item.category || [];

            // An item is kept only when every requested field filter matches.
            return (
                (!titleRegex || regexFinds(titleRegex, title)) &&
                (!descriptionRegex || regexFinds(descriptionRegex, description)) &&
                (!authorRegex || regexFinds(authorRegex, author)) &&
                (!categoryRegex || category.some((c) => regexFinds(categoryRegex, c)))
            );
        });
    }

    // `filterout` is the fallback pattern for title and description; the specific
    // parameters take precedence and also work when `filterout` itself is absent.
    const filterout = query('filterout');
    const outTitleRegex = compile(query('filterout_title') || filterout);
    const outDescriptionRegex = compile(query('filterout_description') || filterout);
    const outAuthorRegex = compile(query('filterout_author'));
    const outCategoryRegex = compile(query('filterout_category'));
    if (outTitleRegex || outDescriptionRegex || outAuthorRegex || outCategoryRegex) {
        data.item = data.item.filter((item) => {
            const title = item.title || '';
            const description = item.description || title;
            const author = item.author || '';
            const category = item.category || [];

            // An item is dropped as soon as any requested exclusion matches.
            const excluded =
                (outTitleRegex && regexFinds(outTitleRegex, title)) ||
                (outDescriptionRegex && regexFinds(outDescriptionRegex, description)) ||
                (outAuthorRegex && regexFinds(outAuthorRegex, author)) ||
                (outCategoryRegex && category.some((c) => regexFinds(outCategoryRegex, c)));

            return !excluded;
        });
    }

    // filter_time: keep undated items and items published within the last N seconds
    const filterTime = query('filter_time');
    if (filterTime) {
        const now = Date.now();
        const maxAgeMs = Number.parseInt(filterTime) * 1000;
        data.item = data.item.filter(({ pubDate }) => !pubDate || now - new Date(pubDate).getTime() <= maxAgeMs);
    }

    // limit
    const limit = query('limit');
    if (limit) {
        data.item = data.item.slice(0, Number.parseInt(limit));
    }

    // telegram instant view
    const tgiv = query('tgiv');
    if (tgiv) {
        for (const item of data.item) {
            if (item.link) {
                const encodedlink = encodeURIComponent(item.link);
                item.link = `https://t.me/iv?url=${encodedlink}&rhash=${tgiv}`;
            }
        }
    }

    // fulltext
    if (query('mode')?.toLowerCase() === 'fulltext') {
        const tasks = data.item.map(async (item) => {
            const { link, author, description } = item;
            const parsed_result: any = await cache.tryGet(`mercury-cache-${link}`, async () => {
                if (link) {
                    // if parser failed, return default description and not report error
                    try {
                        const { data: res } = await got(link);
                        const $ = load(res);
                        const result = await Parser.parse(link, {
                            html: $.html(),
                        });
                        return result;
                    } catch {
                        // no-empty
                    }
                }
            });

            item.author = author || parsed_result?.author;
            // Only use the parsed article when it actually has a reasonably long body.
            const content = parsed_result?.content;
            item.description = typeof content === 'string' && content.length > 40 ? entities.decodeXML(content) : description;
        });
        await Promise.all(tasks);
    }

    // openai
    if (query('chatgpt') && config.openai.apiKey) {
        data.item = await Promise.all(
            data.item.map(async (item) => {
                const original = item.description;
                if (original) {
                    try {
                        const summary = await cache.tryGet(`openai:${item.link}`, async () => {
                            const text = htmlToText.htmlToText(original);
                            if (text.length < 300) {
                                return '';
                            }
                            const summary_md = await summarizeArticle(text);
                            return md.render(summary_md);
                        });
                        // prepend the summary to the item's description
                        if (summary !== '') {
                            item.description = summary + '<hr/><br/>' + item.description;
                        }
                    } catch {
                        // when openai failed, return default description and not write cache
                    }
                }
                return item;
            })
        );
    }

    // scihub
    if (query('scihub')) {
        for (const item of data.item) {
            item.link = item.doi ? `${config.scihub.host}${item.doi}` : `${config.scihub.host}${item.link}`;
        }
    }

    // opencc
    const opencc = query('opencc');
    if (opencc) {
        for (const item of data.item) {
            item.title = simplecc(item.title ?? item.link, opencc);
            item.description = simplecc(item.description ?? item.title ?? item.link, opencc);
        }
    }

    // brief
    const briefParam = query('brief');
    if (briefParam) {
        if (!/[1-9]\d{2,}/.test(briefParam)) {
            throw new Error(`Invalid parameter brief. Please check the doc https://docs.rsshub.app/parameter#shu-chu-jian-xun`);
        }
        const brief = Number.parseInt(briefParam);
        for (const item of data.item) {
            if (item.description) {
                // strip tags, then truncate to `brief` characters
                const text = item.description.replaceAll(/<\/?[^>]+(>|$)/g, '');
                item.description = text.length > brief ? `<p>${text.substring(0, brief)}…</p>` : `<p>${text}</p>`;
            }
        }
    }
    // some parameters are processed in `anti-hotlink.js`

    ctx.set('data', data);
};

export default middleware;
|