feat: Update the Vulture endpoint to be able to generate a... (#4757)

Logan Rockmore
2020-05-17 23:55:57 -04:00
committed by GitHub
parent 509c887a05
commit ec383ae3c4
4 changed files with 54 additions and 68 deletions

View File

@@ -156,14 +156,7 @@ Provides a better reading experience (full text articles) over the official one.
 ## Vulture
-<RouteEn author="loganrockmore" example="/vulture/movies" path="/vulture/:type/:excludetags?" :paramsDesc="['The sub-site name', 'Comma-delimited list of tags. If an article includes one of these tags, it will be excluded from the RSS feed.']">
-Supported sub-sites
-| TV | Movies | Comedy | Music | TV Recaps | Books | Theater | Art | Awards | Video |
-| --- | ------ | ------ | ----- | --------- | ----- | ------- | --- | ------ | ----- |
-| tv | movies | comedy | music | tvrecaps | books | theater | art | awards | video |
-</RouteEn>
+<RouteEn author="loganrockmore" example="/vulture/movies" path="/vulture/:tag/:excludetags?" :paramsDesc="['Tag', 'Comma-delimited list of tags. If an article includes one of these tags, it will be excluded from the RSS feed.']" />
 ## World Health Organization | WHO
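The practical effect of this doc change: the route is now keyed by an arbitrary Vulture tag instead of the fixed sub-site table that was removed. A minimal sketch of the URLs the new route accepts, assuming a local RSSHub instance on port 1200 (the host, port, and the `veep` tag are illustrative assumptions, not from the diff):

```js
// Hypothetical example requests (host, port, and tag values are assumptions):
// GET http://localhost:1200/vulture/movies
//   -> feed of articles tagged "movies"
// GET http://localhost:1200/vulture/veep/sponsored-content,overnights
//   -> feed of articles tagged "veep", excluding any article that also
//      carries the "sponsored-content" or "overnights" tag
```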

View File

@@ -2215,7 +2215,7 @@ router.get('/mastodon/timeline/:site/:only_media?', require('./routes/mastodon/t
 router.get('/aliyun-kernel/index', require('./routes/aliyun-kernel/index'));
 // Vulture
-router.get('/vulture/:type/:excludetags?', require('./routes/vulture/index'));
+router.get('/vulture/:tag/:excludetags?', require('./routes/vulture/index'));
 // xinwenlianbo
 router.get('/xinwenlianbo/index', require('./routes/xinwenlianbo/index'));
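Only the parameter name changes here, so the same URL shapes still match; what changes is the key under which the first path segment appears in `ctx.params`. A sketch of the binding (the sample requests are assumptions):

```js
// Sketch of how the new path binds its parameters:
// GET /vulture/veep                    -> ctx.params = { tag: 'veep' }
// GET /vulture/veep/overnights,recaps  -> ctx.params = { tag: 'veep', excludetags: 'overnights,recaps' }
```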

View File

@@ -1,9 +1,14 @@
 const utils = require('./utils');
 module.exports = async (ctx) => {
-    const url = `https://www.vulture.com/${ctx.params.type}/`;
-    const title = `Vulture - ${ctx.params.type}`;
+    const url = `https://www.vulture.com/news/${ctx.params.tag}/`;
     const tagsToExclude = ctx.params.excludetags;
+    let title = `Vulture - tag ${ctx.params.tag}`;
+    if (tagsToExclude !== undefined) {
+        title += ' - excluding tags ';
+        title += tagsToExclude.split(',').join(', ');
+    }
     ctx.state.data = await utils.getData(ctx, url, title, tagsToExclude);
 };
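The new title logic is pure string manipulation, so it can be verified in isolation; a runnable sketch with made-up parameter values:

```js
// Standalone check of the title construction (sample values are assumptions):
const tag = 'veep';
const tagsToExclude = 'overnights,recaps';

let title = `Vulture - tag ${tag}`;
if (tagsToExclude !== undefined) {
    title += ' - excluding tags ';
    title += tagsToExclude.split(',').join(', ');
}
console.log(title); // "Vulture - tag veep - excluding tags overnights, recaps"
```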

View File

@@ -1,10 +1,17 @@
 const got = require('@/utils/got');
 const cheerio = require('cheerio');
-async function load(link) {
-    const response = await got.get(link);
+async function load(articleURL) {
+    const response = await got.get(articleURL);
     const $ = cheerio.load(response.data);
+    // get the metadata
+    const title = $('meta[property="og:title"]').attr('content');
+    const pubDate = $('meta[property="article:published_time"]').attr('content');
+    const bylineString = 'by ' + $('meta[name="author"]').attr('content');
+    const tags = $('meta[property="article:tag"]').attr('content').split(', ');
     // get the contents of the article
     const description = $('div.article-content');
     // remove the content that we don't want to show
@@ -16,45 +23,44 @@ async function load(link) {
     description.find('div.mobile-secondary-area').remove();
     description.find('aside.newsletter-flex-text').remove();
-    // get the tags
-    const tagElements = $('div.tags > ul > li > a:not(.more)');
-    const tags = tagElements
-        .map(function () {
-            return $(this).text().toLowerCase();
-        })
-        .get();
     // add the tags to the end
     description.append('<br /><br />tags: ' + tags.join(', '));
     return {
+        title: title,
+        author: bylineString,
+        pubDate: pubDate,
+        link: articleURL,
+        guid: articleURL,
         description: description.html(),
         tags: tags,
     };
 }
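One caveat about the new metadata block in `load()`: `.attr('content')` returns `undefined` when a meta tag is absent, so the unconditional `.split(', ')` on `article:tag` would throw for an article without tags. A more defensive variant, offered as an assumption rather than part of the commit:

```js
// Hedged sketch (not part of the commit): guard the tag extraction against
// articles with no article:tag meta element, since .attr('content') returns
// undefined there and .split() would then throw.
const cheerio = require('cheerio');

const $ = cheerio.load('<html><head><title>no tags here</title></head></html>');
const tagsContent = $('meta[property="article:tag"]').attr('content'); // undefined
const tags = tagsContent ? tagsContent.split(', ') : [];
console.log(tags); // []
```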
-async function ProcessFeed(list, caches) {
-    return await Promise.all(
-        list.map(async (item) => {
-            const itemUrl = item.canonicalUrl;
-            let bylineString = '';
-            if (item.byline) {
-                const byline = item.byline[0];
-                const bylineNames = byline.names.map((name) => name.text);
-                const bylineNamesString = bylineNames.join(', ');
-                bylineString = 'by ' + bylineNamesString;
-            }
-            const single = {
-                title: item.primaryHeadline,
-                link: itemUrl,
-                author: bylineString,
-                guid: itemUrl,
-                pubDate: item.date,
-            };
-            const other = await caches.tryGet(itemUrl, async () => await load(itemUrl));
-            return Promise.resolve(Object.assign({}, single, other));
+async function ProcessFeed(htmlData, caches) {
+    const $ = cheerio.load(htmlData);
+    const allArticles = $('section.paginated-feed li.article');
+    // limit the list to only 25 articles, to make sure that load times remain reasonable
+    const articles = allArticles.slice(0, 25);
+    const articleURLs = [];
+    $(articles).each(function (index, article) {
+        const articleLink = $(article).find('a.link-text');
+        let articleURL = articleLink.attr('href');
+        if (articleURL.startsWith('//www.')) {
+            articleURL = 'https:' + articleURL;
+        } else if (articleURL.startsWith('www.')) {
+            articleURL = 'https://' + articleURL;
+        }
+        articleURLs.push(articleURL);
+    });
+    return await Promise.all(
+        articleURLs.map(async (articleURL) => {
+            const data = await caches.tryGet(articleURL, async () => await load(articleURL));
+            return Promise.resolve(Object.assign({}, data));
         })
     );
 }
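The href normalization inside the `each` loop handles protocol-relative (`//www.`) and bare-host (`www.`) links. Pulled out as a standalone helper it is easy to sanity-check; the function name and test values below are illustrative, not from the commit:

```js
// Hypothetical helper mirroring the normalization logic in ProcessFeed:
function normalizeArticleURL(articleURL) {
    if (articleURL.startsWith('//www.')) {
        return 'https:' + articleURL; // protocol-relative -> https
    } else if (articleURL.startsWith('www.')) {
        return 'https://' + articleURL; // bare host -> full https URL
    }
    return articleURL; // already absolute
}

console.log(normalizeArticleURL('//www.vulture.com/article.html')); // https://www.vulture.com/article.html
console.log(normalizeArticleURL('www.vulture.com/article.html'));   // https://www.vulture.com/article.html
console.log(normalizeArticleURL('https://www.vulture.com/a.html')); // unchanged
```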
@@ -82,29 +88,7 @@ const getData = async (ctx, url, title, tagsToExclude) => {
     });
     const htmlData = htmlResponse.data;
-    const $ = cheerio.load(htmlData);
-    let dataUri = $('section.paginated-feed').attr('data-uri');
-    if (dataUri.startsWith('www.')) {
-        dataUri = 'https://' + dataUri;
-    }
-    // get the raw data
-    const response = await got({
-        method: 'get',
-        url: dataUri,
-        headers: {
-            Referer: dataUri,
-        },
-    });
-    const data = response.data;
-    // limit the list to only 25 articles, to make sure that load times remain reasonable
-    const list = data.articles.slice(0, 25);
-    let result = await ProcessFeed(list, ctx.cache);
+    let result = await ProcessFeed(htmlData, ctx.cache);
     // filter out specified tags
     if (tagsToExclude !== undefined) {
@@ -112,10 +96,14 @@ const getData = async (ctx, url, title, tagsToExclude) => {
         result = FilterItemsWithTags(result, tagsToExcludeArray);
     }
     // get the description
+    const $ = cheerio.load(htmlData);
+    const description = $('meta[name="description"]').attr('content');
     return {
         title: title,
         link: url,
-        description: $('meta[name="description"]').attr('content'),
+        description: description,
         item: result,
     };
 };
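`FilterItemsWithTags` is called above but its body sits outside this diff. Given that every item now carries a `tags` array from `load()`, a plausible reconstruction looks like the following; this is an assumption about the unshown helper, not the actual source:

```js
// Hypothetical reconstruction of the unshown helper: drop any feed item
// whose tags intersect the excluded set.
function FilterItemsWithTags(items, tagsToExcludeArray) {
    return items.filter((item) => !item.tags.some((tag) => tagsToExcludeArray.includes(tag)));
}

// Example: the second item is removed because it carries an excluded tag.
const items = [
    { title: 'A', tags: ['veep'] },
    { title: 'B', tags: ['veep', 'overnights'] },
];
console.log(FilterItemsWithTags(items, ['overnights'])); // [ { title: 'A', tags: ['veep'] } ]
```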