const parser = require('@/utils/rss-parser');
const cheerio = require('cheerio');
const got = require('@/utils/got');
const { parseDate } = require('@/utils/parse-date');
/**
 * Maps a category route parameter to the XML file name of the matching
 * English-language feed. The handler appends the value to
 * `https://feeds.a.dj.com/rss/` when the requested language is `en-us`.
 *
 * `us_bussiness` is a historical misspelling that existing subscriptions may
 * still use, so it is kept; `us_business` is the correctly spelled alias and
 * resolves to the same feed.
 */
const categoryToXMLFileName = {
    opinion: 'RSSOpinion.xml',
    world_news: 'RSSWorldNews.xml',
    us_bussiness: 'WSJcomUSBusiness.xml',
    us_business: 'WSJcomUSBusiness.xml',
    market_news: 'RSSMarketsMain.xml',
    technology: 'RSSWSJD.xml',
    lifestyle: 'RSSLifestyle.xml',
};

/**
 * Maps a category route parameter to its display name.
 *
 * The key set must stay identical to `categoryToXMLFileName`, including the
 * `us_bussiness` / `us_business` pair, so every category that has a feed
 * file also has a name.
 */
const categoryToName = {
    opinion: 'Opinion',
    world_news: 'World News',
    us_bussiness: 'U.S. Business',
    us_business: 'U.S. Business',
    market_news: 'Markets News',
    technology: "Technology: What's News",
    lifestyle: 'Lifestyle',
};
module.exports = async (ctx) => {
const language = ctx.params.lang;
const category = ctx.params.category;
let rssUrl;
switch (language) {
case 'en-us':
rssUrl = `https://feeds.a.dj.com/rss/${categoryToXMLFileName[category]}`;
break;
case 'zh-cn':
// Doesn't support categorical subscription
rssUrl = `https://cn.wsj.com/zh-hans/rss/`;
break;
case 'zh-tw':
rssUrl = `https://cn.wsj.com/zh-hant/rss/`;
break;
default:
// Doesn't support other languages (e.g. ja) for now
throw Error(`Language ${language} is not supported`);
}
const feed = await parser.parseURL(rssUrl);
const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36';
const items = await Promise.all(
feed.items.map((item) =>
ctx.cache.tryGet(item.link, async () => {
// Fetch the AMP version
const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp');
const response = await got({
url,
method: 'get',
headers: {
'User-Agent': chromeMobileUserAgent,
},
});
const html = response.body;
const $ = cheerio.load(html);
const content = $('.articleBody > section');
// Cover
const cover = $('.articleLead > div.is-lead-inset > div.header > .img-header > div.image-container > amp-img > img');
if (cover.length > 0) {
$(`
`).insertBefore(content[0].childNodes[0]);
$(cover).remove();
}
// Summary
const summary = $('head > meta[name="description"]').attr('content');
// Metadata (categories & updatedAt)
const updatedAt = $('meta[itemprop="dateModified"]').attr('content');
const publishedAt = $('meta[itemprop="datePublished"]').attr('content');
const author = $('.author > a[rel="author"]').html();
const categories = $('meta[name="keywords"]')
.attr('content')
.split(',')
.map((c) => c.trim());
// Images
content.find('amp-img').each((i, e) => {
const img = $(`
`);
// Caption follows, no need to handle caption
$(img).insertBefore(e);
$(e).remove();
});
// iframes (youtube videos and interactive elements)
content.find('amp-iframe').each((i, e) => {
const iframe = $(`