const parser = require('@/utils/rss-parser');
const cheerio = require('cheerio');
const got = require('@/utils/got');
const { parseDate } = require('@/utils/parse-date');

/**
 * RSS XML file name for each supported category key.
 *
 * The route handler looks a value up with `ctx.params.category` and appends
 * it to `https://feeds.a.dj.com/rss/` when the requested language is `en-us`.
 *
 * NOTE(review): the `us_bussiness` key is misspelled, but it is matched
 * directly against the category route parameter, so renaming it would change
 * which parameter values resolve to a feed.
 */
const categoryToXMLFileName = {
    opinion: 'RSSOpinion.xml',
    world_news: 'RSSWorldNews.xml',
    us_bussiness: 'WSJcomUSBusiness.xml',
    market_news: 'RSSMarketsMain.xml',
    technology: 'RSSWSJD.xml',
    lifestyle: 'RSSLifestyle.xml',
};

/**
 * Human-readable name for each category key; the keys mirror those of
 * `categoryToXMLFileName`.
 *
 * NOTE(review): the consumer of this table is not visible in this part of
 * the file - presumably it feeds the generated feed's title; confirm.
 */
const categoryToName = {
    opinion: 'Opinion',
    world_news: 'World News',
    us_bussiness: 'U.S. Business',
    market_news: 'Markets News',
    technology: "Technology: What's News",
    lifestyle: 'Lifestyle',
};

module.exports = async (ctx) => { const language = ctx.params.lang; const category = ctx.params.category; let rssUrl; switch (language) { case 'en-us': rssUrl = `https://feeds.a.dj.com/rss/${categoryToXMLFileName[category]}`; break; case 'zh-cn': // Doesn't support categorical subscribtion rssUrl = `https://cn.wsj.com/zh-hans/rss/`; break; case 'zh-tw': rssUrl = `https://cn.wsj.com/zh-hant/rss/`; break; default: // Doesn't support other languages (e.g. 
ja) for now throw Error(`Language ${language} is not supported`); } const feed = await parser.parseURL(rssUrl); const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36'; const items = await Promise.all( feed.items.map((item) => ctx.cache.tryGet(item.link, async () => { // Fetch the AMP version const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp'); const response = await got({ url, method: 'get', headers: { 'User-Agent': chromeMobileUserAgent, }, }); const html = response.body; const $ = cheerio.load(html); const content = $('.articleBody > section'); // Cover const cover = $('.articleLead > div.is-lead-inset > div.header > .img-header > div.image-container > amp-img > img'); if (cover.length > 0) { $(``).insertBefore(content[0].childNodes[0]); $(cover).remove(); } // Summary const summary = $('head > meta[name="description"]').attr('content'); // Metadata (categories & updatedAt) const updatedAt = $('meta[itemprop="dateModified"]').attr('content'); const publishedAt = $('meta[itemprop="datePublished"]').attr('content'); const author = $('.author > a[rel="author"]').html(); const categories = $('meta[name="keywords"]') .attr('content') .split(',') .map((c) => c.trim()); // Images content.find('amp-img').each((i, e) => { const img = $(`${e.attribs.alt}`); // Caption follows, no need to handle caption $(img).insertBefore(e); $(e).remove(); }); // iframes (youtube videos and interactive elements) content.find('amp-iframe').each((i, e) => { const iframe = $(`