const asyncPool = require('tiny-async-pool');
const cheerio = require('cheerio');
const got = require('@/utils/got');
const { parseDate } = require('@/utils/parse-date');
const { art } = require('@/utils/render');
const path = require('path');
// User-Agent string for outgoing requests, obtained from the project's
// rand-user-agent helper with a Chrome / Android / mobile profile. It is sent
// as the `User-Agent` header when fetching an article's AMP page below.
// NOTE(review): presumably a different realistic UA is picked at random on each
// module load, judging by the helper's name — confirm against the helper.
const UA = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' });
// Hard-coded Chrome-on-Android User-Agent, left commented out; presumably superseded
// by `UA` above — confirm before removing.
// const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36';
const parseArticle = (item, ctx) =>
ctx.cache.tryGet(item.link, async () => {
// Fetch the AMP version
const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp');
const response = await got({
url,
method: 'get',
headers: {
'User-Agent': UA,
},
});
const html = response.data;
const $ = cheerio.load(html);
// Summary
const summary = $('head > meta[name="description"]').attr('content');
// Metadata (categories & updatedAt)
const updatedAt = $('meta[itemprop="dateModified"]').attr('content');
const publishedAt = $('meta[itemprop="datePublished"]').attr('content');
const author = $('.author > a[rel="author"]').text();
const categories = $('meta[name="keywords"]')
.attr('content')
.split(',')
.map((c) => c.trim());
const article = $('article');
item.subTitle = $('h2.sub-head').html();
// Remove podcast
article.find('.media-object-podcast').remove();
// Authors
article.find('.bylineWrap').each((i, e) => {
$(e)
.find('p')
.each(function () {
$(this).replaceWith($(this).html());
});
});
// Images
article.find('.bigTop-hero').each((i, e) => {
// console.log($(e).html());
const imgSrc = $(e).find('amp-img').attr('src');
const imgAlt = $(e).find('amp-img').attr('alt');
const figCaption = $(e).find('.imageCaption').text().trim();
const figCredit = $(e).find('.imageCredit').text().trim();
const fig = $(`
${figCaption} ${figCredit}`);
$(fig).insertBefore(e);
$(e).remove();
});
article.find('amp-img').each((i, e) => {
const img = $(`
`);
// Caption follows, no need to handle caption
$(img).insertBefore(e);
$(e).remove();
});
// iframes (youtube videos and interactive elements)
article.find('amp-iframe').each((i, e) => {
const iframe = $(`