const asyncPool = require('tiny-async-pool'); const cheerio = require('cheerio'); const got = require('@/utils/got'); const { parseDate } = require('@/utils/parse-date'); const { art } = require('@/utils/render'); const path = require('path'); const UA = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' }); // const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36'; const parseArticle = (item, ctx) => ctx.cache.tryGet(item.link, async () => { // Fetch the AMP version const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp'); const response = await got({ url, method: 'get', headers: { 'User-Agent': UA, }, }); const html = response.data; const $ = cheerio.load(html); // Summary const summary = $('head > meta[name="description"]').attr('content'); // Metadata (categories & updatedAt) const updatedAt = $('meta[itemprop="dateModified"]').attr('content'); const publishedAt = $('meta[itemprop="datePublished"]').attr('content'); const author = $('.author > a[rel="author"]').text(); const categories = $('meta[name="keywords"]') .attr('content') .split(',') .map((c) => c.trim()); const article = $('article'); item.subTitle = $('h2.sub-head').html(); // Remove podcast article.find('.media-object-podcast').remove(); // Authors article.find('.bylineWrap').each((i, e) => { $(e) .find('p') .each(function () { $(this).replaceWith($(this).html()); }); }); // Images article.find('.bigTop-hero').each((i, e) => { // console.log($(e).html()); const imgSrc = $(e).find('amp-img').attr('src'); const imgAlt = $(e).find('amp-img').attr('alt'); const figCaption = $(e).find('.imageCaption').text().trim(); const figCredit = $(e).find('.imageCredit').text().trim(); const fig = $(`
${imgAlt}
${figCaption} ${figCredit}
`); $(fig).insertBefore(e); $(e).remove(); }); article.find('amp-img').each((i, e) => { const img = $(`${e.attribs.alt}`); // Caption follows, no need to handle caption $(img).insertBefore(e); $(e).remove(); }); // iframes (youtube videos and interactive elements) article.find('amp-iframe').each((i, e) => { const iframe = $(`