const asyncPool = require('tiny-async-pool'); const cheerio = require('cheerio'); const got = require('@/utils/got'); const { parseDate } = require('@/utils/parse-date'); const UA = require('@/utils/rand-user-agent')({ browser: 'chrome', os: 'android', device: 'mobile' }); // const chromeMobileUserAgent = 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36'; const parseArticle = (item, ctx) => ctx.cache.tryGet(item.link, async () => { // Fetch the AMP version const url = item.link.replace(/(?<=^https:\/\/\w+\.wsj\.com)/, '/amp'); const response = await got({ url, method: 'get', headers: { 'User-Agent': UA, }, }); const html = response.data; const $ = cheerio.load(html); const content = $('.articleBody > section'); // Cover const cover = $('.articleLead > div.is-lead-inset > div.header > .img-header > div.image-container > amp-img > img'); if (cover.length > 0) { $(``).insertBefore(content[0].childNodes[0]); $(cover).remove(); } // Summary const summary = $('head > meta[name="description"]').attr('content'); // Metadata (categories & updatedAt) const updatedAt = $('meta[itemprop="dateModified"]').attr('content'); const publishedAt = $('meta[itemprop="datePublished"]').attr('content'); const author = $('.author > a[rel="author"]').text(); const categories = $('meta[name="keywords"]') .attr('content') .split(',') .map((c) => c.trim()); // Images content.find('amp-img').each((i, e) => { const img = $(`${e.attribs.alt}`); // Caption follows, no need to handle caption $(img).insertBefore(e); $(e).remove(); }); // iframes (youtube videos and interactive elements) content.find('amp-iframe').each((i, e) => { const iframe = $(`