From 859ba6fdfd0ed1e7a1134d18a10a546d4f0dc20c Mon Sep 17 00:00:00 2001 From: zoenglinghou <11689106+zoenglinghou@users.noreply.github.com> Date: Sat, 29 Feb 2020 13:03:22 +0100 Subject: [PATCH] feat: Add full article for AP News (#4103) --- lib/routes/apnews/topics.js | 79 ++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/lib/routes/apnews/topics.js b/lib/routes/apnews/topics.js index a6f9a70eef..1cc560aa84 100644 --- a/lib/routes/apnews/topics.js +++ b/lib/routes/apnews/topics.js @@ -1,5 +1,6 @@ const got = require('@/utils/got'); const cheerio = require('cheerio'); +const url = require('url'); module.exports = async (ctx) => { const topic = ctx.params.topic; @@ -12,35 +13,59 @@ module.exports = async (ctx) => { const data = response.data; const $ = cheerio.load(data); - const list = $('div.FeedCard'); + // const list = $('div.FeedCard'); + const list = []; + $('div.FeedCard').each(function(index, item) { + if ( + $(item) + .find('a[class^=Component-headline]') + .attr('href') !== undefined + ) { + list.push(item); + } + }); + + const out = await Promise.all( + list.map(async (article) => { + const link = url.resolve( + 'https://apnews.com', + $(article) + .find('a[class^=Component-headline]') + .attr('href') + ); + + const [title, author, pubDate, description] = await ctx.cache.tryGet(link, async () => { + const result = await got.get(link); + + const $ = cheerio.load(result.data); + + const head = JSON.parse($('script[type="application/ld+json"]').html()); + + const title = head.headline; + const author = head.author.join(' & '); + const pubDate = head.datePublished; + + const text = $('div.Article').html(); + const imageUrl = head.image; + const description = `` + text; + + return [title, author, pubDate, description]; + }); + + const item = { + title: title, + description: description, + pubDate: pubDate, + link: link, + author: author, + }; + return Promise.resolve(item); + }) + ); ctx.state.data = { - title: $('title').text(), + title: 'AP News - ' + $('title').text(), link: `https://www.apnews.com/${topic}`, - item: - list && - list - .map((index, item) => { - item = $(item); - - return { - title: item - .find('h1[class^=Component-h1]') - .first() - .text(), - author: item - .find('span[class^=Component-bylines]') - .first() - .text() - .replace('By ', ''), - description: item - .find('div.content') - .first() - .text(), - pubDate: item.find('span[class^="Timestamp Component-root"]').attr('data-source'), - link: item.find('a[class^=Component-headline]').attr('href'), - }; - }) - .get(), + item: out, }; };