diff --git a/docs/traditional-media.md b/docs/traditional-media.md index e7614727e0..17bb39843a 100644 --- a/docs/traditional-media.md +++ b/docs/traditional-media.md @@ -2043,11 +2043,11 @@ category 对应的关键词有 ### 最新上線 - + ### 主頻道 - + | 主頻道名稱 | 主頻道 ID | | ----- | ------ | @@ -2069,7 +2069,7 @@ category 对应的关键词有 ### 子頻道 - + ## 卫报 The Guardian diff --git a/lib/v2/cw/master.js b/lib/v2/cw/master.js index e2e81022e3..aa34918c10 100644 --- a/lib/v2/cw/master.js +++ b/lib/v2/cw/master.js @@ -1,25 +1,30 @@ const cheerio = require('cheerio'); -const { baseUrl, cookieJar, got, parseList, parseItems, getCookie } = require('./utils'); +const { baseUrl, parseList, parseItems, getCookie, setCookies } = require('./utils'); module.exports = async (ctx) => { const { channel } = ctx.params; const pageUrl = `${baseUrl}/masterChannel.action`; - if (!cookieJar) { - await getCookie(); - } - const { data: response } = await got(pageUrl, { - headers: { - Referer: baseUrl, - }, - cookieJar, - searchParams: { - idMasterChannel: channel, - }, + + const browser = await require('@/utils/puppeteer')(); + const cookie = await getCookie(browser, ctx.cache.tryGet); + const page = await browser.newPage(); + await page.setRequestInterception(true); + page.on('request', (request) => { + request.resourceType() === 'document' || request.resourceType() === 'script' ? request.continue() : request.abort(); }); + await setCookies(page, cookie, 'cw.com.tw'); + await page.goto(`${pageUrl}?idMasterChannel=${channel}`, { + waitUntil: 'domcontentloaded', + }); + + const response = await page.evaluate(() => document.documentElement.innerHTML); + await page.close(); const $ = cheerio.load(response); - const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 100); - const items = await parseItems(list, ctx.cache.tryGet); + const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 12); + const items = await parseItems(list, browser, ctx.cache.tryGet); + + await browser.close(); ctx.state.data = { title: $('head title').text(), diff --git a/lib/v2/cw/sub.js b/lib/v2/cw/sub.js index f995171f62..134ff61669 100644 --- a/lib/v2/cw/sub.js +++ b/lib/v2/cw/sub.js @@ -1,25 +1,30 @@ const cheerio = require('cheerio'); -const { baseUrl, cookieJar, got, parseList, parseItems, getCookie } = require('./utils'); +const { baseUrl, parseList, parseItems, getCookie, setCookies } = require('./utils'); module.exports = async (ctx) => { const { channel } = ctx.params; const pageUrl = `${baseUrl}/subchannel.action`; - if (!cookieJar) { - await getCookie(); - } - const { data: response } = await got(pageUrl, { - headers: { - Referer: baseUrl, - }, - cookieJar, - searchParams: { - idSubChannel: channel, - }, + + const browser = await require('@/utils/puppeteer')(); + const cookie = await getCookie(browser, ctx.cache.tryGet); + const page = await browser.newPage(); + await page.setRequestInterception(true); + page.on('request', (request) => { + request.resourceType() === 'document' || request.resourceType() === 'script' ? request.continue() : request.abort(); }); + await setCookies(page, cookie, 'cw.com.tw'); + await page.goto(`${pageUrl}?idSubChannel=${channel}`, { + waitUntil: 'domcontentloaded', + }); + + const response = await page.evaluate(() => document.documentElement.innerHTML); + await page.close(); const $ = cheerio.load(response); - const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 100); - const items = await parseItems(list, ctx.cache.tryGet); + const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 12); + const items = await parseItems(list, browser, ctx.cache.tryGet); + + await browser.close(); ctx.state.data = { title: $('head title').text(), diff --git a/lib/v2/cw/today.js b/lib/v2/cw/today.js index 6e494e37b2..d878b32458 100644 --- a/lib/v2/cw/today.js +++ b/lib/v2/cw/today.js @@ -1,21 +1,29 @@ const cheerio = require('cheerio'); -const { baseUrl, cookieJar, got, parseList, parseItems, getCookie } = require('./utils'); +const { baseUrl, parseList, parseItems, getCookie, setCookies } = require('./utils'); module.exports = async (ctx) => { const pageUrl = `${baseUrl}/today`; - if (!cookieJar) { - await getCookie(); - } - const { data: response } = await got(pageUrl, { - headers: { - Referer: baseUrl, - }, - cookieJar, + + const browser = await require('@/utils/puppeteer')(); + const cookie = await getCookie(browser, ctx.cache.tryGet); + const page = await browser.newPage(); + await page.setRequestInterception(true); + page.on('request', (request) => { + request.resourceType() === 'document' || request.resourceType() === 'script' ? request.continue() : request.abort(); }); + await setCookies(page, cookie, 'cw.com.tw'); + await page.goto(pageUrl, { + waitUntil: 'domcontentloaded', + }); + + const response = await page.evaluate(() => document.documentElement.innerHTML); + await page.close(); const $ = cheerio.load(response); - const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 100); - const items = await parseItems(list, ctx.cache.tryGet); + const list = parseList($, ctx.query.limit ? Number(ctx.query.limit) : 30); + const items = await parseItems(list, browser, ctx.cache.tryGet); + + await browser.close(); ctx.state.data = { title: $('head title').text(), diff --git a/lib/v2/cw/utils.js b/lib/v2/cw/utils.js index 2e583e9dd7..a3735a61d1 100644 --- a/lib/v2/cw/utils.js +++ b/lib/v2/cw/utils.js @@ -1,26 +1,27 @@ const cheerio = require('cheerio'); const { parseDate } = require('@/utils/parse-date'); -const { Cookie, CookieJar } = require('tough-cookie'); -let cookieJar; -const config = require('@/config').value; +const { getCookies, setCookies } = require('@/utils/puppeteer-utils'); +let cookie; const baseUrl = 'https://www.cw.com.tw'; -const got = require('@/utils/got').extend({ - headers: { - 'User-Agent': config.trueUA, - }, -}); - -const getCookie = async () => { - const response = await got(`${baseUrl}/user/get/cookie-bar`); - const cookies = response.headers['set-cookie']; - if (Array.isArray(cookies)) { - cookieJar = cookies.map(Cookie.parse); - } else { - cookieJar = [Cookie.parse(cookieJar)]; +const getCookie = async (browser, tryGet) => { + if (!cookie) { + cookie = await tryGet('cw:cookie', async () => { + const page = await browser.newPage(); + await page.setRequestInterception(true); + page.on('request', (request) => { + request.resourceType() === 'document' || request.resourceType() === 'script' ? request.continue() : request.abort(); + }); + await page.goto(`${baseUrl}/user/get/cookie-bar`, { + waitUntil: 'domcontentloaded', + }); + cookie = await getCookies(page); + await page.close(); + return cookie; + }); } - cookieJar = CookieJar.fromJSON({ cookies: cookieJar }); + return cookie; }; const parseList = ($, limit) => @@ -36,15 +37,24 @@ const parseList = ($, limit) => }) .slice(0, limit); -const parseItems = (list, tryGet) => +const parseItems = (list, browser, tryGet) => Promise.all( list.map((item) => tryGet(item.link, async () => { - const { data: response } = await got(item.link, { - cookieJar, + const page = await browser.newPage(); + await page.setRequestInterception(true); + page.on('request', (request) => { + request.resourceType() === 'document' || request.resourceType() === 'script' ? request.continue() : request.abort(); + }); + await setCookies(page, cookie, 'cw.com.tw'); + await page.goto(item.link, { + waitUntil: 'domcontentloaded', }); + const response = await page.evaluate(() => document.documentElement.innerHTML); + await page.close(); const $ = cheerio.load(response); + const meta = JSON.parse($('head script[type="application/ld+json"]').eq(0).text()); $('.article__head .breadcrumb, .article__head h1, .article__provideViews, .ad').remove(); $('img.lazyload').each((_, img) => { @@ -67,9 +77,8 @@ const parseItems = (list, tryGet) => module.exports = { baseUrl, - cookieJar, - got, getCookie, + setCookies, parseList, parseItems, };