From a36e6a0cb98687649abd317ac0ff32ab55944cc0 Mon Sep 17 00:00:00 2001 From: Origami404 Date: Tue, 28 Apr 2020 10:26:26 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E4=B8=AD=E5=8D=8E?= =?UTF-8?q?=E4=BA=BA=E6=B0=91=E5=85=B1=E5=92=8C=E5=9B=BD=E5=86=9C=E4=B8=9A?= =?UTF-8?q?=E5=86=9C=E6=9D=91=E9=83=A8=E7=9A=84=E6=96=B0=E9=97=BB=E5=8F=8A?= =?UTF-8?q?=E5=85=AC=E5=91=8A=20(#4544)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/government.md | 15 ++++ lib/router.js | 3 + lib/routes/gov/moa/moa.js | 164 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+) create mode 100644 lib/routes/gov/moa/moa.js diff --git a/docs/government.md b/docs/government.md index f8e435e879..9be1775838 100644 --- a/docs/government.md +++ b/docs/government.md @@ -330,6 +330,21 @@ pageClass: routes +## 中华人民共和国农业农村部 + +### 新闻 + + + +更多例子: + +- `农业农村部动态`的网页链接是`http://www.moa.gov.cn/xw/zwdt/`, 对应的`suburl`是`xw/zwdt` +- `财务公开`的网页链接是`http://www.moa.gov.cn/gk/cwgk_1/`, 对应的`suburl`是`gk/cwgk_1` +- 像[政策法规](http://www.moa.gov.cn/gk/zcfg/)这种页面(`http://www.moa.gov.cn/gk/zcfg/`), 它**不是**一个合法的分类目录, 它是`法律`, `行政法规`, `部门规章`等一堆栏目的集合, 这时候请点开对应栏目的`更多>>`进入栏目的目录, 再根据上面的规则提取`suburl` +- 特别地, `图片新闻`对应的`suburl`为`xw/tpxw/`, `最新公开`对应的`suburl`为`govpublic` + + + ## 中华人民共和国商务部 ### 政务公开 diff --git a/lib/router.js b/lib/router.js index d25bb13b00..b5857251e3 100644 --- a/lib/router.js +++ b/lib/router.js @@ -1724,6 +1724,9 @@ router.get('/cninfo/fund_announcement/:code?/:searchkey?', require('./routes/cni // 中央纪委国家监委网站 router.get('/ccdi/scdc', require('./routes/ccdi/scdc')); +// 中华人民共和国农业农村部 +router.get('/gov/moa/:suburl(.*)', require('./routes/gov/moa/moa')); + // 香水时代 router.get('/nosetime/:id/:type/:sort?', require('./routes/nosetime/comment')); router.get('/nosetime/home', require('./routes/nosetime/home')); diff --git a/lib/routes/gov/moa/moa.js b/lib/routes/gov/moa/moa.js new file mode 100644 index 0000000000..03eaebdc53 --- /dev/null +++ b/lib/routes/gov/moa/moa.js @@ -0,0 +1,164 @@ +const got = require('@/utils/got'); +const cheerio = require('cheerio'); +const date = require('@/utils/date'); + +const hostUrl = 'http://www.moa.gov.cn/'; +const hostUrlObj = new URL(hostUrl); // 用于在下面判断host + +module.exports = async (ctx) => { + const rawSuburl = ctx.params.suburl; + const suburl = rawSuburl.slice(-1) === '/' ? rawSuburl : rawSuburl + '/'; + + // 特殊处理两个, 其他的栏目都可以找到那种一个列表下去的目录 + if (suburl === 'xw/tpxw/') { + // 图片新闻 + ctx.state.data = await dealChannel(ctx, suburl, { + channelTitleSelector: '.pub-media2-head', + listSelector: '.tupian_list li', + titleSelector: 'a[class="block w_fill ellipsis adc ahc"]', + dateSelector: 'span', + }); + } else if (suburl === 'govpublic/') { + // 公开公告 + ctx.state.data = await dealChannel(ctx, 'govpublic/1/index.htm', { + channelTitleSelector: 'title', + listSelector: '.gongkai_centerRList li', + titleSelector: 'a', + dateSelector: 'span', + }); + } else { + ctx.state.data = await dealChannel(ctx, suburl, { + channelTitleSelector: '.pub-media1-head-title', + listSelector: '.ztlb', + titleSelector: 'a', + dateSelector: 'span', + }); + } +}; + +// 处理文章列表, 从那里获得一堆要爬取的页面, 然后爬取 +async function dealChannel(ctx, suburl, selectors) { + const { channelTitleSelector, listSelector, titleSelector, dateSelector } = selectors; + + // 为了与下面解析相对链接的dealLink配合, 这里末尾必须保证有一条斜杠 + const url = hostUrl + suburl; + const respone = await got.get(url); + const $ = cheerio.load(respone.data); + + const channelTitle = $(channelTitleSelector).text(); + + const pageInfos = $(listSelector) + .map((i, e) => { + const element = $(e); + const titleElement = element.find(titleSelector); + + const title = titleElement.text(); + const [link, pageType] = dealLink(titleElement, url); + const dateraw = element.find(dateSelector).text().trim(); + + return { + pageType: pageType, + title: title, + link: link, + // 先在这里获取一个精确到日的时间 + // 如果是正常文章的话可以在那里提取更精确的时间 + // 如果是公示文章或者站外文章的话只能用这个保底了 + pubDate: date(dateraw), + }; + }) + .get(); + + const items = await Promise.all( + pageInfos.map(async (item) => { + const link = item.link; + + const cache = await ctx.cache.get(link); + if (cache) { + return Promise.resolve(JSON.parse(cache)); + } + + if (item.pageType === 'normal') { + // 正常文章 + item = await dealNormalPage(link, item); + } else if (item.pageType === 'govpublic') { + // 公示文章 + item = await dealGovpublicPage(link, item); + } else { + // 外部文章 + item.description = `外部链接: ${item.link}`; + item.author = 'unknown'; + } + + ctx.cache.set(link, JSON.stringify(item)); + return Promise.resolve(item); + }) + ); + + return { + title: `中华人民共和国农业农村部 - ${channelTitle}`, + link: url, + item: items, + }; +} + +// 处理正常文章, 例子: http://www.moa.gov.cn/gk/rsxx_1/202004/t20200421_6342037.htm +async function dealNormalPage(link, item) { + const reponse = await got.get(link); + const $ = cheerio.load(reponse.data); + const metaElements = $('.bjjMAuthorBox span.dc_3').toArray(); + + // 政府网站变动不频繁, 写死第几个应该没有多大关系 + const author = $(metaElements[1]).text(); + const source = $(metaElements[2]).text(); + item.author = `${author} ${source}`; + + // 对于这个网站内的链接, 能提供更精确的时间 + // 这个的日期跟时间之间的空格数量好像会乱变的 + const exactTime = $(metaElements[0]).text(); + const dateMatch = /\d{4}-\d{2}-\d{2}/.exec(exactTime); + const timeMatch = /\d{2}:\d{2}/.exec(exactTime); + item.pubDate = date(`${dateMatch[0]} ${timeMatch[0]}`); + + item.description = $('.arc_body').html(); + + return item; +} + +// 处理那种带索引号的公示文章, 例子: http://www.moa.gov.cn/govpublic/XZQYJ/202004/t20200420_6341913.htm +async function dealGovpublicPage(link, item) { + const respone = await got.get(link); + const $ = cheerio.load(respone.data); + + const head = $('ul.head'); + const body = $('.arc_body'); + + // 日期时间作者等详细信息被包含在了head里面 + // 况且都是政府部门, 提取作者信息无多大意义(还没有特别在页面标注出来), 干脆写在正文 + // 而且我也搞不懂到底是发布部门算作者还是写出来公告的部门算还是那个人算... + + item.description = head.html() + body.html(); + + return item; +} + +// 处理相对url 和 按链接对文章类型进行分类 +function dealLink(element, url) { + const rawLink = element.attr('href'); + const { host, href } = new URL(rawLink, url); + + // host不同的是外部文章, outside + // url里带govpublic的都是公示文章, govpublic + // 其他的都算普通文章, normal + let pageType = null; + if (host !== hostUrlObj.host) { + pageType = 'outside'; + } else { + if (href.indexOf('govpublic') !== -1) { + pageType = 'govpublic'; + } else { + pageType = 'normal'; + } + } + + return [href, pageType]; +}