diff --git a/package-lock.json b/package-lock.json index 7f8c7e63..6ef6ce9b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,7 @@ "lodash": "^4.17.21", "markdown-include": "^0.4.3", "parse-duration": "^1.0.0", + "pdf-parse": "^1.1.1", "srcset": "^4.0.0", "tabletojson": "^2.0.7", "ts-node": "^10.3.0", @@ -3452,6 +3453,11 @@ "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=" }, + "node_modules/node-ensure": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha1-7K52QVDemYYexcgQ/V0Jaxg5Mqc=" + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -3642,6 +3648,26 @@ "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==" }, + "node_modules/pdf-parse": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.1.tgz", + "integrity": "sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==", + "dependencies": { + "debug": "^3.1.0", + "node-ensure": "^0.0.0" + }, + "engines": { + "node": ">=6.8.1" + } + }, + "node_modules/pdf-parse/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dependencies": { + "ms": "^2.1.1" + } + }, "node_modules/picocolors": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", @@ -7053,6 +7079,11 @@ "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=" }, + "node-ensure": { + "version": "0.0.0", + 
"resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha1-7K52QVDemYYexcgQ/V0Jaxg5Mqc=" + }, "node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -7192,6 +7223,25 @@ "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==" }, + "pdf-parse": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.1.tgz", + "integrity": "sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==", + "requires": { + "debug": "^3.1.0", + "node-ensure": "^0.0.0" + }, + "dependencies": { + "debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "requires": { + "ms": "^2.1.1" + } + } + } + }, "picocolors": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", diff --git a/package.json b/package.json index f00d0228..01be5f78 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "lodash": "^4.17.21", "markdown-include": "^0.4.3", "parse-duration": "^1.0.0", + "pdf-parse": "^1.1.1", "srcset": "^4.0.0", "tabletojson": "^2.0.7", "ts-node": "^10.3.0", diff --git a/sites/rtb.gov.bn/__data__/Aneka 11 November 2021.pdf b/sites/rtb.gov.bn/__data__/Aneka 11 November 2021.pdf new file mode 100644 index 00000000..9e824685 Binary files /dev/null and b/sites/rtb.gov.bn/__data__/Aneka 11 November 2021.pdf differ diff --git a/sites/rtb.gov.bn/__data__/Sukmaindera 11 November 2021.pdf b/sites/rtb.gov.bn/__data__/Sukmaindera 11 November 2021.pdf new file mode 100644 index 00000000..b7c02aec Binary files /dev/null and b/sites/rtb.gov.bn/__data__/Sukmaindera 11 November 2021.pdf differ diff --git 
a/sites/rtb.gov.bn/rtb.gov.bn.config.js b/sites/rtb.gov.bn/rtb.gov.bn.config.js
new file mode 100644
index 00000000..4126bd89
--- /dev/null
+++ b/sites/rtb.gov.bn/rtb.gov.bn.config.js
@@ -0,0 +1,101 @@
+const pdf = require('pdf-parse')
+const dayjs = require('dayjs')
+const utc = require('dayjs/plugin/utc')
+const timezone = require('dayjs/plugin/timezone')
+const customParseFormat = require('dayjs/plugin/customParseFormat')
+
+dayjs.extend(utc)
+dayjs.extend(timezone)
+dayjs.extend(customParseFormat)
+
+// Grabber config for rtb.gov.bn: the guide for each channel and day is fetched as a PDF.
+module.exports = {
+  site: 'rtb.gov.bn',
+  /**
+   * Builds the address of the PDF guide for one channel and day.
+   * @param {object} params
+   * @param {object} params.channel - channel whose `site_id` is used in the file name
+   * @param {object} params.date - object providing `format()` for the requested day
+   * @returns {string} URL passed through `encodeURI`
+   */
+  url: function ({ channel, date }) {
+    return encodeURI(
+      `http://www.rtb.gov.bn/PublishingImages/SitePages/Programme Guide/${
+        channel.site_id
+      } ${date.format('DD MMMM YYYY')}.pdf`
+    )
+  },
+  /**
+   * @param {object} params
+   * @param {object} params.channel
+   * @returns {*} the `logo` value stored on the channel
+   */
+  logo({ channel }) {
+    return channel.logo
+  },
+  /**
+   * Turns the downloaded PDF into a list of programs.
+   * @param {object} params
+   * @param {*} params.buffer - PDF data handed to pdf-parse
+   * @param {object} params.date - day the guide belongs to
+   * @returns {Promise<Array<{title: string, start: string, stop: string}>>}
+   */
+  parser: async function ({ buffer, date }) {
+    // Becomes true once a start time of 12:00 or later was seen; any earlier
+    // clock time after that point is moved to the following day.
+    let PM = false
+    const programs = []
+    const items = await parseItems(buffer)
+    for (const item of items) {
+      let start = parseStart(item, date)
+      // An invalid dayjs value serializes to null, so drop the entry instead.
+      if (!start.isValid()) continue
+      if (start.hour() > 11) PM = true
+      if (start.hour() < 12 && PM) start = start.add(1, 'd')
+      // Placeholder duration of one hour; overwritten once the next program is known.
+      const stop = start.add(1, 'h')
+      if (programs.length) {
+        programs[programs.length - 1].stop = start.toJSON()
+      }
+      programs.push({
+        title: item.title,
+        start: start.toJSON(),
+        stop: stop.toJSON()
+      })
+    }
+
+    return programs
+  }
+}
+
+/**
+ * Combines the guide date with an item's HH:mm time, interpreted in Asia/Brunei.
+ * @param {{ time: string }} item
+ * @param {object} date - object providing `format()` for the guide's day
+ * @returns {object} value returned by `dayjs.tz` for the Asia/Brunei zone
+ */
+function parseStart(item, date) {
+  const time = `${date.format('YYYY-MM-DD')} ${item.time}`
+
+  return dayjs.tz(time, 'YYYY-MM-DD HH:mm', 'Asia/Brunei')
+}
+
+/**
+ * Extracts the schedule rows (lines starting with HH:mm) from the PDF text.
+ * @param {*} buffer - PDF data handed to pdf-parse
+ * @returns {Promise<Array<{time: string, title: string}>>} empty when the PDF cannot be read
+ */
+async function parseItems(buffer) {
+  // Best effort on purpose: a missing or unreadable PDF means "no guide", not a failure.
+  const data = await pdf(buffer).catch(() => null)
+  if (!data) return []
+
+  return data.text.split('\n').reduce((items, line) => {
+    // Whitespace after the time is optional, so every row that starts with a
+    // time yields a usable `time` even when no separator precedes the title.
+    const match = line.trim().match(/^(\d{2}:\d{2})\s*(.*)/)
+    if (match) items.push({ time: match[1], title: match[2] })
+
+    return items
+  }, [])
+}
diff --git a/sites/rtb.gov.bn/rtb.gov.bn.test.js b/sites/rtb.gov.bn/rtb.gov.bn.test.js new
file mode 100644 index 00000000..474df3d9 --- /dev/null +++ b/sites/rtb.gov.bn/rtb.gov.bn.test.js @@ -0,0 +1,93 @@ +// npx epg-grabber --config=sites/rtb.gov.bn/rtb.gov.bn.config.js --channels=sites/rtb.gov.bn/rtb.gov.bn_bn.channels.xml --output=.gh-pages/guides/bn/rtb.gov.bn.epg.xml --days=2 + +const { parser, url, logo } = require('./rtb.gov.bn.config.js') +const path = require('path') +const fs = require('fs') +const dayjs = require('dayjs') +const utc = require('dayjs/plugin/utc') +const customParseFormat = require('dayjs/plugin/customParseFormat') +dayjs.extend(customParseFormat) +dayjs.extend(utc) + +const date = dayjs.utc('2021-11-11', 'YYYY-MM-DD').startOf('d') +const channel = { + site_id: 'Sukmaindera', + xmltv_id: 'RTBSukmaindera.bn', + logo: 'http://www.rtb.gov.bn/SiteAssets/SitePages/TV%20Programme%20Division/LOGO%20RTB%20SUKMAINDERA.png' +} + +it('can generate valid url', () => { + expect(url({ channel, date })).toBe( + 'http://www.rtb.gov.bn/PublishingImages/SitePages/Programme%20Guide/Sukmaindera%2011%20November%202021.pdf' + ) +}) + +it('can get logo url', () => { + expect(logo({ channel })).toBe( + 'http://www.rtb.gov.bn/SiteAssets/SitePages/TV%20Programme%20Division/LOGO%20RTB%20SUKMAINDERA.png' + ) +}) + +it('can parse Sukmaindera 11 November 2021.pdf', done => { + const buffer = fs.readFileSync( + path.resolve(__dirname, '__data__/Sukmaindera 11 November 2021.pdf'), + { + charset: 'utf8' + } + ) + parser({ buffer, date }) + .then(results => { + expect(results.length).toBe(47) + expect(results[0]).toMatchObject({ + start: '2021-11-10T22:00:00.000Z', + stop: '2021-11-10T22:05:00.000Z', + title: 'NATIONAL ANTHEM' + }) + expect(results[46]).toMatchObject({ + start: '2021-11-11T21:30:00.000Z', + stop: '2021-11-11T22:30:00.000Z', + title: 'BACAAN SURAH YASSIN' + }) + done() + }) + .catch(error => { + done(error) + }) +}) + +it('can parse Aneka 11 November 2021.pdf', done => { + const buffer = fs.readFileSync(path.resolve(__dirname, '__data__/Aneka 11 
November 2021.pdf'), { + charset: 'utf8' + }) + parser({ buffer, date }) + .then(results => { + expect(results.length).toBe(26) + expect(results[4]).toMatchObject({ + start: '2021-11-11T03:00:00.000Z', + stop: '2021-11-11T04:05:00.000Z', + title: 'DRAMA TURKI:' + }) + done() + }) + .catch(error => { + done(error) + }) +}) + +it('can handle empty guide', done => { + parser({ + date, + channel, + content: `