From 461f0cbc5c3e0873932b9cd6b84fd1d6b5a6b207 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:06 +0200 Subject: [PATCH 01/15] Log a warning if selector has no match --- src/sanitizer/index.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index bcec96dbd..884b225c9 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -9,7 +9,13 @@ export default async function sanitize(content, selector) { if (selector) { const { document } = new JSDOM(contentToSanitize).window; - contentToSanitize = document.querySelector(selector); + const selectedContent = document.querySelector(selector); + + if (selectedContent) { + contentToSanitize = selectedContent; + } else { + console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + } } const markdown = turndownService.turndown(contentToSanitize); From 55103b98c1fb1671c9b18e10ba90bc4b9f59cf6a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:34 +0200 Subject: [PATCH 02/15] Fix naming --- src/history/persistor.test.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/history/persistor.test.js b/src/history/persistor.test.js index 332044800..f3a482459 100644 --- a/src/history/persistor.test.js +++ b/src/history/persistor.test.js @@ -10,8 +10,7 @@ const POLICY_TYPE = 'terms_of_service'; const FILE_CONTENT = 'ToS fixture data with UTF-8 çhãràčtęrs'; const EXPECTED_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; - -describe('History', () => { +describe('Persistor', () => { describe('#save', () => { context('when service provider’s directory already exist', () => { after(() => { From 321a22a5e5df3d4691bd715c1f39876bd5bbc996 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:09 +0200 Subject: [PATCH 03/15] Fix whitespaces --- src/history/persistor.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index faa254ff7..0b1f23fc5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -16,9 +16,9 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa } export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { -const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; + const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; -if (!fsApi.existsSync(directory)){ + if (!fsApi.existsSync(directory)){ fsApi.mkdirSync(directory); } From 557c9ffab9cb44271112ad4387e8ee2335303324 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:32 +0200 Subject: [PATCH 04/15] Improve var naming --- src/history/persistor.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 0b1f23fc5..d32d010be 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,14 +28,14 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' - const filepath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); + const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); - const status = await git.status(filepath); + const status = await git.status(filePath); if (!(status.includes('modified') || status.includes('added'))) { return; } - await git.add(filepath); + await git.add(filePath); return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); } From c99b2901e2df4187f97f035da94093b46187b288 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:41 +0200 Subject: [PATCH 05/15] Add logs --- src/history/persistor.js | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index d32d010be..9e2a4f75a 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -22,7 +22,10 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit fsApi.mkdirSync(directory); } - return fs.writeFile(`${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`, fileContent); + const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; + return fs.writeFile(filePath, fileContent).then(() => { + console.log(`File ${filePath} saved.`) + }); } export async function commit({ serviceProviderId, policyType, isSanitized }) { @@ -37,5 +40,8 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); + return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + return sha; + }); } From 1d970d5d46b060be2019c7aa5578702f1d7b1156 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:24:13 +0200 Subject: [PATCH 06/15] List all service providers --- src/service_providers/index.js | 23 +++++++++++++++++ src/service_providers/index.test.js | 39 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 src/service_providers/index.js create mode 100644 src/service_providers/index.test.js diff --git a/src/service_providers/index.js b/src/service_providers/index.js new file mode 100644 index 000000000..f83cb7c93 --- /dev/null +++ b/src/service_providers/index.js @@ -0,0 +1,23 @@ +import dotenv from 'dotenv'; +dotenv.config(); + +import fs from 'fs'; +import path from 'path'; + +const __dirname = path.dirname(new URL(import.meta.url).pathname); +const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) + +export default function serviceProviders() { + const result = {}; + + fs.readdirSync(dirPath).forEach((filename) => { + const serviceProviderId = path.basename(filename, '.json'); + if (serviceProviderId.indexOf('.') === 0) { + return; // ignore invisible files such as .DS_Store + } + + result[serviceProviderId] = JSON.parse(fs.readFileSync(path.join(dirPath, filename))); + }); + + return result; +} diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js new file mode 100644 index 000000000..8972c0ce1 --- /dev/null +++ b/src/service_providers/index.test.js @@ -0,0 +1,39 @@ +import chai from 'chai'; +import nock from 'nock'; +import fs from 'fs'; +import path from 'path'; + +import serviceProviders from './index.js'; + +const expect = chai.expect; + +const expectResult = { + first_provider: { + serviceProviderName: 'First Provider', + documents: { + tos: { + url: 'https://www.firstprovider.com/tos', + contentSelector: 'main' + } + } + }, + second_provider: { + serviceProviderName: 'Second Provider', + documents: { + tos: { + url: 'https://www.secondprovider.com/tos', + contentSelector: 'main' + } + } + } + +} + +describe('ServiceProviders', () => { + describe('#serviceProviders', () => { + it('returns an object with all service providers manifests', () => { + const result = serviceProviders(); + expect(result).to.deep.equal(expectResult); + }); + }); +}); From 6fbe0e5f5ff2b4ff8e5dfd0e851b056c4a019393 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 07/15] Add Facebook, Twitter and Snapchat manifests --- providers/facebook.json | 9 +++++++++ providers/snapchat.json | 9 +++++++++ providers/twitter.json | 9 +++++++++ 3 files changed, 27 insertions(+) create mode 100644 providers/facebook.json create mode 100644 providers/snapchat.json create mode 100644 providers/twitter.json diff --git a/providers/facebook.json b/providers/facebook.json new file mode 100644 index 000000000..2818e2554 --- /dev/null +++ b/providers/facebook.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Facebook", + "documents": { + "tos": { + "url": "https://www.facebook.com/legal/terms/plain_text_terms", + "contentSelector": ".UIFullPage_Container" + } + } +} diff --git a/providers/snapchat.json b/providers/snapchat.json new file mode 100644 index 000000000..c446c8dd1 --- /dev/null +++ b/providers/snapchat.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Snapchat", + "documents": { + "tos": { + "url": "https://www.snap.com/terms/", + "contentSelector": ".textarticle-container" + } + } +} diff --git a/providers/twitter.json b/providers/twitter.json new file mode 100644 index 000000000..ba6132b59 --- /dev/null +++ b/providers/twitter.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Twitter", + "documents": { + "tos": { + "url": "https://twitter.com/tos", + "contentSelector": "main" + } + } +} From 01b4048f567ba7258e4452a919714d7f3e5207dc Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 08/15] Update terms for many service providers --- src/index.js | 26 +++++++-- src/index.test.js | 54 ++++++++++++------- test/fixtures/first_provider_terms_raw.html | 14 +++++ .../first_provider_terms_sanitized.md | 12 +++++ test/fixtures/second_provider_terms_raw.html | 14 +++++ .../second_provider_terms_sanitized.md | 12 +++++ test/providers/first_provider.json | 9 ++++ test/providers/second_provider.json | 9 ++++ 8 files changed, 128 insertions(+), 22 deletions(-) create mode 100644 test/fixtures/first_provider_terms_raw.html create mode 100644 test/fixtures/first_provider_terms_sanitized.md create mode 100644 test/fixtures/second_provider_terms_raw.html create mode 100644 test/fixtures/second_provider_terms_sanitized.md create mode 100644 test/providers/first_provider.json create mode 100644 test/providers/second_provider.json diff --git a/src/index.js b/src/index.js index e655523ec..1d1b132da 100644 --- a/src/index.js +++ b/src/index.js @@ -7,11 +7,29 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; +import serviceProviders from './service_providers/index.js'; + +export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { + const content = await scrape(documentUrl); + await persistRaw(serviceProviderId, documentType, content); + const sanitizedContent = await sanitize(content, documentContentSelector); + await persistSanitized(serviceProviderId, documentType, sanitizedContent); +}; export default async function updateTerms() { console.log('Start scraping and saving terms of service…') - const content = await scrape('https://www.facebook.com/legal/terms/plain_text_terms'); - await persistRaw('facebook', 'terms_of_service', content); - const sanitizedContent = await sanitize(content, '.UIFullPage_Container'); - await persistSanitized('facebook', 'terms_of_service', sanitizedContent); + + const promises = []; + const serviceProvidersManifests = serviceProviders(); + + Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { + const { documents } = serviceProvidersManifests[serviceProviderId]; + + Object.keys(documents).forEach(async (documentType) => { + const { url, contentSelector } = documents[documentType]; + promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + }); + }); + + return Promise.all(promises); }; diff --git a/src/index.test.js b/src/index.test.js index be57465a9..c39f83bee 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -9,19 +9,25 @@ const expect = chai.expect; import updateTerms from './index.js'; import { RAW_DIRECTORY, SANITIZED_DIRECTORY } from './history/persistor.js'; -const facebookTermsHTML = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_raw.html'), { encoding: 'utf8' }); -const facebookTermsMD = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_sanitized.md'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_ID = 'first_provider'; +const FIRST_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const FIRST_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_raw.html'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.facebook.com', { - reqheaders: { 'Accept-Language': 'en' } - }).get('/legal/terms/plain_text_terms') - .reply(200, facebookTermsHTML); +const SECOND_SERVICE_PROVIDER_ID = 'second_provider'; +const SECOND_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); +const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -const SERVICE_PROVIDER_ID = 'facebook'; -const POLICY_TYPE = 'terms_of_service'; +nock('https://www.firstprovider.com').get('/tos') + .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -const EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; -const EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.md`; +nock('https://www.secondprovider.com').get('/tos') + .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { describe('#updateTerms', () => { @@ -30,18 +36,30 @@ describe('CGUs', () => { }); after(() => { - fs.unlinkSync(EXPECTED_RAW_FILE_PATH); - fs.unlinkSync(EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); }); - it('persists terms in raw format', async () => { - const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); - expect(resultingRawTerms).to.be.equal(facebookTermsHTML); + it('persists terms in raw format for first service provider', () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_RAW); }); - it('persists terms in sanitized format', async () => { - const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); - expect(resultingSanitizedTerms).to.be.equal(facebookTermsMD); + it('persists terms in sanitized format for first service provider', () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_SANITIZED); + }); + + it('persists terms in raw format for second service provider', async () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_RAW); + }); + + it('persists terms in sanitized format for second service provider', async () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_SANITIZED); }); }); }); diff --git a/test/fixtures/first_provider_terms_raw.html b/test/fixtures/first_provider_terms_raw.html new file mode 100644 index 000000000..4971e89f1 --- /dev/null +++ b/test/fixtures/first_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + First provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/first_provider_terms_sanitized.md b/test/fixtures/first_provider_terms_sanitized.md new file mode 100644 index 000000000..967f45a0d --- /dev/null +++ b/test/fixtures/first_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + First provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/fixtures/second_provider_terms_raw.html b/test/fixtures/second_provider_terms_raw.html new file mode 100644 index 000000000..d76baf758 --- /dev/null +++ b/test/fixtures/second_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + Second provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/second_provider_terms_sanitized.md b/test/fixtures/second_provider_terms_sanitized.md new file mode 100644 index 000000000..8aef98bc9 --- /dev/null +++ b/test/fixtures/second_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + Second provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json new file mode 100644 index 000000000..7682ee091 --- /dev/null +++ b/test/providers/first_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "First Provider", + "documents": { + "tos": { + "url": "https://www.firstprovider.com/tos", + "contentSelector": "main" + } + } +} diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json new file mode 100644 index 000000000..d397066ad --- /dev/null +++ b/test/providers/second_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Second Provider", + "documents": { + "tos": { + "url": "https://www.secondprovider.com/tos", + "contentSelector": "main" + } + } +} From a3faf80058637f643dc335b7d4f3c10e9eb67af0 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:34 +0200 Subject: [PATCH 09/15] Improve logs --- src/history/persistor.js | 2 +- src/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 9e2a4f75a..3ed5b5b4f 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -40,7 +40,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); return sha; }); diff --git a/src/index.js b/src/index.js index 1d1b132da..243c5e38c 100644 --- a/src/index.js +++ b/src/index.js @@ -17,7 +17,7 @@ export async function updateServiceProviderDocument(serviceProviderId, documentT }; export default async function updateTerms() { - console.log('Start scraping and saving terms of service…') + console.log('Start scraping and saving terms of service…'); const promises = []; const serviceProvidersManifests = serviceProviders(); From b89d3610e42a994a67a5c5d153b9f0f721c64395 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:58 +0200 Subject: [PATCH 10/15] Prevent `unmodified` files to be committed --- src/history/persistor.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 3ed5b5b4f..6fc788a95 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -34,7 +34,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); - if (!(status.includes('modified') || status.includes('added'))) { + if (!status.match(/^\*?(modified|added)/)) { return; } From ffb9984e3e1625924d19fc43577561aca4ccc2ca Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:17:35 +0200 Subject: [PATCH 11/15] Avoid concurrent calls when modifying git history --- src/history/persistor.js | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 6fc788a95..51b18a51b 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,6 +28,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } +let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -38,10 +39,17 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - await git.add(filePath); - - return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - return sha; + // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… + // …and others caller of `persistor.commit` will wait + await lock; + lock = new Promise(resolveLock => { + git.add(filePath).then(() => { + git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolveLock(sha); + }); + }); }); + + return lock; } From 7569517b4175773d6343850181142c55d70e25fd Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:39:29 +0200 Subject: [PATCH 12/15] Improve logs --- src/index.js | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/index.js b/src/index.js index 243c5e38c..85f7db7af 100644 --- a/src/index.js +++ b/src/index.js @@ -9,10 +9,17 @@ import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; import serviceProviders from './service_providers/index.js'; -export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { +export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { + console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); + + console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); await persistRaw(serviceProviderId, documentType, content); + + console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); + + console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); await persistSanitized(serviceProviderId, documentType, sanitizedContent); }; @@ -23,11 +30,11 @@ export default async function updateTerms() { const serviceProvidersManifests = serviceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { - const { documents } = serviceProvidersManifests[serviceProviderId]; + const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; Object.keys(documents).forEach(async (documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); From f89f64b84e9b7707471891e20ffdbc05ec33361b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 17:37:58 +0200 Subject: [PATCH 13/15] Use async to manage concurrency on git modifications --- package-lock.json | 5 +++++ package.json | 1 + src/history/persistor.js | 29 ++++++++++++++++------------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/package-lock.json b/package-lock.json index 98be5417c..64b667ffd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -103,6 +103,11 @@ "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "dev": true }, + "async": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.0.tgz", + "integrity": "sha512-TR2mEZFVOj2pLStYxLht7TyfuRzaydfpxr3k9RpHIzMgw7A64dzsdqCxH1WJyQdoe8T10nDXd9wnEigmiuHIZw==" + }, "async-lock": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/async-lock/-/async-lock-1.2.4.tgz", diff --git a/package.json b/package.json index e26f4aef1..1289d262f 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "nock": "^12.0.3" }, "dependencies": { + "async": "^3.2.0", "console-stamp": "^0.2.9", "dotenv": "^8.2.0", "isomorphic-git": "^1.4.0", diff --git a/src/history/persistor.js b/src/history/persistor.js index 51b18a51b..32a0ee508 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -2,6 +2,13 @@ import path from 'path'; import fsApi from 'fs'; const fs = fsApi.promises; +import async from 'async'; + +const commitQueue = async.queue(_commit, 1); +commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { + reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); +}); + import * as git from './git.js'; const __dirname = path.dirname(new URL(import.meta.url).pathname); @@ -28,7 +35,6 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } -let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -39,17 +45,14 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… - // …and others caller of `persistor.commit` will wait - await lock; - lock = new Promise(resolveLock => { - git.add(filePath).then(() => { - git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - resolveLock(sha); - }); - }); - }); + return new Promise((resolve, reject) => { + commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); + }) +} - return lock; +async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { + await git.add(filePath) + const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolve(sha); } From 04766074b8606c980568723d3c97dcfaf9bc57e4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:16:59 +0200 Subject: [PATCH 14/15] Fix review comments --- src/history/persistor.js | 13 +++++++------ src/index.js | 16 ++++++++-------- src/index.test.js | 4 ++-- src/sanitizer/index.js | 2 +- src/service_providers/index.js | 10 +++++----- src/service_providers/index.test.js | 14 +++++++------- test/providers/first_provider.json | 2 +- test/providers/second_provider.json | 2 +- 8 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 32a0ee508..01b2f57b5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -6,7 +6,7 @@ import async from 'async'; const commitQueue = async.queue(_commit, 1); commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { - reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); + reject(new Error(`Could not commit ${policyType} for ${serviceProviderId} (${isSanitized ? 'sanitized' : 'raw'} version) due to error: ${err}`)); }); import * as git from './git.js'; @@ -25,8 +25,8 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; - if (!fsApi.existsSync(directory)){ - fsApi.mkdirSync(directory); + if (!fsApi.existsSync(directory)) { + await fs.mkdir(directory); } const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; @@ -38,6 +38,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' + // Git needs a path relative to the .git directory, not an absolute one const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); @@ -47,12 +48,12 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return new Promise((resolve, reject) => { commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); - }) + }); } async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { - await git.add(filePath) - const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + await git.add(filePath); + const sha = await git.commit(`Update ${isSanitized ? 'sanitized' : 'raw'} ${policyType} for ${serviceProviderId}`); console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); resolve(sha); } diff --git a/src/index.js b/src/index.js index 85f7db7af..9568a3098 100644 --- a/src/index.js +++ b/src/index.js @@ -7,36 +7,36 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; -import serviceProviders from './service_providers/index.js'; +import getServiceProviders from './service_providers/index.js'; export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); - await persistRaw(serviceProviderId, documentType, content); + persistRaw(serviceProviderId, documentType, content); console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); - await persistSanitized(serviceProviderId, documentType, sanitizedContent); + persistSanitized(serviceProviderId, documentType, sanitizedContent); }; export default async function updateTerms() { console.log('Start scraping and saving terms of service…'); - const promises = []; - const serviceProvidersManifests = serviceProviders(); + const documentUpdatePromises = []; + const serviceProvidersManifests = getServiceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; - Object.keys(documents).forEach(async (documentType) => { + Object.keys(documents).forEach((documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); + documentUpdatePromises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); - return Promise.all(promises); + return Promise.all(documentUpdatePromises); }; diff --git a/src/index.test.js b/src/index.test.js index c39f83bee..130d9ec24 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -23,10 +23,10 @@ const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECT const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.firstprovider.com').get('/tos') +nock('https://www.firstprovider.example').get('/tos') .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -nock('https://www.secondprovider.com').get('/tos') +nock('https://www.secondprovider.example').get('/tos') .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 884b225c9..35d3cd6b8 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + console.warn(`The provider selector "${selector}" has no match in the document.`) } } diff --git a/src/service_providers/index.js b/src/service_providers/index.js index f83cb7c93..5ecf355a9 100644 --- a/src/service_providers/index.js +++ b/src/service_providers/index.js @@ -1,13 +1,13 @@ -import dotenv from 'dotenv'; -dotenv.config(); - import fs from 'fs'; import path from 'path'; +import dotenv from 'dotenv'; +dotenv.config(); + const __dirname = path.dirname(new URL(import.meta.url).pathname); -const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) +const dirPath = path.resolve(__dirname, '../..', process.env.NODE_ENV === 'test' ? 'test' : '', 'providers'); -export default function serviceProviders() { +export default function getServiceProviders() { const result = {}; fs.readdirSync(dirPath).forEach((filename) => { diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js index 8972c0ce1..f1d8b51fc 100644 --- a/src/service_providers/index.test.js +++ b/src/service_providers/index.test.js @@ -3,16 +3,16 @@ import nock from 'nock'; import fs from 'fs'; import path from 'path'; -import serviceProviders from './index.js'; +import getServiceProviders from './index.js'; const expect = chai.expect; -const expectResult = { +const expected = { first_provider: { serviceProviderName: 'First Provider', documents: { tos: { - url: 'https://www.firstprovider.com/tos', + url: 'https://www.firstprovider.example/tos', contentSelector: 'main' } } @@ -21,7 +21,7 @@ const expectResult = { serviceProviderName: 'Second Provider', documents: { tos: { - url: 'https://www.secondprovider.com/tos', + url: 'https://www.secondprovider.example/tos', contentSelector: 'main' } } @@ -30,10 +30,10 @@ const expectResult = { } describe('ServiceProviders', () => { - describe('#serviceProviders', () => { + describe('#getServiceProviders', () => { it('returns an object with all service providers manifests', () => { - const result = serviceProviders(); - expect(result).to.deep.equal(expectResult); + const result = getServiceProviders(); + expect(result).to.deep.equal(expected); }); }); }); diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json index 7682ee091..90d6543ba 100644 --- a/test/providers/first_provider.json +++ b/test/providers/first_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "First Provider", "documents": { "tos": { - "url": "https://www.firstprovider.com/tos", + "url": "https://www.firstprovider.example/tos", "contentSelector": "main" } } diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json index d397066ad..b5b0f4c6a 100644 --- a/test/providers/second_provider.json +++ b/test/providers/second_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "Second Provider", "documents": { "tos": { - "url": "https://www.secondprovider.com/tos", + "url": "https://www.secondprovider.example/tos", "contentSelector": "main" } } From e8127e50ef6ddb7bb880012b2623817125ba4181 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:56:35 +0200 Subject: [PATCH 15/15] Wording --- src/sanitizer/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 35d3cd6b8..d0d49594c 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.warn(`The provider selector "${selector}" has no match in the document.`) + console.warn(`The provided selector "${selector}" has no match in the document.`) } }