From 8ef067ebe94db1d5f6dbc08ae26cc5a35bd533b4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:06 +0200 Subject: [PATCH 01/15] Log a warning if selector has no match --- src/sanitizer/index.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index bcec96dbd..884b225c9 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -9,7 +9,13 @@ export default async function sanitize(content, selector) { if (selector) { const { document } = new JSDOM(contentToSanitize).window; - contentToSanitize = document.querySelector(selector); + const selectedContent = document.querySelector(selector); + + if (selectedContent) { + contentToSanitize = selectedContent; + } else { + console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + } } const markdown = turndownService.turndown(contentToSanitize); From dd0ff30cc70c9451d0d56f663550072aad2127f8 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:34 +0200 Subject: [PATCH 02/15] Fix naming --- src/history/persistor.test.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/history/persistor.test.js b/src/history/persistor.test.js index 332044800..f3a482459 100644 --- a/src/history/persistor.test.js +++ b/src/history/persistor.test.js @@ -10,8 +10,7 @@ const POLICY_TYPE = 'terms_of_service'; const FILE_CONTENT = 'ToS fixture data with UTF-8 çhãràčtęrs'; const EXPECTED_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; - -describe('History', () => { +describe('Persistor', () => { describe('#save', () => { context('when service provider’s directory already exist', () => { after(() => { From b271388ae0d7ddd9174bb9b193335635cda9c07c Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:09 +0200 Subject: [PATCH 03/15] Fix whitespaces --- src/history/persistor.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index faa254ff7..0b1f23fc5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -16,9 +16,9 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa } export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { -const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; + const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; -if (!fsApi.existsSync(directory)){ + if (!fsApi.existsSync(directory)){ fsApi.mkdirSync(directory); } From 100c11473df57b119420d863099f0866a8296804 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:32 +0200 Subject: [PATCH 04/15] Improve var naming --- src/history/persistor.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 0b1f23fc5..d32d010be 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,14 +28,14 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' - const filepath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); + const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); - const status = await git.status(filepath); + const status = await git.status(filePath); if (!(status.includes('modified') || status.includes('added'))) { return; } - await git.add(filepath); + await git.add(filePath); return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); } From 6100140907f91b4814b49971b7fef512406e4c3a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:41 +0200 Subject: [PATCH 05/15] Add logs --- src/history/persistor.js | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index d32d010be..9e2a4f75a 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -22,7 +22,10 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit fsApi.mkdirSync(directory); } - return fs.writeFile(`${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`, fileContent); + const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; + return fs.writeFile(filePath, fileContent).then(() => { + console.log(`File ${filePath} saved.`) + }); } export async function commit({ serviceProviderId, policyType, isSanitized }) { @@ -37,5 +40,8 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); + return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + return sha; + }); } From 1084c523d97830201d2af34149428c826bdd78e6 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:24:13 +0200 Subject: [PATCH 06/15] List all service providers --- src/service_providers/index.js | 23 +++++++++++++++++ src/service_providers/index.test.js | 39 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 src/service_providers/index.js create mode 100644 src/service_providers/index.test.js diff --git a/src/service_providers/index.js b/src/service_providers/index.js new file mode 100644 index 000000000..f83cb7c93 --- /dev/null +++ b/src/service_providers/index.js @@ -0,0 +1,23 @@ +import dotenv from 'dotenv'; +dotenv.config(); + +import fs from 'fs'; +import path from 'path'; + +const __dirname = path.dirname(new URL(import.meta.url).pathname); +const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) + +export default function serviceProviders() { + const result = {}; + + fs.readdirSync(dirPath).forEach((filename) => { + const serviceProviderId = path.basename(filename, '.json'); + if (serviceProviderId.indexOf('.') === 0) { + return; // ignore invisible files such as .DS_Store + } + + result[serviceProviderId] = JSON.parse(fs.readFileSync(path.join(dirPath, filename))); + }); + + return result; +} diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js new file mode 100644 index 000000000..8972c0ce1 --- /dev/null +++ b/src/service_providers/index.test.js @@ -0,0 +1,39 @@ +import chai from 'chai'; +import nock from 'nock'; +import fs from 'fs'; +import path from 'path'; + +import serviceProviders from './index.js'; + +const expect = chai.expect; + +const expectResult = { + first_provider: { + serviceProviderName: 'First Provider', + documents: { + tos: { + url: 'https://www.firstprovider.com/tos', + contentSelector: 'main' + } + } + }, + second_provider: { + serviceProviderName: 'Second Provider', + documents: { + tos: { + url: 'https://www.secondprovider.com/tos', + contentSelector: 'main' + } + } + } + +} + +describe('ServiceProviders', () => { + describe('#serviceProviders', () => { + it('returns an object with all service providers manifests', () => { + const result = serviceProviders(); + expect(result).to.deep.equal(expectResult); + }); + }); +}); From 0104de6873230d9d6444dc834348c6d0feccfdbf Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 07/15] Add Facebook, Twitter and Snapchat manifests --- providers/facebook.json | 9 +++++++++ providers/snapchat.json | 9 +++++++++ providers/twitter.json | 9 +++++++++ 3 files changed, 27 insertions(+) create mode 100644 providers/facebook.json create mode 100644 providers/snapchat.json create mode 100644 providers/twitter.json diff --git a/providers/facebook.json b/providers/facebook.json new file mode 100644 index 000000000..2818e2554 --- /dev/null +++ b/providers/facebook.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Facebook", + "documents": { + "tos": { + "url": "https://www.facebook.com/legal/terms/plain_text_terms", + "contentSelector": ".UIFullPage_Container" + } + } +} diff --git a/providers/snapchat.json b/providers/snapchat.json new file mode 100644 index 000000000..c446c8dd1 --- /dev/null +++ b/providers/snapchat.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Snapchat", + "documents": { + "tos": { + "url": "https://www.snap.com/terms/", + "contentSelector": ".textarticle-container" + } + } +} diff --git a/providers/twitter.json b/providers/twitter.json new file mode 100644 index 000000000..ba6132b59 --- /dev/null +++ b/providers/twitter.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Twitter", + "documents": { + "tos": { + "url": "https://twitter.com/tos", + "contentSelector": "main" + } + } +} From 8079149a0639f53b7de4d6aec81ccb34035b8694 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 08/15] Update terms for many service providers --- src/index.js | 26 +++++++-- src/index.test.js | 54 ++++++++++++------- test/fixtures/first_provider_terms_raw.html | 14 +++++ .../first_provider_terms_sanitized.md | 12 +++++ test/fixtures/second_provider_terms_raw.html | 14 +++++ .../second_provider_terms_sanitized.md | 12 +++++ test/providers/first_provider.json | 9 ++++ test/providers/second_provider.json | 9 ++++ 8 files changed, 128 insertions(+), 22 deletions(-) create mode 100644 test/fixtures/first_provider_terms_raw.html create mode 100644 test/fixtures/first_provider_terms_sanitized.md create mode 100644 test/fixtures/second_provider_terms_raw.html create mode 100644 test/fixtures/second_provider_terms_sanitized.md create mode 100644 test/providers/first_provider.json create mode 100644 test/providers/second_provider.json diff --git a/src/index.js b/src/index.js index e655523ec..1d1b132da 100644 --- a/src/index.js +++ b/src/index.js @@ -7,11 +7,29 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; +import serviceProviders from './service_providers/index.js'; + +export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { + const content = await scrape(documentUrl); + await persistRaw(serviceProviderId, documentType, content); + const sanitizedContent = await sanitize(content, documentContentSelector); + await persistSanitized(serviceProviderId, documentType, sanitizedContent); +}; export default async function updateTerms() { console.log('Start scraping and saving terms of service…') - const content = await scrape('https://www.facebook.com/legal/terms/plain_text_terms'); - await persistRaw('facebook', 'terms_of_service', content); - const sanitizedContent = await sanitize(content, '.UIFullPage_Container'); - await persistSanitized('facebook', 'terms_of_service', sanitizedContent); + + const promises = []; + const serviceProvidersManifests = serviceProviders(); + + Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { + const { documents } = serviceProvidersManifests[serviceProviderId]; + + Object.keys(documents).forEach(async (documentType) => { + const { url, contentSelector } = documents[documentType]; + promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + }); + }); + + return Promise.all(promises); }; diff --git a/src/index.test.js b/src/index.test.js index be57465a9..c39f83bee 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -9,19 +9,25 @@ const expect = chai.expect; import updateTerms from './index.js'; import { RAW_DIRECTORY, SANITIZED_DIRECTORY } from './history/persistor.js'; -const facebookTermsHTML = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_raw.html'), { encoding: 'utf8' }); -const facebookTermsMD = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_sanitized.md'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_ID = 'first_provider'; +const FIRST_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const FIRST_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_raw.html'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.facebook.com', { - reqheaders: { 'Accept-Language': 'en' } - }).get('/legal/terms/plain_text_terms') - .reply(200, facebookTermsHTML); +const SECOND_SERVICE_PROVIDER_ID = 'second_provider'; +const SECOND_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); +const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -const SERVICE_PROVIDER_ID = 'facebook'; -const POLICY_TYPE = 'terms_of_service'; +nock('https://www.firstprovider.com').get('/tos') + .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -const EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; -const EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.md`; +nock('https://www.secondprovider.com').get('/tos') + .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { describe('#updateTerms', () => { @@ -30,18 +36,30 @@ describe('CGUs', () => { }); after(() => { - fs.unlinkSync(EXPECTED_RAW_FILE_PATH); - fs.unlinkSync(EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); }); - it('persists terms in raw format', async () => { - const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); - expect(resultingRawTerms).to.be.equal(facebookTermsHTML); + it('persists terms in raw format for first service provider', () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_RAW); }); - it('persists terms in sanitized format', async () => { - const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); - expect(resultingSanitizedTerms).to.be.equal(facebookTermsMD); + it('persists terms in sanitized format for first service provider', () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_SANITIZED); + }); + + it('persists terms in raw format for second service provider', async () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_RAW); + }); + + it('persists terms in sanitized format for second service provider', async () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_SANITIZED); }); }); }); diff --git a/test/fixtures/first_provider_terms_raw.html b/test/fixtures/first_provider_terms_raw.html new file mode 100644 index 000000000..4971e89f1 --- /dev/null +++ b/test/fixtures/first_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + First provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/first_provider_terms_sanitized.md b/test/fixtures/first_provider_terms_sanitized.md new file mode 100644 index 000000000..967f45a0d --- /dev/null +++ b/test/fixtures/first_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + First provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/fixtures/second_provider_terms_raw.html b/test/fixtures/second_provider_terms_raw.html new file mode 100644 index 000000000..d76baf758 --- /dev/null +++ b/test/fixtures/second_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + Second provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/second_provider_terms_sanitized.md b/test/fixtures/second_provider_terms_sanitized.md new file mode 100644 index 000000000..8aef98bc9 --- /dev/null +++ b/test/fixtures/second_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + Second provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json new file mode 100644 index 000000000..7682ee091 --- /dev/null +++ b/test/providers/first_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "First Provider", + "documents": { + "tos": { + "url": "https://www.firstprovider.com/tos", + "contentSelector": "main" + } + } +} diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json new file mode 100644 index 000000000..d397066ad --- /dev/null +++ b/test/providers/second_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Second Provider", + "documents": { + "tos": { + "url": "https://www.secondprovider.com/tos", + "contentSelector": "main" + } + } +} From 6ca68a7b4694658fd023b543619e4b337a75e9b2 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:34 +0200 Subject: [PATCH 09/15] Improve logs --- src/history/persistor.js | 2 +- src/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 9e2a4f75a..3ed5b5b4f 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -40,7 +40,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); return sha; }); diff --git a/src/index.js b/src/index.js index 1d1b132da..243c5e38c 100644 --- a/src/index.js +++ b/src/index.js @@ -17,7 +17,7 @@ export async function updateServiceProviderDocument(serviceProviderId, documentT }; export default async function updateTerms() { - console.log('Start scraping and saving terms of service…') + console.log('Start scraping and saving terms of service…'); const promises = []; const serviceProvidersManifests = serviceProviders(); From 23b6497c7b078b9e56a55a3d8e791d968f620307 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:58 +0200 Subject: [PATCH 10/15] Prevent `unmodified` files to be committed --- src/history/persistor.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 3ed5b5b4f..6fc788a95 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -34,7 +34,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); - if (!(status.includes('modified') || status.includes('added'))) { + if (!status.match(/^\*?(modified|added)/)) { return; } From 3331e1f1b298be5b8f9caa63f3c0d3a8866f19da Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:17:35 +0200 Subject: [PATCH 11/15] Avoid concurrent calls when modifying git history --- src/history/persistor.js | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 6fc788a95..51b18a51b 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,6 +28,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } +let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -38,10 +39,17 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - await git.add(filePath); - - return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - return sha; + // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… + // …and others caller of `persistor.commit` will wait + await lock; + lock = new Promise(resolveLock => { + git.add(filePath).then(() => { + git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolveLock(sha); + }); + }); }); + + return lock; } From 3dcbbbdf91d732fe4bff2ee5692a610a9910fccb Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:39:29 +0200 Subject: [PATCH 12/15] Improve logs --- src/index.js | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/index.js b/src/index.js index 243c5e38c..85f7db7af 100644 --- a/src/index.js +++ b/src/index.js @@ -9,10 +9,17 @@ import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; import serviceProviders from './service_providers/index.js'; -export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { +export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { + console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); + + console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); await persistRaw(serviceProviderId, documentType, content); + + console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); + + console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); await persistSanitized(serviceProviderId, documentType, sanitizedContent); }; @@ -23,11 +30,11 @@ export default async function updateTerms() { const serviceProvidersManifests = serviceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { - const { documents } = serviceProvidersManifests[serviceProviderId]; + const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; Object.keys(documents).forEach(async (documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); From 5ad0bfd4978fe904dad65c388778806f13fc4eaa Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 17:37:58 +0200 Subject: [PATCH 13/15] Use async to manage concurrency on git modifications --- package-lock.json | 5 +++++ package.json | 1 + src/history/persistor.js | 29 ++++++++++++++++------------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/package-lock.json b/package-lock.json index 98be5417c..64b667ffd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -103,6 +103,11 @@ "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "dev": true }, + "async": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.0.tgz", + "integrity": "sha512-TR2mEZFVOj2pLStYxLht7TyfuRzaydfpxr3k9RpHIzMgw7A64dzsdqCxH1WJyQdoe8T10nDXd9wnEigmiuHIZw==" + }, "async-lock": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/async-lock/-/async-lock-1.2.4.tgz", diff --git a/package.json b/package.json index e26f4aef1..1289d262f 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "nock": "^12.0.3" }, "dependencies": { + "async": "^3.2.0", "console-stamp": "^0.2.9", "dotenv": "^8.2.0", "isomorphic-git": "^1.4.0", diff --git a/src/history/persistor.js b/src/history/persistor.js index 51b18a51b..32a0ee508 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -2,6 +2,13 @@ import path from 'path'; import fsApi from 'fs'; const fs = fsApi.promises; +import async from 'async'; + +const commitQueue = async.queue(_commit, 1); +commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { + reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); +}); + import * as git from './git.js'; const __dirname = path.dirname(new URL(import.meta.url).pathname); @@ -28,7 +35,6 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } -let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -39,17 +45,14 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… - // …and others caller of `persistor.commit` will wait - await lock; - lock = new Promise(resolveLock => { - git.add(filePath).then(() => { - git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - resolveLock(sha); - }); - }); - }); + return new Promise((resolve, reject) => { + commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); + }) +} - return lock; +async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { + await git.add(filePath) + const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolve(sha); } From 874266c90c8d3be7145ae57976409f3d6eeed252 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:16:59 +0200 Subject: [PATCH 14/15] Fix review comments --- src/history/persistor.js | 13 +++++++------ src/index.js | 16 ++++++++-------- src/index.test.js | 4 ++-- src/sanitizer/index.js | 2 +- src/service_providers/index.js | 10 +++++----- src/service_providers/index.test.js | 14 +++++++------- test/providers/first_provider.json | 2 +- test/providers/second_provider.json | 2 +- 8 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 32a0ee508..01b2f57b5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -6,7 +6,7 @@ import async from 'async'; const commitQueue = async.queue(_commit, 1); commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { - reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); + reject(new Error(`Could not commit ${policyType} for ${serviceProviderId} (${isSanitized ? 'sanitized' : 'raw'} version) due to error: ${err}`)); }); import * as git from './git.js'; @@ -25,8 +25,8 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; - if (!fsApi.existsSync(directory)){ - fsApi.mkdirSync(directory); + if (!fsApi.existsSync(directory)) { + await fs.mkdir(directory); } const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; @@ -38,6 +38,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' + // Git needs a path relative to the .git directory, not an absolute one const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); @@ -47,12 +48,12 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return new Promise((resolve, reject) => { commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); - }) + }); } async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { - await git.add(filePath) - const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + await git.add(filePath); + const sha = await git.commit(`Update ${isSanitized ? 'sanitized' : 'raw'} ${policyType} for ${serviceProviderId}`); console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); resolve(sha); } diff --git a/src/index.js b/src/index.js index 85f7db7af..9568a3098 100644 --- a/src/index.js +++ b/src/index.js @@ -7,36 +7,36 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; -import serviceProviders from './service_providers/index.js'; +import getServiceProviders from './service_providers/index.js'; export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); - await persistRaw(serviceProviderId, documentType, content); + persistRaw(serviceProviderId, documentType, content); console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); - await persistSanitized(serviceProviderId, documentType, sanitizedContent); + persistSanitized(serviceProviderId, documentType, sanitizedContent); }; export default async function updateTerms() { console.log('Start scraping and saving terms of service…'); - const promises = []; - const serviceProvidersManifests = serviceProviders(); + const documentUpdatePromises = []; + const serviceProvidersManifests = getServiceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; - Object.keys(documents).forEach(async (documentType) => { + Object.keys(documents).forEach((documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); + documentUpdatePromises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); - return Promise.all(promises); + return Promise.all(documentUpdatePromises); }; diff --git a/src/index.test.js b/src/index.test.js index c39f83bee..130d9ec24 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -23,10 +23,10 @@ const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECT const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.firstprovider.com').get('/tos') +nock('https://www.firstprovider.example').get('/tos') .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -nock('https://www.secondprovider.com').get('/tos') +nock('https://www.secondprovider.example').get('/tos') .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 884b225c9..35d3cd6b8 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + console.warn(`The provider selector "${selector}" has no match in the document.`) } } diff --git a/src/service_providers/index.js b/src/service_providers/index.js index f83cb7c93..5ecf355a9 100644 --- a/src/service_providers/index.js +++ b/src/service_providers/index.js @@ -1,13 +1,13 @@ -import dotenv from 'dotenv'; -dotenv.config(); - import fs from 'fs'; import path from 'path'; +import dotenv from 'dotenv'; +dotenv.config(); + const __dirname = path.dirname(new URL(import.meta.url).pathname); -const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) +const dirPath = path.resolve(__dirname, '../..', process.env.NODE_ENV === 'test' ? 'test' : '', 'providers'); -export default function serviceProviders() { +export default function getServiceProviders() { const result = {}; fs.readdirSync(dirPath).forEach((filename) => { diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js index 8972c0ce1..f1d8b51fc 100644 --- a/src/service_providers/index.test.js +++ b/src/service_providers/index.test.js @@ -3,16 +3,16 @@ import nock from 'nock'; import fs from 'fs'; import path from 'path'; -import serviceProviders from './index.js'; +import getServiceProviders from './index.js'; const expect = chai.expect; -const expectResult = { +const expected = { first_provider: { serviceProviderName: 'First Provider', documents: { tos: { - url: 'https://www.firstprovider.com/tos', + url: 'https://www.firstprovider.example/tos', contentSelector: 'main' } } @@ -21,7 +21,7 @@ const expectResult = { serviceProviderName: 'Second Provider', documents: { tos: { - url: 'https://www.secondprovider.com/tos', + url: 'https://www.secondprovider.example/tos', contentSelector: 'main' } } @@ -30,10 +30,10 @@ const expectResult = { } describe('ServiceProviders', () => { - describe('#serviceProviders', () => { + describe('#getServiceProviders', () => { it('returns an object with all service providers manifests', () => { - const result = serviceProviders(); - expect(result).to.deep.equal(expectResult); + const result = getServiceProviders(); + expect(result).to.deep.equal(expected); }); }); }); diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json index 7682ee091..90d6543ba 100644 --- a/test/providers/first_provider.json +++ b/test/providers/first_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "First Provider", "documents": { "tos": { - "url": "https://www.firstprovider.com/tos", + "url": "https://www.firstprovider.example/tos", "contentSelector": "main" } } diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json index d397066ad..b5b0f4c6a 100644 --- a/test/providers/second_provider.json +++ b/test/providers/second_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "Second Provider", "documents": { "tos": { - "url": "https://www.secondprovider.com/tos", + "url": "https://www.secondprovider.example/tos", "contentSelector": "main" } } From 814f79395d6962c594a92172d91b21f2342b1252 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:56:35 +0200 Subject: [PATCH 15/15] Wording --- src/sanitizer/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 35d3cd6b8..d0d49594c 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.warn(`The provider selector "${selector}" has no match in the document.`) + console.warn(`The provided selector "${selector}" has no match in the document.`) } }