From 2b77006c96e17575a7e0157fb50fba298e0d23b9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:06 +0200 Subject: [PATCH 01/15] Log a warning if selector has no match --- src/sanitizer/index.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index bcec96dbd..884b225c9 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -9,7 +9,13 @@ export default async function sanitize(content, selector) { if (selector) { const { document } = new JSDOM(contentToSanitize).window; - contentToSanitize = document.querySelector(selector); + const selectedContent = document.querySelector(selector); + + if (selectedContent) { + contentToSanitize = selectedContent; + } else { + console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + } } const markdown = turndownService.turndown(contentToSanitize); From fec4056de68f9a8fca054836b6ac9723226ad4a3 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:22:34 +0200 Subject: [PATCH 02/15] Fix naming --- src/history/persistor.test.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/history/persistor.test.js b/src/history/persistor.test.js index 332044800..f3a482459 100644 --- a/src/history/persistor.test.js +++ b/src/history/persistor.test.js @@ -10,8 +10,7 @@ const POLICY_TYPE = 'terms_of_service'; const FILE_CONTENT = 'ToS fixture data with UTF-8 çhãràčtęrs'; const EXPECTED_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; - -describe('History', () => { +describe('Persistor', () => { describe('#save', () => { context('when service provider’s directory already exist', () => { after(() => { From e41ff611a1d6fac61bf9fd22f31b112f94aafa9a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:09 +0200 Subject: [PATCH 03/15] Fix whitespaces --- src/history/persistor.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index faa254ff7..0b1f23fc5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -16,9 +16,9 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa } export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { -const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; + const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; -if (!fsApi.existsSync(directory)){ + if (!fsApi.existsSync(directory)){ fsApi.mkdirSync(directory); } From aaad4f84facb03f7cbdcc7f1c8cc497525e2f3e4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:32 +0200 Subject: [PATCH 04/15] Improve var naming --- src/history/persistor.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 0b1f23fc5..d32d010be 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,14 +28,14 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' - const filepath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); + const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); - const status = await git.status(filepath); + const status = await git.status(filePath); if (!(status.includes('modified') || status.includes('added'))) { return; } - await git.add(filepath); + await git.add(filePath); return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); } From d1845148be378ced9fa537b4dc0205b355fdde21 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:23:41 +0200 Subject: [PATCH 05/15] Add logs --- src/history/persistor.js | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index d32d010be..9e2a4f75a 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -22,7 +22,10 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit fsApi.mkdirSync(directory); } - return fs.writeFile(`${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`, fileContent); + const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; + return fs.writeFile(filePath, fileContent).then(() => { + console.log(`File ${filePath} saved.`) + }); } export async function commit({ serviceProviderId, policyType, isSanitized }) { @@ -37,5 +40,8 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`); + return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + return sha; + }); } From 603bc1da4f0f4dc88844585cf5df7dc5937f2f48 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:24:13 +0200 Subject: [PATCH 06/15] List all service providers --- src/service_providers/index.js | 23 +++++++++++++++++ src/service_providers/index.test.js | 39 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 src/service_providers/index.js create mode 100644 src/service_providers/index.test.js diff --git a/src/service_providers/index.js b/src/service_providers/index.js new file mode 100644 index 000000000..f83cb7c93 --- /dev/null +++ b/src/service_providers/index.js @@ -0,0 +1,23 @@ +import dotenv from 'dotenv'; +dotenv.config(); + +import fs from 'fs'; +import path from 'path'; + +const __dirname = path.dirname(new URL(import.meta.url).pathname); +const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) + +export default function serviceProviders() { + const result = {}; + + fs.readdirSync(dirPath).forEach((filename) => { + const serviceProviderId = path.basename(filename, '.json'); + if (serviceProviderId.indexOf('.') === 0) { + return; // ignore invisible files such as .DS_Store + } + + result[serviceProviderId] = JSON.parse(fs.readFileSync(path.join(dirPath, filename))); + }); + + return result; +} diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js new file mode 100644 index 000000000..8972c0ce1 --- /dev/null +++ b/src/service_providers/index.test.js @@ -0,0 +1,39 @@ +import chai from 'chai'; +import nock from 'nock'; +import fs from 'fs'; +import path from 'path'; + +import serviceProviders from './index.js'; + +const expect = chai.expect; + +const expectResult = { + first_provider: { + serviceProviderName: 'First Provider', + documents: { + tos: { + url: 'https://www.firstprovider.com/tos', + contentSelector: 'main' + } + } + }, + second_provider: { + serviceProviderName: 'Second Provider', + documents: { + tos: { + url: 'https://www.secondprovider.com/tos', + contentSelector: 'main' + } + } + } + +} + +describe('ServiceProviders', () => { + describe('#serviceProviders', () => { + it('returns an object with all service providers manifests', () => { + const result = serviceProviders(); + expect(result).to.deep.equal(expectResult); + }); + }); +}); From 2047637dfb09f4d5c60c9ed3cddde6e5a279bcac Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 07/15] Add Facebook, Twitter and Snapchat manifests --- providers/facebook.json | 9 +++++++++ providers/snapchat.json | 9 +++++++++ providers/twitter.json | 9 +++++++++ 3 files changed, 27 insertions(+) create mode 100644 providers/facebook.json create mode 100644 providers/snapchat.json create mode 100644 providers/twitter.json diff --git a/providers/facebook.json b/providers/facebook.json new file mode 100644 index 000000000..2818e2554 --- /dev/null +++ b/providers/facebook.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Facebook", + "documents": { + "tos": { + "url": "https://www.facebook.com/legal/terms/plain_text_terms", + "contentSelector": ".UIFullPage_Container" + } + } +} diff --git a/providers/snapchat.json b/providers/snapchat.json new file mode 100644 index 000000000..c446c8dd1 --- /dev/null +++ b/providers/snapchat.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Snapchat", + "documents": { + "tos": { + "url": "https://www.snap.com/terms/", + "contentSelector": ".textarticle-container" + } + } +} diff --git a/providers/twitter.json b/providers/twitter.json new file mode 100644 index 000000000..ba6132b59 --- /dev/null +++ b/providers/twitter.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Twitter", + "documents": { + "tos": { + "url": "https://twitter.com/tos", + "contentSelector": "main" + } + } +} From fe3063860707750f69d4d9db659475ddc129fea9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:25:52 +0200 Subject: [PATCH 08/15] Update terms for many service providers --- src/index.js | 26 +++++++-- src/index.test.js | 54 ++++++++++++------- test/fixtures/first_provider_terms_raw.html | 14 +++++ .../first_provider_terms_sanitized.md | 12 +++++ test/fixtures/second_provider_terms_raw.html | 14 +++++ .../second_provider_terms_sanitized.md | 12 +++++ test/providers/first_provider.json | 9 ++++ test/providers/second_provider.json | 9 ++++ 8 files changed, 128 insertions(+), 22 deletions(-) create mode 100644 test/fixtures/first_provider_terms_raw.html create mode 100644 test/fixtures/first_provider_terms_sanitized.md create mode 100644 test/fixtures/second_provider_terms_raw.html create mode 100644 test/fixtures/second_provider_terms_sanitized.md create mode 100644 test/providers/first_provider.json create mode 100644 test/providers/second_provider.json diff --git a/src/index.js b/src/index.js index e655523ec..1d1b132da 100644 --- a/src/index.js +++ b/src/index.js @@ -7,11 +7,29 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; +import serviceProviders from './service_providers/index.js'; + +export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { + const content = await scrape(documentUrl); + await persistRaw(serviceProviderId, documentType, content); + const sanitizedContent = await sanitize(content, documentContentSelector); + await persistSanitized(serviceProviderId, documentType, sanitizedContent); +}; export default async function updateTerms() { console.log('Start scraping and saving terms of service…') - const content = await scrape('https://www.facebook.com/legal/terms/plain_text_terms'); - await persistRaw('facebook', 'terms_of_service', content); - const sanitizedContent = await sanitize(content, '.UIFullPage_Container'); - await persistSanitized('facebook', 'terms_of_service', sanitizedContent); + + const promises = []; + const serviceProvidersManifests = serviceProviders(); + + Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { + const { documents } = serviceProvidersManifests[serviceProviderId]; + + Object.keys(documents).forEach(async (documentType) => { + const { url, contentSelector } = documents[documentType]; + promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + }); + }); + + return Promise.all(promises); }; diff --git a/src/index.test.js b/src/index.test.js index be57465a9..c39f83bee 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -9,19 +9,25 @@ const expect = chai.expect; import updateTerms from './index.js'; import { RAW_DIRECTORY, SANITIZED_DIRECTORY } from './history/persistor.js'; -const facebookTermsHTML = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_raw.html'), { encoding: 'utf8' }); -const facebookTermsMD = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_sanitized.md'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_ID = 'first_provider'; +const FIRST_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const FIRST_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_raw.html'), { encoding: 'utf8' }); +const FIRST_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.facebook.com', { - reqheaders: { 'Accept-Language': 'en' } - }).get('/legal/terms/plain_text_terms') - .reply(200, facebookTermsHTML); +const SECOND_SERVICE_PROVIDER_ID = 'second_provider'; +const SECOND_SERVICE_PROVIDER_POLICY_TYPE = 'tos'; +const SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.html`; +const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.md`; +const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); +const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -const SERVICE_PROVIDER_ID = 'facebook'; -const POLICY_TYPE = 'terms_of_service'; +nock('https://www.firstprovider.com').get('/tos') + .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -const EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`; -const EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.md`; +nock('https://www.secondprovider.com').get('/tos') + .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { describe('#updateTerms', () => { @@ -30,18 +36,30 @@ describe('CGUs', () => { }); after(() => { - fs.unlinkSync(EXPECTED_RAW_FILE_PATH); - fs.unlinkSync(EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH); + fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH); }); - it('persists terms in raw format', async () => { - const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); - expect(resultingRawTerms).to.be.equal(facebookTermsHTML); + it('persists terms in raw format for first service provider', () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_RAW); }); - it('persists terms in sanitized format', async () => { - const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); - expect(resultingSanitizedTerms).to.be.equal(facebookTermsMD); + it('persists terms in sanitized format for first service provider', () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_SANITIZED); + }); + + it('persists terms in raw format for second service provider', async () => { + const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' }); + expect(resultingRawTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_RAW); + }); + + it('persists terms in sanitized format for second service provider', async () => { + const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' }); + expect(resultingSanitizedTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_SANITIZED); }); }); }); diff --git a/test/fixtures/first_provider_terms_raw.html b/test/fixtures/first_provider_terms_raw.html new file mode 100644 index 000000000..4971e89f1 --- /dev/null +++ b/test/fixtures/first_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + First provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/first_provider_terms_sanitized.md b/test/fixtures/first_provider_terms_sanitized.md new file mode 100644 index 000000000..967f45a0d --- /dev/null +++ b/test/fixtures/first_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + First provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/fixtures/second_provider_terms_raw.html b/test/fixtures/second_provider_terms_raw.html new file mode 100644 index 000000000..d76baf758 --- /dev/null +++ b/test/fixtures/second_provider_terms_raw.html @@ -0,0 +1,14 @@ + + + + + Second provider TOS + + +

Terms of service

+

Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. +


+

Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.

+

Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.

+ + diff --git a/test/fixtures/second_provider_terms_sanitized.md b/test/fixtures/second_provider_terms_sanitized.md new file mode 100644 index 000000000..8aef98bc9 --- /dev/null +++ b/test/fixtures/second_provider_terms_sanitized.md @@ -0,0 +1,12 @@ + Second provider TOS + +Terms of service +================ + +Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec. + +* * * + +Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit. + +Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi. \ No newline at end of file diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json new file mode 100644 index 000000000..7682ee091 --- /dev/null +++ b/test/providers/first_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "First Provider", + "documents": { + "tos": { + "url": "https://www.firstprovider.com/tos", + "contentSelector": "main" + } + } +} diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json new file mode 100644 index 000000000..d397066ad --- /dev/null +++ b/test/providers/second_provider.json @@ -0,0 +1,9 @@ +{ + "serviceProviderName": "Second Provider", + "documents": { + "tos": { + "url": "https://www.secondprovider.com/tos", + "contentSelector": "main" + } + } +} From 903f3f9434fc0fe9a5566c010f2f86f92d819277 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:34 +0200 Subject: [PATCH 09/15] Improve logs --- src/history/persistor.js | 2 +- src/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 9e2a4f75a..3ed5b5b4f 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -40,7 +40,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { await git.add(filePath); - return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`).then((sha) => { + return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); return sha; }); diff --git a/src/index.js b/src/index.js index 1d1b132da..243c5e38c 100644 --- a/src/index.js +++ b/src/index.js @@ -17,7 +17,7 @@ export async function updateServiceProviderDocument(serviceProviderId, documentT }; export default async function updateTerms() { - console.log('Start scraping and saving terms of service…') + console.log('Start scraping and saving terms of service…'); const promises = []; const serviceProvidersManifests = serviceProviders(); From d0a4df65fc53351ab1f9d624663b7518ada7f6b6 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 11:56:58 +0200 Subject: [PATCH 10/15] Prevent `unmodified` files to be committed --- src/history/persistor.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 3ed5b5b4f..6fc788a95 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -34,7 +34,7 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); - if (!(status.includes('modified') || status.includes('added'))) { + if (!status.match(/^\*?(modified|added)/)) { return; } From 763477ae110dee92742e2bd95683fab7e388c34e Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:17:35 +0200 Subject: [PATCH 11/15] Avoid concurrent calls when modifying git history --- src/history/persistor.js | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 6fc788a95..51b18a51b 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -28,6 +28,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } +let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -38,10 +39,17 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - await git.add(filePath); - - return git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - return sha; + // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… + // …and others caller of `persistor.commit` will wait + await lock; + lock = new Promise(resolveLock => { + git.add(filePath).then(() => { + git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolveLock(sha); + }); + }); }); + + return lock; } From d2362fbfd4ffe791035e6217c8848ffd07569439 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 16:39:29 +0200 Subject: [PATCH 12/15] Improve logs --- src/index.js | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/index.js b/src/index.js index 243c5e38c..85f7db7af 100644 --- a/src/index.js +++ b/src/index.js @@ -9,10 +9,17 @@ import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; import serviceProviders from './service_providers/index.js'; -export async function updateServiceProviderDocument(serviceProviderId, documentType, documentUrl, documentContentSelector) { +export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { + console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); + + console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); await persistRaw(serviceProviderId, documentType, content); + + console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); + + console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); await persistSanitized(serviceProviderId, documentType, sanitizedContent); }; @@ -23,11 +30,11 @@ export default async function updateTerms() { const serviceProvidersManifests = serviceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { - const { documents } = serviceProvidersManifests[serviceProviderId]; + const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; Object.keys(documents).forEach(async (documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, documentType, url, contentSelector)); + promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); From 172f5e9c12f01fd9dec855e9e3090027c9210593 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 11 Jun 2020 17:37:58 +0200 Subject: [PATCH 13/15] Use async to manage concurrency on git modifications --- package-lock.json | 5 +++++ package.json | 1 + src/history/persistor.js | 29 ++++++++++++++++------------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/package-lock.json b/package-lock.json index 98be5417c..64b667ffd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -103,6 +103,11 @@ "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "dev": true }, + "async": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.0.tgz", + "integrity": "sha512-TR2mEZFVOj2pLStYxLht7TyfuRzaydfpxr3k9RpHIzMgw7A64dzsdqCxH1WJyQdoe8T10nDXd9wnEigmiuHIZw==" + }, "async-lock": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/async-lock/-/async-lock-1.2.4.tgz", diff --git a/package.json b/package.json index e26f4aef1..1289d262f 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "nock": "^12.0.3" }, "dependencies": { + "async": "^3.2.0", "console-stamp": "^0.2.9", "dotenv": "^8.2.0", "isomorphic-git": "^1.4.0", diff --git a/src/history/persistor.js b/src/history/persistor.js index 51b18a51b..32a0ee508 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -2,6 +2,13 @@ import path from 'path'; import fsApi from 'fs'; const fs = fsApi.promises; +import async from 'async'; + +const commitQueue = async.queue(_commit, 1); +commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { + reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); +}); + import * as git from './git.js'; const __dirname = path.dirname(new URL(import.meta.url).pathname); @@ -28,7 +35,6 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit }); } -let lock = Promise.resolve('Initial Promise'); export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' @@ -39,17 +45,14 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return; } - // Ensure asynchronous functions `git.add` and `git.commit` will always be called in sequence… - // …and others caller of `persistor.commit` will wait - await lock; - lock = new Promise(resolveLock => { - git.add(filePath).then(() => { - git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`).then((sha) => { - console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); - resolveLock(sha); - }); - }); - }); + return new Promise((resolve, reject) => { + commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); + }) +} - return lock; +async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { + await git.add(filePath) + const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); + resolve(sha); } From 9fe17d51b3d73a1d11b39dc58c6c9fc1a145363a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:16:59 +0200 Subject: [PATCH 14/15] Fix review comments --- src/history/persistor.js | 13 +++++++------ src/index.js | 16 ++++++++-------- src/index.test.js | 4 ++-- src/sanitizer/index.js | 2 +- src/service_providers/index.js | 10 +++++----- src/service_providers/index.test.js | 14 +++++++------- test/providers/first_provider.json | 2 +- test/providers/second_provider.json | 2 +- 8 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/history/persistor.js b/src/history/persistor.js index 32a0ee508..01b2f57b5 100644 --- a/src/history/persistor.js +++ b/src/history/persistor.js @@ -6,7 +6,7 @@ import async from 'async'; const commitQueue = async.queue(_commit, 1); commitQueue.error((err, { serviceProviderId, policyType, isSanitized, reject }) => { - reject(new Error(`_commit for ${serviceProviderId}${isSanitized ? ' sanitized ' : ' '}document ${policyType} experienced an error: ${err}`)); + reject(new Error(`Could not commit ${policyType} for ${serviceProviderId} (${isSanitized ? 'sanitized' : 'raw'} version) due to error: ${err}`)); }); import * as git from './git.js'; @@ -25,8 +25,8 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; - if (!fsApi.existsSync(directory)){ - fsApi.mkdirSync(directory); + if (!fsApi.existsSync(directory)) { + await fs.mkdir(directory); } const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`; @@ -38,6 +38,7 @@ export async function save({ serviceProviderId, policyType, fileContent, isSanit export async function commit({ serviceProviderId, policyType, isSanitized }) { const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`; const fileExtension = isSanitized ? 'md' : 'html' + // Git needs a path relative to the .git directory, not an absolute one const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`); const status = await git.status(filePath); @@ -47,12 +48,12 @@ export async function commit({ serviceProviderId, policyType, isSanitized }) { return new Promise((resolve, reject) => { commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject }); - }) + }); } async function _commit({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve }) { - await git.add(filePath) - const sha = await git.commit(`${isSanitized ? 'Update sanitized' : 'Update'} ${serviceProviderId} ${policyType} document`); + await git.add(filePath); + const sha = await git.commit(`Update ${isSanitized ? 'sanitized' : 'raw'} ${policyType} for ${serviceProviderId}`); console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`); resolve(sha); } diff --git a/src/index.js b/src/index.js index 85f7db7af..9568a3098 100644 --- a/src/index.js +++ b/src/index.js @@ -7,36 +7,36 @@ consoleStamp(console); import scrape from './scraper/index.js'; import { persistRaw, persistSanitized } from './history/index.js'; import sanitize from './sanitizer/index.js'; -import serviceProviders from './service_providers/index.js'; +import getServiceProviders from './service_providers/index.js'; export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) { console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`); const content = await scrape(documentUrl); console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`); - await persistRaw(serviceProviderId, documentType, content); + persistRaw(serviceProviderId, documentType, content); console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`); const sanitizedContent = await sanitize(content, documentContentSelector); console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`); - await persistSanitized(serviceProviderId, documentType, sanitizedContent); + persistSanitized(serviceProviderId, documentType, sanitizedContent); }; export default async function updateTerms() { console.log('Start scraping and saving terms of service…'); - const promises = []; - const serviceProvidersManifests = serviceProviders(); + const documentUpdatePromises = []; + const serviceProvidersManifests = getServiceProviders(); Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => { const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId]; - Object.keys(documents).forEach(async (documentType) => { + Object.keys(documents).forEach((documentType) => { const { url, contentSelector } = documents[documentType]; - promises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); + documentUpdatePromises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector)); }); }); - return Promise.all(promises); + return Promise.all(documentUpdatePromises); }; diff --git a/src/index.test.js b/src/index.test.js index c39f83bee..130d9ec24 100644 --- a/src/index.test.js +++ b/src/index.test.js @@ -23,10 +23,10 @@ const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECT const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' }); const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' }); -nock('https://www.firstprovider.com').get('/tos') +nock('https://www.firstprovider.example').get('/tos') .reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW); -nock('https://www.secondprovider.com').get('/tos') +nock('https://www.secondprovider.example').get('/tos') .reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW); describe('CGUs', () => { diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 884b225c9..35d3cd6b8 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.log(`Warning: the provider selector "${selector}" has no match in the document.`) + console.warn(`The provider selector "${selector}" has no match in the document.`) } } diff --git a/src/service_providers/index.js b/src/service_providers/index.js index f83cb7c93..5ecf355a9 100644 --- a/src/service_providers/index.js +++ b/src/service_providers/index.js @@ -1,13 +1,13 @@ -import dotenv from 'dotenv'; -dotenv.config(); - import fs from 'fs'; import path from 'path'; +import dotenv from 'dotenv'; +dotenv.config(); + const __dirname = path.dirname(new URL(import.meta.url).pathname); -const dirPath = path.resolve(__dirname, `../..${process.env.NODE_ENV === 'test' ? '/test/' : '/'}providers`) +const dirPath = path.resolve(__dirname, '../..', process.env.NODE_ENV === 'test' ? 'test' : '', 'providers'); -export default function serviceProviders() { +export default function getServiceProviders() { const result = {}; fs.readdirSync(dirPath).forEach((filename) => { diff --git a/src/service_providers/index.test.js b/src/service_providers/index.test.js index 8972c0ce1..f1d8b51fc 100644 --- a/src/service_providers/index.test.js +++ b/src/service_providers/index.test.js @@ -3,16 +3,16 @@ import nock from 'nock'; import fs from 'fs'; import path from 'path'; -import serviceProviders from './index.js'; +import getServiceProviders from './index.js'; const expect = chai.expect; -const expectResult = { +const expected = { first_provider: { serviceProviderName: 'First Provider', documents: { tos: { - url: 'https://www.firstprovider.com/tos', + url: 'https://www.firstprovider.example/tos', contentSelector: 'main' } } @@ -21,7 +21,7 @@ const expectResult = { serviceProviderName: 'Second Provider', documents: { tos: { - url: 'https://www.secondprovider.com/tos', + url: 'https://www.secondprovider.example/tos', contentSelector: 'main' } } @@ -30,10 +30,10 @@ const expectResult = { } describe('ServiceProviders', () => { - describe('#serviceProviders', () => { + describe('#getServiceProviders', () => { it('returns an object with all service providers manifests', () => { - const result = serviceProviders(); - expect(result).to.deep.equal(expectResult); + const result = getServiceProviders(); + expect(result).to.deep.equal(expected); }); }); }); diff --git a/test/providers/first_provider.json b/test/providers/first_provider.json index 7682ee091..90d6543ba 100644 --- a/test/providers/first_provider.json +++ b/test/providers/first_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "First Provider", "documents": { "tos": { - "url": "https://www.firstprovider.com/tos", + "url": "https://www.firstprovider.example/tos", "contentSelector": "main" } } diff --git a/test/providers/second_provider.json b/test/providers/second_provider.json index d397066ad..b5b0f4c6a 100644 --- a/test/providers/second_provider.json +++ b/test/providers/second_provider.json @@ -2,7 +2,7 @@ "serviceProviderName": "Second Provider", "documents": { "tos": { - "url": "https://www.secondprovider.com/tos", + "url": "https://www.secondprovider.example/tos", "contentSelector": "main" } } From 896d6c08b87f95a1cca326f472bec8e0998e3dba Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Fri, 12 Jun 2020 11:56:35 +0200 Subject: [PATCH 15/15] Wording --- src/sanitizer/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sanitizer/index.js b/src/sanitizer/index.js index 35d3cd6b8..d0d49594c 100644 --- a/src/sanitizer/index.js +++ b/src/sanitizer/index.js @@ -14,7 +14,7 @@ export default async function sanitize(content, selector) { if (selectedContent) { contentToSanitize = selectedContent; } else { - console.warn(`The provider selector "${selector}" has no match in the document.`) + console.warn(`The provided selector "${selector}" has no match in the document.`) } }