Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Obtenir les CGUs de plusieurs fournisseurs #16

Merged
merged 15 commits into from Jun 12, 2020
5 changes: 5 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Expand Up @@ -29,6 +29,7 @@
"nock": "^12.0.3"
},
"dependencies": {
"async": "^3.2.0",
"console-stamp": "^0.2.9",
"dotenv": "^8.2.0",
"isomorphic-git": "^1.4.0",
Expand Down
9 changes: 9 additions & 0 deletions providers/facebook.json
@@ -0,0 +1,9 @@
{
"serviceProviderName": "Facebook",
"documents": {
"tos": {
"url": "https://www.facebook.com/legal/terms/plain_text_terms",
"contentSelector": ".UIFullPage_Container"
}
}
}
9 changes: 9 additions & 0 deletions providers/snapchat.json
@@ -0,0 +1,9 @@
{
"serviceProviderName": "Snapchat",
"documents": {
"tos": {
"url": "https://www.snap.com/terms/",
"contentSelector": ".textarticle-container"
}
}
}
9 changes: 9 additions & 0 deletions providers/twitter.json
@@ -0,0 +1,9 @@
{
"serviceProviderName": "Twitter",
"documents": {
"tos": {
"url": "https://twitter.com/tos",
"contentSelector": "main"
}
}
}
36 changes: 27 additions & 9 deletions src/history/persistor.js
Expand Up @@ -2,6 +2,13 @@ import path from 'path';
import fsApi from 'fs';
const fs = fsApi.promises;

import async from 'async';

// All commits are funnelled through one queue whose worker is `_commit`,
// created with a concurrency of 1 so that tasks are handled one at a time.
const commitQueue = async.queue(_commit, 1);

// When the worker fails on a task, reject the promise carried by that task
// with an error that names the document and the version being committed.
commitQueue.error((err, task) => {
  const { serviceProviderId, policyType, isSanitized, reject } = task;

  reject(new Error(`Could not commit ${policyType} for ${serviceProviderId} (${isSanitized ? 'sanitized' : 'raw'} version) due to error: ${err}`));
});

import * as git from './git.js';

const __dirname = path.dirname(new URL(import.meta.url).pathname);
Expand All @@ -16,26 +23,37 @@ export async function persist({ serviceProviderId, policyType, fileContent, isSa
}

export async function save({ serviceProviderId, policyType, fileContent, isSanitized }) {
const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`;
const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`;

if (!fsApi.existsSync(directory)){
fsApi.mkdirSync(directory);
if (!fsApi.existsSync(directory)) {
Ndpnt marked this conversation as resolved.
Show resolved Hide resolved
await fs.mkdir(directory);
}

return fs.writeFile(`${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`, fileContent);
const filePath = `${directory}/${policyType}.${isSanitized ? 'md' : 'html'}`;
return fs.writeFile(filePath, fileContent).then(() => {
console.log(`File ${filePath} saved.`)
});
}

export async function commit({ serviceProviderId, policyType, isSanitized }) {
const directory = `${isSanitized ? SANITIZED_DIRECTORY : RAW_DIRECTORY}/${serviceProviderId}`;
const fileExtension = isSanitized ? 'md' : 'html'
const filepath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`);
// Git needs a path relative to the .git directory, not an absolute one
const filePath = path.relative(path.resolve(__dirname, '../..'), `${directory}/${policyType}.${fileExtension}`);
MattiSG marked this conversation as resolved.
Show resolved Hide resolved

const status = await git.status(filepath);
if (!(status.includes('modified') || status.includes('added'))) {
const status = await git.status(filePath);
if (!status.match(/^\*?(modified|added)/)) {
MattiSG marked this conversation as resolved.
Show resolved Hide resolved
return;
}

await git.add(filepath);
return new Promise((resolve, reject) => {
commitQueue.push({ serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve, reject });
});
}

return git.commit(`${isSanitized ? 'Sanitized update' : 'Update'} for ${serviceProviderId} ${policyType} document`);
/**
 * Queue worker: stages one file, commits it, and hands the resulting commit
 * identifier back through the task's `resolve` callback.
 *
 * @param {object} task - Queued commit request.
 * @param {string} task.serviceProviderId - Identifier of the service provider.
 * @param {string} task.policyType - Type of the document being committed.
 * @param {boolean} task.isSanitized - Whether the sanitized (vs. raw) version is committed.
 * @param {string} task.fileExtension - Extension of the committed file, used for logging only.
 * @param {string} task.filePath - Path handed to `git.add`.
 * @param {Function} task.resolve - Called with the value returned by `git.commit`.
 */
async function _commit(task) {
  const { serviceProviderId, policyType, isSanitized, fileExtension, filePath, resolve } = task;

  await git.add(filePath);

  const message = `Update ${isSanitized ? 'sanitized' : 'raw'} ${policyType} for ${serviceProviderId}`;
  const sha = await git.commit(message);

  console.log(`Commit ID for document "${serviceProviderId}/${policyType}.${fileExtension}": ${sha}`);
  resolve(sha);
}
3 changes: 1 addition & 2 deletions src/history/persistor.test.js
Expand Up @@ -10,8 +10,7 @@ const POLICY_TYPE = 'terms_of_service';
const FILE_CONTENT = 'ToS fixture data with UTF-8 çhãràčtęrs';
const EXPECTED_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`;


describe('History', () => {
describe('Persistor', () => {
describe('#save', () => {
context('when service provider’s directory already exist', () => {
after(() => {
Expand Down
35 changes: 30 additions & 5 deletions src/index.js
Expand Up @@ -7,11 +7,36 @@ consoleStamp(console);
import scrape from './scraper/index.js';
import { persistRaw, persistSanitized } from './history/index.js';
import sanitize from './sanitizer/index.js';
import getServiceProviders from './service_providers/index.js';

/**
 * Scrapes one document of one service provider, then persists both its raw
 * content and its sanitized version.
 *
 * Both persistence steps are awaited, so the returned promise settles only
 * once they have completed, and a persistence failure rejects it instead of
 * surfacing as an unhandled rejection.
 *
 * @param {string} serviceProviderId - Identifier passed to the persistence layer.
 * @param {string} serviceProviderName - Human-readable name, used in log messages only.
 * @param {string} documentType - Type of the document (e.g. the manifest key such as `tos`).
 * @param {string} documentUrl - URL to scrape.
 * @param {string} documentContentSelector - Selector handed to `sanitize` along with the scraped content.
 * @returns {Promise<void>} Settles when the sanitized document has been persisted.
 */
export async function updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, documentUrl, documentContentSelector) {
  console.log(`${serviceProviderName}: Scrape '${documentUrl}'.`);
  const content = await scrape(documentUrl);

  console.log(`${serviceProviderName}: Persist raw document '${documentType}'.`);
  await persistRaw(serviceProviderId, documentType, content);

  console.log(`${serviceProviderName}: Sanitize raw document '${documentType}'.`);
  const sanitizedContent = await sanitize(content, documentContentSelector);

  console.log(`${serviceProviderName}: Persist sanitized document '${documentType}'.`);
  await persistSanitized(serviceProviderId, documentType, sanitizedContent);
}

export default async function updateTerms() {
console.log('Start scraping and saving terms of service…')
const content = await scrape('https://www.facebook.com/legal/terms/plain_text_terms');
await persistRaw('facebook', 'terms_of_service', content);
const sanitizedContent = await sanitize(content, '.UIFullPage_Container');
await persistSanitized('facebook', 'terms_of_service', sanitizedContent);
console.log('Start scraping and saving terms of service…');

const documentUpdatePromises = [];
const serviceProvidersManifests = getServiceProviders();

Object.keys(serviceProvidersManifests).forEach((serviceProviderId) => {
const { documents, serviceProviderName } = serviceProvidersManifests[serviceProviderId];

Object.keys(documents).forEach((documentType) => {
const { url, contentSelector } = documents[documentType];
documentUpdatePromises.push(updateServiceProviderDocument(serviceProviderId, serviceProviderName, documentType, url, contentSelector));
});
});

return Promise.all(documentUpdatePromises);
};
54 changes: 36 additions & 18 deletions src/index.test.js
Expand Up @@ -9,19 +9,25 @@ const expect = chai.expect;
import updateTerms from './index.js';
import { RAW_DIRECTORY, SANITIZED_DIRECTORY } from './history/persistor.js';

const facebookTermsHTML = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_raw.html'), { encoding: 'utf8' });
const facebookTermsMD = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/facebook_terms_sanitized.md'), { encoding: 'utf8' });
const FIRST_SERVICE_PROVIDER_ID = 'first_provider';
const FIRST_SERVICE_PROVIDER_POLICY_TYPE = 'tos';
const FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.html`;
const FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${FIRST_SERVICE_PROVIDER_ID}/${FIRST_SERVICE_PROVIDER_POLICY_TYPE}.md`;
const FIRST_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_raw.html'), { encoding: 'utf8' });
Ndpnt marked this conversation as resolved.
Show resolved Hide resolved
const FIRST_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/first_provider_terms_sanitized.md'), { encoding: 'utf8' });

nock('https://www.facebook.com', {
reqheaders: { 'Accept-Language': 'en' }
}).get('/legal/terms/plain_text_terms')
.reply(200, facebookTermsHTML);
const SECOND_SERVICE_PROVIDER_ID = 'second_provider';
const SECOND_SERVICE_PROVIDER_POLICY_TYPE = 'tos';
const SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.html`;
const SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SECOND_SERVICE_PROVIDER_ID}/${SECOND_SERVICE_PROVIDER_POLICY_TYPE}.md`;
const SECOND_SERVICE_PROVIDER_TOS_RAW = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_raw.html'), { encoding: 'utf8' });
const SECOND_SERVICE_PROVIDER_TOS_SANITIZED = fs.readFileSync(path.resolve(__dirname, '../test/fixtures/second_provider_terms_sanitized.md'), { encoding: 'utf8' });

const SERVICE_PROVIDER_ID = 'facebook';
const POLICY_TYPE = 'terms_of_service';
nock('https://www.firstprovider.example').get('/tos')
.reply(200, FIRST_SERVICE_PROVIDER_TOS_RAW);

const EXPECTED_RAW_FILE_PATH = `${RAW_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.html`;
const EXPECTED_SANITIZED_FILE_PATH = `${SANITIZED_DIRECTORY}/${SERVICE_PROVIDER_ID}/${POLICY_TYPE}.md`;
nock('https://www.secondprovider.example').get('/tos')
.reply(200, SECOND_SERVICE_PROVIDER_TOS_RAW);

describe('CGUs', () => {
describe('#updateTerms', () => {
Expand All @@ -30,18 +36,30 @@ describe('CGUs', () => {
});

after(() => {
fs.unlinkSync(EXPECTED_RAW_FILE_PATH);
fs.unlinkSync(EXPECTED_SANITIZED_FILE_PATH);
fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH);
fs.unlinkSync(FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH);
fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH);
fs.unlinkSync(SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH);
});

it('persists terms in raw format', async () => {
const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' });
expect(resultingRawTerms).to.be.equal(facebookTermsHTML);
it('persists terms in raw format for first service provider', () => {
const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' });
expect(resultingRawTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_RAW);
});

it('persists terms in sanitized format', async () => {
const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' });
expect(resultingSanitizedTerms).to.be.equal(facebookTermsMD);
it('persists terms in sanitized format for first service provider', () => {
const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, FIRST_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' });
expect(resultingSanitizedTerms).to.be.equal(FIRST_SERVICE_PROVIDER_TOS_SANITIZED);
});

it('persists terms in raw format for second service provider', async () => {
const resultingRawTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_RAW_FILE_PATH), { encoding: 'utf8' });
expect(resultingRawTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_RAW);
});

it('persists terms in sanitized format for second service provider', async () => {
const resultingSanitizedTerms = fs.readFileSync(path.resolve(__dirname, SECOND_SERVICE_PROVIDER_EXPECTED_SANITIZED_FILE_PATH), { encoding: 'utf8' });
expect(resultingSanitizedTerms).to.be.equal(SECOND_SERVICE_PROVIDER_TOS_SANITIZED);
});
});
});
8 changes: 7 additions & 1 deletion src/sanitizer/index.js
Expand Up @@ -9,7 +9,13 @@ export default async function sanitize(content, selector) {

if (selector) {
const { document } = new JSDOM(contentToSanitize).window;
contentToSanitize = document.querySelector(selector);
const selectedContent = document.querySelector(selector);

if (selectedContent) {
contentToSanitize = selectedContent;
} else {
console.warn(`The provider selector "${selector}" has no match in the document.`)
Ndpnt marked this conversation as resolved.
Show resolved Hide resolved
}
}

const markdown = turndownService.turndown(contentToSanitize);
Expand Down
23 changes: 23 additions & 0 deletions src/service_providers/index.js
@@ -0,0 +1,23 @@
import fs from 'fs';
import path from 'path';
Ndpnt marked this conversation as resolved.
Show resolved Hide resolved

import dotenv from 'dotenv';
dotenv.config();

const __dirname = path.dirname(new URL(import.meta.url).pathname);
const dirPath = path.resolve(__dirname, '../..', process.env.NODE_ENV === 'test' ? 'test' : '', 'providers');

/**
 * Loads every service provider manifest found in the providers directory.
 *
 * Only visible `*.json` files are read: hidden files (such as `.DS_Store`),
 * files with another extension and sub-directories are ignored, so that a
 * stray entry in the directory cannot make manifest loading crash.
 *
 * @returns {Object<string, object>} Parsed manifests, keyed by service
 *   provider id (the manifest file name without its `.json` extension).
 * @throws {SyntaxError} If a manifest file does not contain valid JSON.
 */
export default function getServiceProviders() {
  const result = {};

  for (const filename of fs.readdirSync(dirPath)) {
    const isHidden = filename.startsWith('.'); // e.g. .DS_Store
    const isManifest = path.extname(filename) === '.json';

    if (isHidden || !isManifest) {
      continue;
    }

    const serviceProviderId = path.basename(filename, '.json');
    result[serviceProviderId] = JSON.parse(fs.readFileSync(path.join(dirPath, filename), 'utf8'));
  }

  return result;
}
39 changes: 39 additions & 0 deletions src/service_providers/index.test.js
@@ -0,0 +1,39 @@
import chai from 'chai';
import nock from 'nock';
import fs from 'fs';
import path from 'path';

import getServiceProviders from './index.js';

const expect = chai.expect;

// Manifests that getServiceProviders() is expected to return, keyed by service provider id.
// NOTE(review): presumably mirrors the manifest files of the test providers directory — confirm against those fixtures.
const expected = {
first_provider: {
serviceProviderName: 'First Provider',
documents: {
tos: {
url: 'https://www.firstprovider.example/tos',
contentSelector: 'main'
}
}
},
second_provider: {
serviceProviderName: 'Second Provider',
documents: {
tos: {
url: 'https://www.secondprovider.example/tos',
contentSelector: 'main'
}
}
}

}

// Checks that getServiceProviders() returns exactly the manifests described by `expected`.
describe('ServiceProviders', () => {
describe('#getServiceProviders', () => {
it('returns an object with all service providers manifests', () => {
const result = getServiceProviders();
// Deep equality: the result must match `expected` in both structure and values.
expect(result).to.deep.equal(expected);
});
});
});
14 changes: 14 additions & 0 deletions test/fixtures/first_provider_terms_raw.html
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>First provider TOS</title>
</head>
<body>
<h1>Terms of service</h1>
<p>Dapibus quis diam sagittis vitae orci fames fusce porta metus risus, gravida pharetra cras rhoncus nullam tempor purus mattis sit condimentum senectus, a sodales varius in natoque molestie tortor velit quisque. Parturient sit facilisis risus dictumst quisque penatibus donec, mollis ultrices nibh viverra consequat porttitor dignissim laoreet, auctor fermentum himenaeos ut fusce volutpat. Dignissim nulla penatibus lacus at augue sem hendrerit sit, et potenti cum aptent ultricies aliquam habitant, porta lobortis cubilia primis sociis nascetur etiam. Torquent duis facilisis aliquam est purus curae quam sit porta, augue netus laoreet per pharetra volutpat lectus sem commodo, senectus primis arcu suspendisse id vivamus consequat sapien. Litora eros hendrerit commodo eleifend scelerisque risus in fusce porttitor cubilia ultrices porta sed blandit cras condimentum quisque, class accumsan odio enim magna tempus facilisis auctor imperdiet fringilla arcu tristique volutpat faucibus molestie velit. Per quam tristique tortor viverra tempus class varius condimentum, curabitur tellus cras fusce elementum velit cubilia, hendrerit orci vestibulum libero in enim gravida. Eget praesent dictumst lobortis pretium dignissim at viverra tempus fusce potenti, in morbi nascetur purus natoque sagittis phasellus urna sit commodo, ad tortor hendrerit ante mauris venenatis dolor consequat donec.
<hr>
<p>Suscipit sed eleifend sodales parturient interdum primis vel accumsan, curabitur sem blandit ut pretium sociosqu nam integer vehicula, amet lacinia augue commodo neque sollicitudin duis quam, malesuada cubilia phasellus gravida ad nisl eros. Curae consectetur justo tortor hendrerit morbi habitasse vel eros feugiat per, tristique ante nostra condimentum facilisi enim neque dui taciti dolor, placerat elit molestie luctus etiam magna gravida sed ad. Senectus habitasse sem commodo consectetur litora tempor leo curabitur gravida vestibulum, suscipit tortor viverra pellentesque rhoncus tristique porttitor convallis. Sodales morbi dictumst ut dolor vivamus elit velit orci non tristique nullam, nunc class scelerisque aenean facilisis urna suscipit etiam lobortis purus tortor commodo, duis vel pellentesque taciti aliquam id adipiscing dictum placerat et. At interdum quis eget ut curabitur fusce aliquet commodo suscipit, platea metus libero tortor eros justo in odio ligula fames, gravida placerat augue amet posuere massa fermentum magna. Enim quisque suspendisse accumsan odio ipsum taciti habitasse viverra, venenatis aliquet ac augue auctor nibh cras dignissim magna, convallis varius in sed quam morbi pretium. Habitasse nunc arcu eleifend purus aliquam lobortis morbi porta netus dignissim nisl, vitae class placerat amet sapien at vivamus et dui. Purus risus tellus netus sodales molestie leo nullam nunc, elit curae integer odio nostra ut vitae malesuada, dictum mus justo commodo curabitur placerat luctus. Dui ultrices auctor ultricies fringilla tortor ligula aptent, lectus pulvinar orci nascetur aenean a proin curabitur, ad felis nam nostra nec blandit.</p>
<p>Dui tristique interdum commodo urna blandit fames tortor posuere dictum fermentum sociis, hendrerit ut libero facilisis id morbi habitant iaculis feugiat. Gravida metus netus ac malesuada vel tristique potenti augue cubilia elit, pellentesque suscipit posuere sapien tempus in praesent torquent dapibus, sit viverra iaculis ultrices aliquet mauris non quisque vitae. Senectus gravida a in litora taciti per ullamcorper nascetur, quis auctor adipiscing nostra cras lacinia aliquet, curae id montes vel molestie placerat condimentum. Convallis fermentum eros sagittis viverra curae risus, aliquet lorem dui magna urna suscipit, id condimentum faucibus molestie ridiculus. Mollis enim etiam taciti diam at pharetra sagittis ligula euismod, ridiculus et sociosqu mauris integer lacinia elementum tortor. In sem nibh cubilia curae facilisi mauris massa, facilisis rhoncus velit phasellus etiam cum penatibus neque, tortor vulputate felis est litora mi.</p></p>
</body>
</html>