Merge pull request #1099 from PublicMapping/test/mvm/change-topojson-serialization

Change topojson serialization format
PublicMapping · Jan 12, 2022 · 523d298 · 523d298
2 parents 676ecec + cb22be0
commit 523d298
Show file tree

Hide file tree

Showing 8 changed files with 320 additions and 78 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Update README with clearer instructions on email verification [#1082](https://github.com/PublicMapping/districtbuilder/pull/1082)
 - Update Node / Nest.JS / TypeORM / eslint / prettier [#1088](https://github.com/PublicMapping/districtbuilder/pull/1088)
 - Change page titles to be more descriptive [#1096](https://github.com/PublicMapping/districtbuilder/pull/1096)
+- Change serialization format for TopoJSON data [#1099](https://github.com/PublicMapping/districtbuilder/pull/1099)
 
 ### Fixed
 

diff --git a/src/manage/README.md b/src/manage/README.md
@@ -21,7 +21,7 @@ $ npm install -g manage
 $ manage COMMAND
 running command...
 $ manage (-v|--version|version)
-manage/0.1.0 linux-x64 node-v12.22.6
+manage/0.1.0 linux-x64 node-v12.22.8
 $ manage --help [COMMAND]
 USAGE
   $ manage COMMAND
@@ -35,6 +35,7 @@ USAGE
 * [`manage help [COMMAND]`](#manage-help-command)
 * [`manage process-geojson FILE`](#manage-process-geojson-file)
 * [`manage publish-region STATICDATADIR COUNTRYCODE REGIONCODE REGIONNAME`](#manage-publish-region-staticdatadir-countrycode-regioncode-regionname)
+* [`manage serialize-topojson`](#manage-serialize-topojson)
 * [`manage update-organization CONFIG`](#manage-update-organization-config)
 * [`manage update-region STATICDATADIR UPDATES3DIR`](#manage-update-region-staticdatadir-updates3dir)
 
@@ -118,43 +119,52 @@ USAGE
   $ manage process-geojson FILE
 
 OPTIONS
-  -b, --big                            Use this for big GeoJSON files (~1GB+) that need to be streamed
+  -b, --big
+      Use this for big GeoJSON files (~1GB+) that need to be streamed
 
-  -d, --demographics=demographics      [default: population,white,black,asian,hispanic,other] Comma-separated census
-                                       demographics to select and aggregate
-                                       To use a different name for the property from the GeoJSON property,
-                                       separate values by ':'
-                                       e.g. -l pop:population,wht:white,blk:black
+  -d, --demographics=demographics
+      [default: population,white,black,asian,hispanic,other] Comma-separated group of census demographics to select and 
+      aggregate
+             To use a different name for the property from the GeoJSON property, separate values by ':'
+             e.g. -d pop:population,wht:white,blk:black
 
-  -f, --filterPrefix=filterPrefix      Filter to only base geounits containing the specified prefix
+             The first value in the group will be used as population, and the remaining values will be displayed
+             as a percentage of that population.
 
-  -l, --levels=levels                  [default: block,blockgroup,county] Comma-separated geolevel hierarchy: smallest
-                                       to largest
-                                       To use a different name for the layer ID from the GeoJSON property,
-                                       separate values by ':'
-                                       e.g. -l geoid:block,blockgroupuuid:blockgroup,county
+             To create multiple groups, use the -d option once per group.
+             e.g. -d population,white,black,asian,hispanic,other -d "VAP,VAP White, VAP Black, VAP Asian, VAP Hispanic, 
+      VAP Other"
 
-  -n, --levelMinZoom=levelMinZoom      [default: 8,0,0] Comma-separated minimum zoom level per geolevel, must match # of
-                                       levels
+  -f, --filterPrefix=filterPrefix
+      Filter to only base geounits containing the specified prefix
 
-  -o, --outputDir=outputDir            [default: ./] Directory to output files
+  -l, --levels=levels
+      [default: block,blockgroup,county] Comma-separated geolevel hierarchy: smallest to largest
+             To use a different name for the layer ID from the GeoJSON property, separate values by ':'
+             e.g. -l geoid:block,blockgroupuuid:blockgroup,county
 
-  -q, --quantization=quantization      [default: 1e5] Topojson quantization transform, 0 to skip
+  -n, --levelMinZoom=levelMinZoom
+      [default: 8,0,0] Comma-separated minimum zoom level per geolevel, must match # of levels
 
-  -s, --simplification=simplification  [default: 0.0000000025] Topojson simplification amount (minWeight)
+  -o, --outputDir=outputDir
+      [default: ./] Directory to output files
 
-  -u, --inputS3Dir=inputS3Dir          S3 directory for the previous run if we will be updating in-place
+  -q, --quantization=quantization
+      [default: 1e5] Topojson quantization transform, 0 to skip
 
-  -v, --labels=labels                  [default: election:presidential 2016] Comma-separated list of label key-value
-                                       pairs, separated by ':'
+  -s, --simplification=simplification
+      [default: 0.0000000025] Topojson simplification amount (minWeight)
 
-  -v, --voting=voting                  Comma-separated election data to select and aggregate
-                                       To use a different name for the layer property from the GeoJSON property,
-                                       separate values by ':'
-                                       e.g. -v voterep:republican,votedem:democrat,voteoth:other
+  -u, --inputS3Dir=inputS3Dir
+      S3 directory for the previous run if we will be updating in-place
 
-  -x, --levelMaxZoom=levelMaxZoom      [default: g,g,g] Comma-separated maximum zoom level per geolevel, must match # of
-                                       levels
+  -v, --voting=voting
+      Comma-separated election data to select and aggregate
+             To use a different name for the layer property from the GeoJSON property, separate values by ':'
+             e.g. -v voterep:republican,votedem:democrat,voteoth:other
+
+  -x, --levelMaxZoom=levelMaxZoom
+      [default: g,g,g] Comma-separated maximum zoom level per geolevel, must match # of levels
 
 DESCRIPTION
   Note: this can be a very memory-intensive operation,
@@ -188,6 +198,19 @@ OPTIONS
   -b, --bucketName=bucketName  [default: global-districtbuilder-dev-us-east-1] Bucket to upload the files to
 ```
 
+## `manage serialize-topojson`
+
+reprocess topojson files into binary format
+
+```
+USAGE
+  $ manage serialize-topojson
+
+DESCRIPTION
+  Pass a list of s3_uri paths to reprocess, e.g.
+     serialize-topojson s3://bucket-name/regions/US/PA s3://other-bucket-name/regions/US/DE
+```
+
 ## `manage update-organization CONFIG`
 
 update or create organization information from a YAML configuration

diff --git a/src/manage/package.json b/src/manage/package.json
@@ -29,6 +29,7 @@
     "pg": "8.7.1",
     "recursive-readdir": "2.2.2",
     "rxjs": "7.4.0",
+    "streamifier": "0.1.1",
     "topojson-client": "3.1.0",
     "topojson-server": "3.0.1",
     "topojson-simplify": "3.0.3",
@@ -46,6 +47,7 @@
     "@types/lodash": "4.14.149",
     "@types/node": "16.11.7",
     "@types/recursive-readdir": "2.2.0",
+    "@types/streamifier": "^0.1.0",
     "@types/topojson": "3.2.2",
     "@typescript-eslint/eslint-plugin": "5.7.0",
     "@typescript-eslint/parser": "5.7.0",

diff --git a/src/manage/src/commands/process-geojson.ts b/src/manage/src/commands/process-geojson.ts
@@ -21,6 +21,8 @@ import { feature as topo2feature, mergeArcs, quantize } from "topojson-client";
 import { topology } from "topojson-server";
 import { planarTriangleArea, presimplify, simplify } from "topojson-simplify";
 import { GeometryCollection, GeometryObject, Objects, Topology } from "topojson-specification";
+import { serialize, deserialize } from "v8";
+
 import {
   TypedArray,
   GeoLevelInfo,
@@ -107,9 +109,9 @@ it when necessary (file sizes ~1GB+).
       To create multiple groups, use the -d option once per group.
       e.g. -d population,white,black,asian,hispanic,other -d "VAP,VAP White, VAP Black, VAP Asian, VAP Hispanic, VAP Other" 
       `,
-      default: "population,white,black,asian,hispanic,other",
+      default: ["population,white,black,asian,hispanic,other"],
       multiple: true
-    } as const),
+    }),
 
     voting: flags.string({
       char: "v",
@@ -170,9 +172,7 @@ it when necessary (file sizes ~1GB+).
     const votingIds = voting.map(([, id]) => id);
     const minZooms = flags.levelMinZoom.split(",");
     const maxZooms = flags.levelMaxZoom.split(",");
-    // Setting 'multiple: true' makes this return an array, but the inferred type didn't get the message
-    const demographicsFlags = flags.demographics as unknown as readonly string[];
-    const demographics = splitPairs(demographicsFlags.join(","));
+    const demographics = splitPairs(flags.demographics.join(","));
     const demographicIds = demographics.map(([, id]) => id);
     const simplification = parseFloat(flags.simplification);
     const quantization = parseFloat(flags.quantization);
@@ -250,7 +250,7 @@ it when necessary (file sizes ~1GB+).
       }
     }
 
-    await this.writeTopoJson(flags.outputDir, topoJsonHierarchy);
+    this.writeTopoJson(flags.outputDir, topoJsonHierarchy);
 
     this.addGeoLevelIndices(topoJsonHierarchy, geoLevelIds);
 
@@ -298,7 +298,7 @@ it when necessary (file sizes ~1GB+).
       votingMetaData,
       bbox,
       geoLevelHierarchyInfo,
-      this.getDemographicsGroups(demographicsFlags)
+      this.getDemographicsGroups(flags.demographics)
     );
   }
 
@@ -492,30 +492,50 @@ it when necessary (file sizes ~1GB+).
 
   // Reads a TopoJSON file from S3, given the S3 run directory
   async readTopoJsonFromS3(inputS3Dir: string): Promise<Topology<Objects<{}>>> {
+    const s3 = new S3();
     const uriComponents = inputS3Dir.split("/");
     const bucket = uriComponents[2];
-    const key = `${uriComponents.slice(3).join("/")}topo.json`;
-    const response: any = await new S3()
+    // Everything after the bucket name. File names are appended to this prefix
+    // directly, so the directory URI is expected to end with "/".
+    const keyPrefix = uriComponents.slice(3).join("/");
+
+    // Probe for the binary file. Only a "NotFound" error means it is absent;
+    // any other failure is rethrown to the caller.
+    const bufFileExists = await s3
+      .headObject({
+        Bucket: bucket,
+        Key: `${keyPrefix}topo.buf`
+      })
+      .promise()
+      .then(
+        () => true,
+        err => {
+          if (err.code === "NotFound") {
+            return false;
+          }
+          throw err;
+        }
+      );
+
+    // Use binary format if it exists, but fallback to text format otherwise
+    const key = bufFileExists ? `${keyPrefix}topo.buf` : `${keyPrefix}topo.json`;
+
+    const response: any = await s3
       .getObject({
         Bucket: bucket,
         Key: key
       })
       .promise();
 
-    // Note: we are not using streaming within the server when reading TopoJSON, so it hasn't been
-    // implemented that way here either. If we ever encounter a TopoJSON file that's large enough
-    // that it needs to be streamed, we'll need to convert both pieces of code appropriately.
-    return JSON.parse(response.Body.toString("utf8"));
+    // topo.buf holds v8 `serialize` output, so it is read back with v8 `deserialize`;
+    // topo.json is plain JSON text.
+    return bufFileExists
+      ? deserialize(response.Body as Buffer)
+      : JSON.parse(response.Body?.toString("utf8"));
   }
 
   // Write TopoJSON file to disk
-  writeTopoJson(dir: string, topology: Topology<Objects<{}>>): Promise<void> {
+  writeTopoJson(dir: string, topology: Topology<Objects<{}>>) {
     this.log("Writing topojson file");
-    const path = join(dir, "topo.json");
-    const output = createWriteStream(path, { encoding: "utf8" });
-    return new Promise(resolve =>
-      new JsonStreamStringify(topology).pipe(output).on("finish", () => resolve())
-    );
+    // The topology is stored as `topo.buf` in `dir`, in the format produced by v8 `serialize`
+    const path = join(dir, "topo.buf");
+    const output = createWriteStream(path, { encoding: "binary" });
+    // The serialized data is queued and the stream closed right away; nothing is
+    // returned, so the caller does not wait for the write to complete
+    output.write(serialize(topology));
+    output.close();
   }
 
   // Makes an appropriately-sized typed array containing the data

diff --git a/src/manage/src/commands/serialize-topojson.ts b/src/manage/src/commands/serialize-topojson.ts
@@ -0,0 +1,124 @@
+import { Command } from "@oclif/command";
+import S3 from "aws-sdk/clients/s3";
+import cli from "cli-ux";
+import { mapSync } from "event-stream";
+import { FeatureCollection, Polygon } from "geojson";
+import { parse } from "JSONStream";
+import { createReadStream } from "streamifier";
+import { Objects, Topology } from "topojson-specification";
+import { serialize } from "v8";
+
+/**
+ * CLI command that reads the text `topo.json` stored under each S3 directory
+ * given on the command line and uploads it again, to the same directory, as
+ * `topo.buf` in the binary format produced by v8 `serialize`.
+ */
+export default class SerializeTopojson extends Command {
+  static description = `reprocess topojson files into binary format
+  
+  Pass a list of s3_uri paths to reprocess, e.g.
+  serialize-topojson s3://bucket-name/regions/US/PA s3://other-bucket-name/regions/US/DE
+`;
+
+  // The S3 URIs arrive as a variable-length list of positional arguments
+  static strict = false;
+
+  // Streaming reader for GeoJSON files. Works on files over 512MB, but is slow.
+  // NOTE(review): nothing in this command calls this method, and the
+  // `createReadStream` imported in this file is streamifier's (which streams the
+  // value it is handed) rather than fs's, so `path` would presumably not be
+  // opened as a file. Confirm before relying on it.
+  async readBigJson(path: string): Promise<FeatureCollection<Polygon, {}>> {
+    return new Promise(resolve =>
+      createReadStream(path, { encoding: "utf8" })
+        .pipe(parse("features"))
+        .pipe(
+          mapSync((features: any) => {
+            resolve({ type: "FeatureCollection", features });
+          })
+        )
+    );
+  }
+
+  // Converts every S3 directory passed on the command line, one after another
+  async run(): Promise<void> {
+    const { argv } = this.parse(SerializeTopojson);
+
+    for (const s3URI of argv) {
+      cli.action.start(`Reading base TopoJSON: ${s3URI}`);
+      const baseTopojson = await this.readTopoJsonFromS3(s3URI);
+      cli.action.stop();
+
+      cli.action.start(`Uploading serialized TopoJSON: ${s3URI}`);
+      await this.writeTopoJsonToS3(s3URI, baseTopojson);
+      cli.action.stop();
+    }
+  }
+
+  // Splits an `s3://bucket/some/dir` URI into its bucket and the object key of
+  // `fileName` inside that directory. Trailing slashes on the directory are
+  // ignored, so `.../US/PA` and `.../US/PA/` address the same object.
+  private s3Location(s3Dir: string, fileName: string): { bucket: string; key: string } {
+    const uriComponents = s3Dir.split("/");
+    const bucket = uriComponents[2];
+    const prefix = uriComponents
+      .slice(3)
+      .join("/")
+      .replace(/\/+$/, "");
+    return { bucket, key: prefix === "" ? fileName : `${prefix}/${fileName}` };
+  }
+
+  // Streams `body` through JSONStream and resolves with the value of its
+  // top-level `member`, or with undefined when the document has no such member.
+  // Rejects on stream or parse errors.
+  private readMember(body: Buffer, member: string): Promise<unknown> {
+    return new Promise((resolve, reject) => {
+      const parser = parse(member);
+      parser.on("data", resolve);
+      // A promise settles only once, so this is a no-op when a value was found
+      parser.on("end", () => resolve(undefined));
+      parser.on("error", reject);
+      createReadStream(body)
+        .on("error", reject)
+        .pipe(parser);
+    });
+  }
+
+  // Reads a TopoJSON file from S3, given the S3 run directory
+  //
+  // The document is parsed with a streaming parser, one pass per top-level
+  // member, instead of a single JSON.parse of the whole body.
+  // Throws if the required "objects" or "arcs" members are absent; the optional
+  // "bbox" and "transform" members are simply left out when absent.
+  async readTopoJsonFromS3(inputS3Dir: string): Promise<Topology<Objects<{}>>> {
+    const { bucket, key } = this.s3Location(inputS3Dir, "topo.json");
+    const response: any = await new S3()
+      .getObject({
+        Bucket: bucket,
+        Key: key
+      })
+      .promise();
+    const body = response.Body as Buffer;
+
+    // Passes run one after another rather than concurrently
+    const objects = await this.readMember(body, "objects");
+    const arcs = await this.readMember(body, "arcs");
+    const bbox = await this.readMember(body, "bbox");
+    const transform = await this.readMember(body, "transform");
+
+    if (objects === undefined || arcs === undefined) {
+      throw new Error(`s3://${bucket}/${key} has no "objects" or "arcs" member`);
+    }
+
+    return {
+      type: "Topology",
+      ...(bbox !== undefined ? { bbox } : {}),
+      ...(transform !== undefined ? { transform } : {}),
+      objects,
+      arcs
+    } as Topology<Objects<{}>>;
+  }
+
+  // Write TopoJSON binary file to S3
+  //
+  // Uploads the v8-serialized topology as `topo.buf` in the given S3 directory
+  // and returns the promise of the upload.
+  writeTopoJsonToS3(inputS3Dir: string, topology: Topology<Objects<{}>>) {
+    this.log("Streaming topojson file");
+    const { bucket, key } = this.s3Location(inputS3Dir, "topo.buf");
+
+    const s3Client = new S3();
+    return s3Client
+      .upload({
+        Body: serialize(topology),
+        Bucket: bucket,
+        Key: key
+      })
+      .promise();
+  }
+}