Commit
shard pages state (on top of existing sharding for nodes)
pieh committed Dec 13, 2020
1 parent 4d1da68 commit 65f3f72
Showing 3 changed files with 236 additions and 15 deletions.
183 changes: 182 additions & 1 deletion packages/gatsby/src/redux/__tests__/index.js
@@ -1,11 +1,15 @@
const _ = require(`lodash`)
const path = require(`path`)
const v8 = require(`v8`)

const writeToCache = jest.spyOn(require(`../persist`), `writeToCache`)
const v8Serialize = jest.spyOn(v8, `serialize`)
const v8Deserialize = jest.spyOn(v8, `deserialize`)

const { saveState, store, readState } = require(`../index`)

const {
actions: { createPage },
actions: { createPage, createNode },
} = require(`../actions`)

const mockWrittenContent = new Map()
@@ -120,6 +124,9 @@ describe(`redux db`, () => {
}

beforeEach(() => {
store.dispatch({
type: `DELETE_CACHE`,
})
writeToCache.mockClear()
mockWrittenContent.clear()
})
@@ -166,4 +173,178 @@ describe(`redux db`, () => {

expect(mockWrittenContent.has(legacyLocation)).toBe(false)
})

describe(`Sharding`, () => {
afterAll(() => {
v8Serialize.mockRestore()
v8Deserialize.mockRestore()
})

// the size limit is set to 1.5 * 1024 * 1024 * 1024 bytes per shard
// simulating sizes for pages and nodes lets us check that the expected number of shards is created
// and that they are stitched back together correctly
// (a quick sanity check of that arithmetic follows the scenario list below)
const nodeShardsScenarios = [
{
numberOfNodes: 50,
simulatedNodeObjectSize: 5 * 1024 * 1024,
expectedNumberOfNodeShards: 1,
},
{
numberOfNodes: 5,
simulatedNodeObjectSize: 0.6 * 1024 * 1024 * 1024,
expectedNumberOfNodeShards: 3,
},
]
const pageShardsScenarios = [
{
numberOfPages: 50,
simulatedPageObjectSize: 10 * 1024 * 1024,
expectedNumberOfPageShards: 1,
},
{
numberOfPages: 5,
simulatedPageObjectSize: 0.9 * 1024 * 1024 * 1024,
expectedNumberOfPageShards: 5,
},
]

const scenarios = []
for (let nodeShardsParams of nodeShardsScenarios) {
for (let pageShardsParams of pageShardsScenarios) {
scenarios.push([
nodeShardsParams.numberOfNodes,
nodeShardsParams.simulatedNodeObjectSize,
nodeShardsParams.expectedNumberOfNodeShards,
pageShardsParams.numberOfPages,
pageShardsParams.simulatedPageObjectSize,
pageShardsParams.expectedNumberOfPageShards,
])
}
}
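// A quick sanity check of the expected shard counts above, assuming the chunk
// size heuristic simply divides the per-shard limit by the worst-case item size
// (the names below are illustrative, not part of the test):
//
//   const SHARD_LIMIT = 1.5 * 1024 * 1024 * 1024
//   const itemsPerShard = Math.max(1, Math.floor(SHARD_LIMIT / simulatedObjectSize))
//   const expectedShards = Math.ceil(numberOfItems / itemsPerShard)
//
// e.g. 5 nodes at 0.6 GB each -> floor(1.5 / 0.6) = 2 per shard -> ceil(5 / 2) = 3 shards,
// and 5 pages at 0.9 GB each -> floor(1.5 / 0.9) = 1 per shard -> ceil(5 / 1) = 5 shards.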

it.each(scenarios)(
`Scenario Nodes %i x %i bytes = %i shards / Pages %i x %i bytes = %i shards`,
async (
numberOfNodes,
simulatedNodeObjectSize,
expectedNumberOfNodeShards,
numberOfPages,
simulatedPageObjectSize,
expectedNumberOfPageShards
) => {
// just some baseline checks to make sure the test setup is correct - check both the in-memory state and the persisted state
// and make sure they are empty
const initialStateInMemory = store.getState()
expect(initialStateInMemory.pages).toEqual(new Map())
expect(initialStateInMemory.nodes).toEqual(new Map())

// we expect to have no persisted state yet - this returns an empty object
// and lets redux use the initial state for all redux slices
const initialPersistedState = readState()
expect(initialPersistedState.pages).toBeUndefined()
expect(initialPersistedState.nodes).toBeUndefined()
expect(initialPersistedState).toEqual({})

for (let nodeIndex = 0; nodeIndex < numberOfNodes; nodeIndex++) {
store.dispatch(
createNode(
{
id: `node-${nodeIndex}`,
context: {
objectType: `node`,
},
internal: {
type: `Foo`,
contentDigest: `contentDigest-${nodeIndex}`,
},
},
{ name: `gatsby-source-test` }
)
)
}

createPages(
new Array(numberOfPages).fill(undefined).map((_, index) => {
return {
path: `/page-${index}/`,
component: `/Users/username/dev/site/src/templates/my-sweet-new-page.js`,
context: {
objectType: `page`,
possiblyHugeField: `let's pretend this field is huge (we will simulate that by mocking some things used to assess the size of the object)`,
},
}
})
)

const currentStateInMemory = store.getState()
expect(currentStateInMemory.nodes.size).toEqual(numberOfNodes)
expect(currentStateInMemory.pages.size).toEqual(numberOfPages)

// this is just to make sure that any implementation changes in readState
// won't affect this test - we clone the current state and will
// use the clones for assertions
const clonedCurrentNodes = new Map(currentStateInMemory.nodes)
const clonedCurrentPages = new Map(currentStateInMemory.pages)

// we still expect no persisted state, and the current in-memory state shouldn't affect it
const persistedStateBeforeSaving = readState()
expect(persistedStateBeforeSaving.pages).toBeUndefined()
expect(persistedStateBeforeSaving.nodes).toBeUndefined()
expect(persistedStateBeforeSaving).toEqual({})

// simulate that nodes/pages have the sizes set in the scenario parameters
// the mock switches the implementation to JSON.stringify because calling v8.serialize
// again causes max call stack size errors :shrug: - this also requires adjusting
// the deserialize implementation
v8Serialize.mockImplementation(obj => {
if (obj?.[1]?.context?.objectType === `node`) {
return {
toString: () => JSON.stringify(obj),
length: simulatedNodeObjectSize,
}
} else if (obj?.[1]?.context?.objectType === `page`) {
return {
toString: () => JSON.stringify(obj),
length: simulatedPageObjectSize,
}
} else {
return JSON.stringify(obj)
}
})
v8Deserialize.mockImplementation(obj => JSON.parse(obj.toString()))
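// (the obj?.[1]?.context checks above match the individual [id, value] entries
// that the chunk size heuristic appears to measure one at a time; for those calls
// the mock reports the simulated size, everything else falls back to JSON.stringify)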

await saveState()

const shardsWritten = {
rest: 0,
node: 0,
page: 0,
}

for (let fileWritten of mockWrittenContent.keys()) {
const basename = path.basename(fileWritten)
if (basename.startsWith(`redux.rest`)) {
shardsWritten.rest++
} else if (basename.startsWith(`redux.node`)) {
shardsWritten.node++
} else if (basename.startsWith(`redux.page`)) {
shardsWritten.page++
}
}

expect(writeToCache).toBeCalled()

expect(shardsWritten.rest).toEqual(1)
expect(shardsWritten.node).toEqual(expectedNumberOfNodeShards)
expect(shardsWritten.page).toEqual(expectedNumberOfPageShards)

// and finally - let's make sure that reading the shards stitches them back together
// correctly
const persistedStateAfterSaving = readState()

expect(persistedStateAfterSaving.nodes).toEqual(clonedCurrentNodes)
expect(persistedStateAfterSaving.pages).toEqual(clonedCurrentPages)
}
)
})
})
66 changes: 53 additions & 13 deletions packages/gatsby/src/redux/persist.ts
@@ -9,7 +9,7 @@ import {
removeSync,
writeFileSync,
} from "fs-extra"
import { IGatsbyNode, ICachedReduxState } from "./types"
import { IGatsbyNode, ICachedReduxState, IGatsbyPage } from "./types"
import { sync as globSync } from "glob"
import report from "gatsby-cli/lib/reporter"

@@ -28,16 +28,19 @@ function reduxSharedFile(dir: string): string {
function reduxChunkedNodesFilePrefix(dir: string): string {
return path.join(dir, `redux.node.state_`)
}
function reduxChunkedPagesFilePrefix(dir: string): string {
return path.join(dir, `redux.page.state_`)
}
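// With this prefix, page shards end up as files like `redux.page.state_0`,
// `redux.page.state_1`, ... in the redux cache folder, mirroring the existing
// `redux.node.state_*` shards written by prepareCacheFolder below.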

function readFromLegacyCache(): ICachedReduxState {
return v8.deserialize(readFileSync(getLegacyCacheFile()))
}

export function readFromCache(): ICachedReduxState {
// The cache is stored in two steps; the nodes in chunks and the rest
// First we revive the rest, then we inject the nodes into that obj (if any)
// The cache is stored in two steps; the nodes and pages in chunks and the rest
// First we revive the rest, then we inject the nodes and pages into that obj (if any)
// Each chunk is stored in its own file, this circumvents max buffer lengths
// for sites with a _lot_ of content. Since all nodes go into a Map, the order
// for sites with a _lot_ of content. Since all nodes / pages go into a Map, the order
// of reading them is not relevant.

const reduxCacheFolder = getReduxCacheFolder()
@@ -51,13 +54,13 @@ export function readFromCache(): ICachedReduxState {
)

// Note: at 1M pages, this will be 1M/chunkSize chunks (ie. 1m/10k=100)
const chunks = globSync(
const nodesChunks = globSync(
reduxChunkedNodesFilePrefix(reduxCacheFolder) + `*`
).map(file => v8.deserialize(readFileSync(file)))

const nodes: Array<[string, IGatsbyNode]> = [].concat(...chunks)
const nodes: Array<[string, IGatsbyNode]> = [].concat(...nodesChunks)

if (!chunks.length) {
if (!nodesChunks.length) {
report.info(
`Cache exists but contains no nodes. There should be at least some nodes available so it seems the cache was corrupted. Disregarding the cache and proceeding as if there was none.`
)
@@ -67,10 +70,29 @@

obj.nodes = new Map(nodes)

// Note: at 1M pages, this will be 1M/chunkSize chunks (ie. 1m/10k=100)
const pagesChunks = globSync(
reduxChunkedPagesFilePrefix(reduxCacheFolder) + `*`
).map(file => v8.deserialize(readFileSync(file)))

const pages: Array<[string, IGatsbyPage]> = [].concat(...pagesChunks)

if (!pagesChunks.length) {
report.info(
`Cache exists but contains no pages. There should be at least some pages available so it seems the cache was corrupted. Disregarding the cache and proceeding as if there was none.`
)
// TODO: this is a DeepPartial<ICachedReduxState> but requires a big change
return {} as ICachedReduxState
}

obj.pages = new Map(pages)

return obj
}

function guessSafeChunkSize(values: Array<[string, IGatsbyNode]>): number {
export function guessSafeChunkSize(
values: Array<[string, IGatsbyNode]> | Array<[string, IGatsbyPage]>
): number {
// Pick a few random elements and measure their size, then pick a chunk size
// ceiling based on the worst case. Each measurement takes time, so there's a trade-off.
// This attempts to prevent small sites with very large pages from OOMing.
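// A minimal sketch of what such a heuristic can look like, for illustration only
// (the function body is collapsed in this diff; MAX_SHARD_SIZE and the sample
// count are assumed names/values - the 1.5 GB figure comes from the test comment):
//
//   function guessSafeChunkSizeSketch(values: Array<[string, unknown]>): number {
//     const MAX_SHARD_SIZE = 1.5 * 1024 * 1024 * 1024
//     const samples = 11
//     const step = Math.max(1, Math.floor(values.length / samples))
//     let worstCase = 1
//     for (let i = 0; i < values.length; i += step) {
//       worstCase = Math.max(worstCase, v8.serialize(values[i]).length)
//     }
//     return Math.max(1, Math.floor(MAX_SHARD_SIZE / worstCase))
//   }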
@@ -103,18 +125,22 @@ function prepareCacheFolder(
targetDir: string,
contents: ICachedReduxState
): void {
// Temporarily save the nodes and remove them from the main redux store
// Temporarily save the nodes and pages and remove them from the main redux store
// This prevents an OOM when the nodes and pages collectively contain too much data
const map = contents.nodes
const nodesMap = contents.nodes
contents.nodes = undefined

const pagesMap = contents.pages
contents.pages = undefined

writeFileSync(reduxSharedFile(targetDir), v8.serialize(contents))
// Now restore them on the redux store
contents.nodes = map
contents.nodes = nodesMap
contents.pages = pagesMap

if (map) {
if (nodesMap) {
// Now store the nodes separately, chunk size determined by a heuristic
const values: Array<[string, IGatsbyNode]> = [...map.entries()]
const values: Array<[string, IGatsbyNode]> = [...nodesMap.entries()]
const chunkSize = guessSafeChunkSize(values)
const chunks = Math.ceil(values.length / chunkSize)

@@ -125,6 +151,20 @@
)
}
}

if (pagesMap) {
// Now store the pages separately, chunk size determined by a heuristic
const values: Array<[string, IGatsbyPage]> = [...pagesMap.entries()]
const chunkSize = guessSafeChunkSize(values)
const chunks = Math.ceil(values.length / chunkSize)

for (let i = 0; i < chunks; ++i) {
writeFileSync(
reduxChunkedPagesFilePrefix(targetDir) + i,
v8.serialize(values.slice(i * chunkSize, i * chunkSize + chunkSize))
)
}
}
}

function safelyRenameToBak(reduxCacheFolder: string): string {
2 changes: 1 addition & 1 deletion packages/gatsby/src/redux/types.ts
@@ -288,7 +288,7 @@ export interface ICachedReduxState {
webpackCompilationHash: IGatsbyState["webpackCompilationHash"]
pageDataStats: IGatsbyState["pageDataStats"]
pageData: IGatsbyState["pageData"]
pages: IGatsbyState["pages"]
pages?: IGatsbyState["pages"]
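// pages becomes optional because prepareCacheFolder now strips it out of the shared
// shard and persists it as separate chunked files, mirroring the existing handling of nodes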
staticQueriesByTemplate: IGatsbyState["staticQueriesByTemplate"]
pendingPageDataWrites: IGatsbyState["pendingPageDataWrites"]
queries: IGatsbyState["queries"]
