Fix for #22951 (#22953)

Closed · wants to merge 15 commits
Changes from 3 commits
@@ -117,6 +117,11 @@ public class GroupIntoBatches<K, InputT>
*/
@AutoValue
public abstract static class BatchingParams<InputT> implements Serializable {
lukecwik (Member):
It looks like you're adding support for GroupIntoBatches to limit on count and byte size at the same time.

Can you add tests that cover this new scenario (see the sketch after this list) to:

  • GroupIntoBatchesTest
  • GroupIntoBatchesTranslationTest
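
For illustration only, a rough sketch of what such a test might look like (hypothetical: the test name, limits, and explicit weigher are invented here; pipeline is assumed to be a TestPipeline @Rule and assertTrue is JUnit's):

// Hypothetical test sketch, not the PR's actual test code. The byte-bound
// assertion assumes the flush-before-overflow behavior discussed later in
// this thread.
@Test
public void testBatchingOnElementCountAndByteSizeTogether() {
  PCollection<KV<String, Iterable<String>>> batches =
      pipeline
          .apply(Create.of(KV.of("k", "aaaa"), KV.of("k", "bbbb"), KV.of("k", "cccc")))
          .apply(
              GroupIntoBatches.<String, String>ofSize(5)
                  .withByteSize(10, (String s) -> (long) s.length()));
  PAssert.that(batches)
      .satisfies(
          iterable -> {
            for (KV<String, Iterable<String>> batch : iterable) {
              long count = 0;
              long bytes = 0;
              for (String s : batch.getValue()) {
                count++;
                bytes += s.length();
              }
              assertTrue(count <= 5);
              // A single element larger than the byte limit may still be
              // emitted as its own one-element batch.
              assertTrue(count == 1 || bytes <= 10);
            }
            return null;
          });
  pipeline.run();
}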

+  public static <InputT> BatchingParams<InputT> createDefault() {
+    return new AutoValue_GroupIntoBatches_BatchingParams(
+        Long.MAX_VALUE, Long.MAX_VALUE, null, Duration.ZERO);
+  }
+
  public static <InputT> BatchingParams<InputT> create(
      long batchSize,
      long batchSizeBytes,
@@ -170,8 +175,7 @@ private GroupIntoBatches(BatchingParams<InputT> params) {
  /** Aim to create batches each with the specified element count. */
  public static <K, InputT> GroupIntoBatches<K, InputT> ofSize(long batchSize) {
    Preconditions.checkState(batchSize < Long.MAX_VALUE);
-    return new GroupIntoBatches<>(
-        BatchingParams.create(batchSize, Long.MAX_VALUE, null, Duration.ZERO));
+    return new GroupIntoBatches<K, InputT>(BatchingParams.createDefault()).withSize(batchSize);
  }

/**
@@ -185,9 +189,8 @@ public static <K, InputT> GroupIntoBatches<K, InputT> ofSize(long batchSize) {
   * {@link #ofByteSize(long, SerializableFunction)} to specify code to calculate the byte size.
   */
  public static <K, InputT> GroupIntoBatches<K, InputT> ofByteSize(long batchSizeBytes) {
-    Preconditions.checkState(batchSizeBytes < Long.MAX_VALUE);
-    return new GroupIntoBatches<>(
-        BatchingParams.create(Long.MAX_VALUE, batchSizeBytes, null, Duration.ZERO));
+    return new GroupIntoBatches<K, InputT>(BatchingParams.createDefault())
+        .withByteSize(batchSizeBytes);
  }

/**
@@ -196,16 +199,49 @@ public static <K, InputT> GroupIntoBatches<K, InputT> ofByteSize(
*/
  public static <K, InputT> GroupIntoBatches<K, InputT> ofByteSize(
      long batchSizeBytes, SerializableFunction<InputT, Long> getElementByteSize) {
-    Preconditions.checkState(batchSizeBytes < Long.MAX_VALUE);
-    return new GroupIntoBatches<>(
-        BatchingParams.create(Long.MAX_VALUE, batchSizeBytes, getElementByteSize, Duration.ZERO));
+    return new GroupIntoBatches<K, InputT>(BatchingParams.createDefault())
+        .withByteSize(batchSizeBytes, getElementByteSize);
  }

  /** Returns user supplied parameters for batching. */
  public BatchingParams<InputT> getBatchingParams() {
    return params;
  }

+  /** @see #ofSize(long) */
+  public GroupIntoBatches<K, InputT> withSize(long batchSize) {
+    Preconditions.checkState(batchSize < Long.MAX_VALUE);
+    return new GroupIntoBatches<>(
+        BatchingParams.create(
+            batchSize,
+            params.getBatchSizeBytes(),
+            params.getElementByteSize(),
+            params.getMaxBufferingDuration()));
+  }
+
+  /** @see #ofByteSize(long) */
+  public GroupIntoBatches<K, InputT> withByteSize(long batchSizeBytes) {
+    Preconditions.checkState(batchSizeBytes < Long.MAX_VALUE);
+    return new GroupIntoBatches<>(
+        BatchingParams.create(
+            params.getBatchSize(),
+            batchSizeBytes,
+            params.getElementByteSize(),
+            params.getMaxBufferingDuration()));
+  }
+
+  /** @see #ofByteSize(long, SerializableFunction) */
+  public GroupIntoBatches<K, InputT> withByteSize(
+      long batchSizeBytes, SerializableFunction<InputT, Long> getElementByteSize) {
+    Preconditions.checkState(batchSizeBytes < Long.MAX_VALUE);
+    return new GroupIntoBatches<>(
+        BatchingParams.create(
+            params.getBatchSize(),
+            batchSizeBytes,
+            getElementByteSize,
+            params.getMaxBufferingDuration()));
+  }

  /**
   * Sets a time limit (in processing time) on how long an incomplete batch of elements is allowed
   * to be buffered. Once a batch is flushed to output, the timer is reset. The provided limit must
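As an aside, the factory methods and the new with* methods compose, so both limits and a buffering duration can be set on one transform. A small illustrative usage (not from the PR; the element type, limits, and weigher are invented, and Duration is Joda-Time's as used by Beam):

// Illustrative only: a batch closes on whichever limit trips first per key:
// 500 elements, 1 MiB as measured by the supplied weigher, or one minute of buffering.
PCollection<KV<String, Iterable<String>>> batched =
    input.apply(
        GroupIntoBatches.<String, String>ofSize(500)
            .withByteSize(1L << 20, s -> (long) s.length())
            .withMaxBufferingDuration(Duration.standardMinutes(1)));
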
@@ -113,6 +113,9 @@ class BatchLoads<DestinationT, ElementT>
  // If user triggering is supplied, we will trigger the file write after this many records are
  // written.
  static final int FILE_TRIGGERING_RECORD_COUNT = 500000;
+  // If user triggering is supplied, we will trigger the file write after this many bytes are
+  // written.
+  static final long FILE_TRIGGERING_BYTE_COUNT = 100 * (1L << 20); // 100MiB
lukecwik (Member):
It looks like we already have a memory limit for writing: 20 parallel writers with 64 MB buffers. Should we limit this triggering to 64 MB as well so that it fits in one chunk?

CC: @reuvenlax, any suggestions here?

Contributor Author:

@lukecwik Having the same limit as the buffer actually makes sense to me, but can you point me to where I might find that limit? I can see it in the comments for DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE, but instead of hardcoding 64 MB here as well, I would rather reference the original limit directly.

Contributor Author:

@lukecwik
On second thought, I think there is a problem with using this 64 MB default. We only flush the batch inside GroupIntoBatches once storedBatchSizeBytes is greater than or equal to the limit. So if we make the limit 64 MB, more than likely we will flush just a bit more than 64 MB, and the batch won't fit into the 64 MB buffer.

So either the triggering byte count should be some percentage smaller than the 64 MB default, or GroupIntoBatches has to be modified so that if the current element would push the batch over the byte size limit, the batch is fired without that element being added to it first. The second seems like the better solution, but I assume doing the storedBatchSizeBytes.read() sooner would have a performance impact.

lukecwik (Member):

Sorry, I didn't see this comment, but I agree that we should change GroupIntoBatches to ensure that if adding an element would take the batch over the limit, we flush the batch first.

Pseudo-code would be like:

byteSize = measure(obj)
if (byteSize >= byteSizeLimit) {
  // obj alone reaches the limit: emit it as a single-element batch
  output obj as single element batch
  continue
}
if (byteSize + previousNumBytes > byteSizeLimit) {
  // flush the buffer first, then start the next batch with obj
  output all buffered elements as batch
  add obj to the (now empty) batch
} else {
  add obj to batch
}
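
A more concrete rendering of that idea in plain Java (a sketch only, not Beam's actual state-backed implementation; the buffer and running byte count stand in for the transform's bag state and storedBatchSizeBytes):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.ToLongFunction;

/** Sketch of the flush-before-overflow policy from the pseudo-code above. */
class ByteLimitedBatcher<T> {
  private final long byteSizeLimit;
  private final ToLongFunction<T> weigher; // stand-in for getElementByteSize
  private final List<T> buffer = new ArrayList<>();
  private long bufferedBytes = 0;

  ByteLimitedBatcher(long byteSizeLimit, ToLongFunction<T> weigher) {
    this.byteSizeLimit = byteSizeLimit;
    this.weigher = weigher;
  }

  /** Adds an element and returns any batches completed as a result. */
  List<List<T>> add(T element) {
    List<List<T>> completed = new ArrayList<>();
    long byteSize = weigher.applyAsLong(element);
    if (byteSize >= byteSizeLimit) {
      // An element that alone reaches the limit is emitted as its own batch.
      completed.add(Collections.singletonList(element));
      return completed;
    }
    if (bufferedBytes + byteSize > byteSizeLimit) {
      // Adding the element would overflow the limit, so flush the buffer
      // first; the element then starts the next batch.
      completed.add(new ArrayList<>(buffer));
      buffer.clear();
      bufferedBytes = 0;
    }
    buffer.add(element);
    bufferedBytes += byteSize;
    return completed;
  }
}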

Contributor Author:

I used a different algorithm, but IMO it stays close to the original concept of the transform now.


  // If using auto-sharding for unbounded data, we batch the records before triggering file write
  // to avoid generating too many small files.

@@ -647,6 +650,7 @@ PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFil
    return input
        .apply(
            GroupIntoBatches.<DestinationT, ElementT>ofSize(FILE_TRIGGERING_RECORD_COUNT)
+                .withByteSize(FILE_TRIGGERING_BYTE_COUNT)
                .withMaxBufferingDuration(maxBufferingDuration)
                .withShardedKey())
        .setCoder(