web/src/webgpu.ts - tvm - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 import { assert } from "./support";
 import { Pointer } from "./ctypes";
 import { Memory } from "./memory";
 import { Disposable } from "./types";

 /** A pointer to points to the raw address space. */
 export type GPUPointer = number;

 export interface GPUDeviceDetectOutput {
   adapter: GPUAdapter;
   adapterInfo: GPUAdapterInfo;
   device: GPUDevice;
 }

 /**
  * DetectGPU device in the environment.
  */
 export async function detectGPUDevice(powerPreference: "low-power" | "high-performance" = "high-performance"): Promise<GPUDeviceDetectOutput | undefined> {
   if (typeof navigator !== "undefined" && navigator.gpu !== undefined) {
     const adapter = await navigator.gpu.requestAdapter({ powerPreference });
     if (adapter == null) {
       throw Error(
         "Unable to find a compatible GPU. This issue might be because your computer doesn't have a GPU, or your system settings are not configured properly. " +
           "Please check if your device has a GPU properly set up and if your your browser supports WebGPU. " +
           "You can also consult your browser's compatibility chart to see if it supports WebGPU. " +
           "For more information about WebGPU support in your browser, visit https://webgpureport.org/"
       );
     }
     const computeMB = (value: number) => {
       return Math.ceil(value / (1 << 20)) + "MB";
     }

     // more detailed error message
     let requiredMaxBufferSize = 1 << 30;  // 1GB
     if (requiredMaxBufferSize > adapter.limits.maxBufferSize) {
       // If 1GB is too large, try 256MB (default size stated in WebGPU doc)
       const backupRequiredMaxBufferSize = 1 << 28;  // 256MB
       console.log(
         `Requested maxBufferSize exceeds limit. \n` +
         `requested=${computeMB(requiredMaxBufferSize)}, \n` +
         `limit=${computeMB(adapter.limits.maxBufferSize)}. \n` +
         `WARNING: Falling back to ${computeMB(backupRequiredMaxBufferSize)}...`
       );
       requiredMaxBufferSize = backupRequiredMaxBufferSize;
       if (backupRequiredMaxBufferSize > adapter.limits.maxBufferSize) {
         // Fail if 256MB is still too big
         throw Error(
           `Cannot initialize runtime because of requested maxBufferSize ` +
           `exceeds limit. requested=${computeMB(backupRequiredMaxBufferSize)}, ` +
           `limit=${computeMB(adapter.limits.maxBufferSize)}. ` +
           `Consider upgrading your browser.`
         );
       }
     }

     let requiredMaxStorageBufferBindingSize = 1 << 30;  // 1GB
     if (requiredMaxStorageBufferBindingSize > adapter.limits.maxStorageBufferBindingSize) {
       // If 1GB is too large, try 128MB (default size stated in WebGPU doc)
       const backupRequiredMaxStorageBufferBindingSize = 1 << 27;  // 128MB
       console.log(
         `Requested maxStorageBufferBindingSize exceeds limit. \n` +
         `requested=${computeMB(requiredMaxStorageBufferBindingSize)}, \n` +
         `limit=${computeMB(adapter.limits.maxStorageBufferBindingSize)}. \n` +
         `WARNING: Falling back to ${computeMB(backupRequiredMaxStorageBufferBindingSize)}...`
       );
       requiredMaxStorageBufferBindingSize = backupRequiredMaxStorageBufferBindingSize;
       if (backupRequiredMaxStorageBufferBindingSize > adapter.limits.maxStorageBufferBindingSize) {
         // Fail if 128MB is still too big
         throw Error(
           `Cannot initialize runtime because of requested maxStorageBufferBindingSize ` +
           `exceeds limit. requested=${computeMB(backupRequiredMaxStorageBufferBindingSize)}, ` +
           `limit=${computeMB(adapter.limits.maxStorageBufferBindingSize)}. `
         );
       }
     }

     const requiredMaxComputeWorkgroupStorageSize = 32 << 10;
     if (requiredMaxComputeWorkgroupStorageSize > adapter.limits.maxComputeWorkgroupStorageSize) {
       throw Error(
         `Cannot initialize runtime because of requested maxComputeWorkgroupStorageSize ` +
         `exceeds limit. requested=${requiredMaxComputeWorkgroupStorageSize}, ` +
         `limit=${adapter.limits.maxComputeWorkgroupStorageSize}. `
       );
     }

     const requiredMaxStorageBuffersPerShaderStage = 10;  // default is 8
     if (requiredMaxStorageBuffersPerShaderStage > adapter.limits.maxStorageBuffersPerShaderStage) {
       throw Error(
         `Cannot initialize runtime because of requested maxStorageBuffersPerShaderStage ` +
         `exceeds limit. requested=${requiredMaxStorageBuffersPerShaderStage}, ` +
         `limit=${adapter.limits.maxStorageBuffersPerShaderStage}. `
       );
     }

     const requiredFeatures: GPUFeatureName[] = [];
     // Always require f16 if available
     if (adapter.features.has("shader-f16")) {
       requiredFeatures.push("shader-f16");
     }

     // requestAdapterInfo() is deprecated, causing requestAdapterInfo to raise
     // issue when building. However, it is still needed for older browsers, hence `as any`.
     const adapterInfo = adapter.info || await (adapter as any).requestAdapterInfo();
     const device = await adapter.requestDevice({
       requiredLimits: {
         maxBufferSize: requiredMaxBufferSize,
         maxStorageBufferBindingSize: requiredMaxStorageBufferBindingSize,
         maxComputeWorkgroupStorageSize: requiredMaxComputeWorkgroupStorageSize,
         maxStorageBuffersPerShaderStage: requiredMaxStorageBuffersPerShaderStage,
       },
       requiredFeatures
     });
     return {
       adapter: adapter,
       adapterInfo: adapterInfo,
       device: device
     };
   } else {
     return undefined;
   }
 }

 /**
  * Create GPU buffer with `createBuffer()` but with error catching; destroy if error caught.
  * @param device The GPUDevice used to create a buffer.
  * @param descriptor The GPUBufferDescriptor passed to `createBuffer()`.
  * @returns The buffer created by `createBuffer()`.
  *
  * @note We treat any error occurred at `createBuffer()` fatal and expect the user to handle
  *   `device.destroy()` with `device.lost.then()`.
  */
 function tryCreateBuffer(device: GPUDevice, descriptor: GPUBufferDescriptor) {
   device.pushErrorScope("out-of-memory");
   device.pushErrorScope("validation");
   device.pushErrorScope("internal");

   const buffer = device.createBuffer(descriptor);

   device.popErrorScope().then((error) => {if (error) {device.destroy(); console.error(error);}});
   device.popErrorScope().then((error) => {if (error) {device.destroy(); console.error(error);}});
   device.popErrorScope().then((error) => {if (error) {device.destroy(); console.error(error);}});

   return buffer;
 }

 const canvasRenderWGSL = `
 @group(0) @binding(0) var my_sampler : sampler;
 @group(0) @binding(1) var my_texture : texture_2d<f32>;

 struct VertexOutput {
   @builtin(position) position : vec4<f32>,
   @location(0) uv : vec2<f32>,
 }

 @vertex
 fn vertex_main(@builtin(vertex_index) vidx : u32) -> VertexOutput {
   const pos = array(
     vec2( 1.0,  1.0),
     vec2( 1.0, -1.0),
     vec2(-1.0, -1.0),
     vec2( 1.0,  1.0),
     vec2(-1.0, -1.0),
     vec2(-1.0,  1.0),
   );

   const uv = array(
     vec2(1.0, 0.0),
     vec2(1.0, 1.0),
     vec2(0.0, 1.0),
     vec2(1.0, 0.0),
     vec2(0.0, 1.0),
     vec2(0.0, 0.0),
   );

   var output : VertexOutput;
   output.position = vec4(pos[vidx], 0.0, 1.0);
   output.uv = uv[vidx];
   return output;
 }

 @fragment
 fn fragment_main(@location(0) uv : vec2<f32>) -> @location(0) vec4<f32> {
   return textureSample(my_texture, my_sampler, uv);
 }

 @fragment
 fn fragment_clear(@location(0) uv : vec2<f32>) -> @location(0) vec4<f32> {
   return vec4(1.0, 1.0, 1.0, 1.0);
 }
 `
 class CanvasRenderManager implements Disposable {
   private device: GPUDevice;
   private canvasContext: GPUCanvasContext;
   private stagingTexture: GPUTexture;
   private renderSampler: GPUSampler;
   private renderPipeline: GPURenderPipeline;
   private clearPipeline: GPURenderPipeline;
   private canvasTextureFormat: GPUTextureFormat;

   constructor(device: GPUDevice, canvas: HTMLCanvasElement) {
     this.device = device;
     const ctx = canvas.getContext("webgpu");
     if (ctx == null) {
       throw Error("Cannot bind WebGPU context");
     }
     // avoid possible ts complain
     this.canvasContext = ctx as any;
     this.canvasTextureFormat = navigator.gpu.getPreferredCanvasFormat();
     this.canvasContext.configure({
       device: this.device,
       format: this.canvasTextureFormat,
       alphaMode: "opaque",
     });

     this.renderPipeline = device.createRenderPipeline({
       layout: "auto",
       vertex: {
         module: device.createShaderModule({
           code: canvasRenderWGSL,
         }),
         entryPoint: "vertex_main",
       },
       fragment: {
         module: device.createShaderModule({
           code: canvasRenderWGSL,
         }),
         entryPoint: "fragment_main",
         targets: [{
           format: this.canvasTextureFormat,
         }],
       },
       primitive: {
         topology: "triangle-list",
       },
     });

     this.clearPipeline = device.createRenderPipeline({
       layout: "auto",
       vertex: {
         module: device.createShaderModule({
           code: canvasRenderWGSL,
         }),
         entryPoint: "vertex_main",
       },
       fragment: {
         module: device.createShaderModule({
           code: canvasRenderWGSL,
         }),
         entryPoint: "fragment_clear",
         targets: [{
           format: this.canvasTextureFormat,
         }],
       },
       primitive: {
         topology: "triangle-list",
       },
     });

     this.renderSampler = device.createSampler({
       magFilter: "linear",
       minFilter: "linear",
     });
     // staging texture always be in RGBA
     this.stagingTexture = device.createTexture({
       size: [canvas.height, canvas.width, 1],
       format: "rgba8unorm",
       usage:
         GPUTextureUsage.TEXTURE_BINDING |
         GPUTextureUsage.COPY_DST |
         GPUTextureUsage.RENDER_ATTACHMENT,
     });
   }

   clear() {
     const commandEncoder = this.device.createCommandEncoder();
     const passEncoder = commandEncoder.beginRenderPass({
       colorAttachments: [
         {
           view: this.canvasContext.getCurrentTexture().createView(),
           clearValue: { r: 0.0, g: 0.0, b: 0.0, a: 1.0 },
           loadOp: "clear",
           storeOp: "store",
         },
       ],
     });
     passEncoder.setPipeline(this.clearPipeline);
     const renderBindingGroup = this.device.createBindGroup({
       layout: this.renderPipeline.getBindGroupLayout(0),
       entries: [
         { binding: 0, resource: this.renderSampler },
         { binding: 1, resource: this.stagingTexture.createView() },
       ],
     });
     passEncoder.setBindGroup(0, renderBindingGroup);
     passEncoder.draw(6, 1, 0, 0);
     passEncoder.end();
     this.device.queue.submit([commandEncoder.finish()]);
   }

   draw(buffer: GPUBuffer, height: number, width: number) {
     // resize the staging texture
     if (height != this.stagingTexture.height || width != this.stagingTexture.width) {
       this.stagingTexture.destroy();
       this.stagingTexture = this.device.createTexture({
         size: [height, width, 1],
         format: "rgba8unorm",
         usage:
           GPUTextureUsage.TEXTURE_BINDING |
           GPUTextureUsage.COPY_DST |
           GPUTextureUsage.RENDER_ATTACHMENT,
       });
     }

     const commandEncoder = this.device.createCommandEncoder();
     commandEncoder.copyBufferToTexture({
       buffer: buffer,
       offset: 0,
       bytesPerRow: this.stagingTexture.width * 4
     }, {
       texture: this.stagingTexture
     }, {
       width: this.stagingTexture.width,
       height: this.stagingTexture.height
     });

     const passEncoder = commandEncoder.beginRenderPass({
       colorAttachments: [
         {
           view: this.canvasContext.getCurrentTexture().createView(),
           clearValue: { r: 0.0, g: 0.0, b: 0.0, a: 1.0 },
           loadOp: "clear",
           storeOp: "store",
         },
       ],
     });
     passEncoder.setPipeline(this.renderPipeline);
     const renderBindingGroup = this.device.createBindGroup({
       layout: this.renderPipeline.getBindGroupLayout(0),
       entries: [
         { binding: 0, resource: this.renderSampler },
         { binding: 1, resource: this.stagingTexture.createView() },
       ],
     });
     passEncoder.setBindGroup(0, renderBindingGroup);
     passEncoder.draw(6, 1, 0, 0);
     passEncoder.end();
     this.device.queue.submit([commandEncoder.finish()]);
   }

   dispose(): void {
     this.stagingTexture.destroy();
   }
 }

 /**
  * Function info from the API
  */
 export interface FunctionInfo {
   name: string;
   arg_types: Array<string>;
   launch_param_tags: Array<string>;
 }

 /**
  * WebGPU context
  * Manages all the webgpu resources here.
  */
 export class WebGPUContext {
   device: GPUDevice;
   memory: Memory;
   // internal data
   private bufferTable: Array<GPUBuffer | undefined> = [undefined];
   private bufferTableFreeId: Array<number> = [];
   private podArgStagingBuffers: Array<GPUBuffer> = [];
   private canvasRenderManager?: CanvasRenderManager = undefined;
   // number of pod arg staging buffers
   private maxNumPodArgsStagingBuffers = 2;
   // flags for debugging
   // stats of the runtime.
   // peak allocation
   private peakAllocatedBytes = 0;
   // current allocation
   private currAllocatedBytes = 0;
   // all allocation(ignoring free)
   private allAllocatedBytes = 0;
   // shader submit counter
   private shaderSubmitCounter = 0;
   // limite number of shaders to be submitted, useful for debugging, default to -1
   protected debugShaderSubmitLimit = -1;
   // log and sync each step
   protected debugLogFinish = false;

   constructor(memory: Memory, device: GPUDevice) {
     this.memory = memory;
     this.device = device;
   }

   /**
    * Dispose context.
    */
   dispose() {
     this.canvasRenderManager?.dispose();
     this.bufferTableFreeId = [];
     while (this.bufferTable.length != 0) {
       this.bufferTable.pop()?.destroy();
     }
     while (this.podArgStagingBuffers.length != 0) {
       this.podArgStagingBuffers.pop()?.destroy();
     }
     this.device.destroy();
   }

   /**
    * Wait for all pending GPU tasks to complete
    */
   async sync(): Promise<void> {
     await this.device.queue.onSubmittedWorkDone();
   }

   /**
    * Obtain the runtime information in readable format.
    */
   runtimeStatsText(): string {
     let info = "peak-memory=" + Math.ceil(this.peakAllocatedBytes / (1 << 20)) + " MB";
     info += ", all-memory=" + Math.ceil(this.allAllocatedBytes / (1 << 20)) + " MB";
     info += ", shader-submissions=" + this.shaderSubmitCounter;
     return info;
   }

   /**
    * Draw image from data in storage buffer.
    * @param ptr The GPU ptr
    * @param height The height of the image.
    * @param width The width of the image.
    */
   drawImageFromBuffer(ptr: GPUPointer, height: number, width: number) {
     if (this.canvasRenderManager == undefined) {
       throw Error("Do not have a canvas context, call bindCanvas first");
     }
     this.canvasRenderManager.draw(this.gpuBufferFromPtr(ptr), height, width);
   }

   /**
    * Copy raw bytes into buffer ptr.
    *
    * @param rawBytes The raw bytes
    * @param toPtr The target gpu buffer ptr
    * @param toOffset The beginning offset
    * @param nbytes Number of bytes
    */
   copyRawBytesToBuffer(
     rawBytes: Uint8Array,
     toPtr: GPUPointer,
     toOffset: number,
     nbytes: number
   ): void {
     // Perhaps it would be more useful to use a staging buffer?
     this.device.queue.writeBuffer(
       this.gpuBufferFromPtr(toPtr),
       toOffset,
       rawBytes,
       0,
       nbytes
     );
   }
   /**
    * Clear canvas
    */
   clearCanvas() {
     this.canvasRenderManager?.clear();
   }

   /**
    * Bind a canvas element to the runtime.
    * @param canvas The HTML canvas/
    */
   bindCanvas(canvas: HTMLCanvasElement) {
     this.canvasRenderManager = new CanvasRenderManager(this.device, canvas);
   }

   /**
    * Create a PackedFunc that runs the given shader
    * via createComputePipeline
    *
    * @param info The function information already parsed as a record.
    * @param code The shader data(in WGSL)
    * @returns The shader
    */
   createShader(finfo: FunctionInfo, code: string): Function {
     return this.createShadeInternal(finfo, code, false) as Function;
   }

   /**
    * Create a PackedFunc that runs the given shader asynchronously
    * via createComputePipelineAsync
    *
    * @param info The function information already parsed as a record.
    * @param code The shader data(in WGSL)
    * @returns The shader
    */
   async createShaderAsync(finfo: FunctionInfo, code: string): Promise<Function> {
     return await (this.createShadeInternal(finfo, code, true) as Promise<Function>);
   }

   /**
    * Get the pod arg staging buffer
    * \param nbytes The minimum size.
    * \return The allocated buffer
    */
   private getPodArgsBuffer(nbytes: number): GPUBuffer {
     let buffer: GPUBuffer | undefined = undefined;
     if (this.podArgStagingBuffers.length >= this.maxNumPodArgsStagingBuffers) {
       buffer = this.podArgStagingBuffers.shift();
     }
     // minimum of 16 bytes
     let allocSize = 16;
     if (buffer !== undefined) {
       allocSize = buffer.size;
       if (buffer.size < nbytes) {
         buffer.destroy();
         buffer = undefined;
       }
     }
     while (allocSize < nbytes) {
       allocSize *= 2;
     }

     if (buffer == undefined) {
       // create uniform buffer
       buffer = tryCreateBuffer(this.device, {
         size: allocSize,
         usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
       });
     }
     assert(nbytes <= buffer.size);
     return buffer;
   }

   /**
    * Internal impl of createShader for both async and sync mode.
    *
    * @param info The function information already parsed as a record.
    * @param code The shader data(in WGSL)
    * @param asyncMode Whether use async mode.
    * @returns The shader function or promise of shader func.
    */
   private createShadeInternal(
     finfo: FunctionInfo,
     code: string,
     asyncMode: boolean
   ): Function | Promise<Function> {
     const dispatchToDim: Array<number> = [];
     let paramWriteAccess: Array<number> = [];

     for (let i = 0; i < finfo.launch_param_tags.length; ++i) {
       const tag: string = finfo.launch_param_tags[i];
       if (tag.startsWith("blockIdx.")) {
         const target: number = tag.charCodeAt(tag.length - 1) - ("x".charCodeAt(0));
         assert(target >= 0 && target < 3);
         dispatchToDim.push(target);
       } else if (tag.startsWith("threadIdx.")) {
         const target: number = tag.charCodeAt(tag.length - 1) - ("x".charCodeAt(0));
         assert(target >= 0 && target < 3);
         dispatchToDim.push(target + 3);
       } else if (tag.startsWith("paramWriteAccess:")) {
         paramWriteAccess = JSON.parse(tag.substring(17));
       } else {
         throw new Error("Cannot handle thread_axis " + tag);
       }
     }


     const layoutEntries: Array<GPUBindGroupLayoutEntry> = [];
     const bufferArgIndices: Array<number> = [];
     const podArgIndices: Array<number> = [];

     for (let i = 0; i < finfo.arg_types.length; ++i) {
       const dtype = finfo.arg_types[i];
       if (dtype == "handle") {
         layoutEntries.push({
           binding: bufferArgIndices.length,
           visibility: GPUShaderStage.COMPUTE,
           buffer: {
             type: paramWriteAccess[bufferArgIndices.length] ? "storage" : "read-only-storage"
           }
         });
         bufferArgIndices.push(i);
       } else if (dtype.startsWith("int") || dtype.startsWith("uint") || dtype.startsWith("float")) {
         podArgIndices.push(i);
       } else {
         throw new Error("Cannot handle argument type " + dtype + " in WebGPU shader");
       }
     }

     assert(paramWriteAccess.length == bufferArgIndices.length);
     // POD arguments are pass in the end
     layoutEntries.push({
       binding: bufferArgIndices.length,
       visibility: GPUShaderStage.COMPUTE,
       buffer: {
         type: "uniform"
       }
     });

     const bindGroupLayout = this.device.createBindGroupLayout({
       entries: layoutEntries
     });
     const pipelineLayout = this.device.createPipelineLayout({
       bindGroupLayouts: [bindGroupLayout]
     });

     // Function to create the pipeline.
     const createShaderFunc = (pipeline: GPUComputePipeline): Function => {
       const submitShader = (...args: Array<GPUPointer | number>): void => {
         if (this.debugShaderSubmitLimit != -1 &&
           this.shaderSubmitCounter >= this.debugShaderSubmitLimit) {
           this.shaderSubmitCounter += 1;
           return;
         }

         const commandEncoder = this.device.createCommandEncoder();
         const compute = commandEncoder.beginComputePass();
         compute.setPipeline(pipeline);
         const bindGroupEntries: Array<GPUBindGroupEntry> = [];
         const numBufferOrPodArgs = bufferArgIndices.length + podArgIndices.length;

         assert(args.length == numBufferOrPodArgs + dispatchToDim.length);

         const workDim: Array<number> = [1, 1, 1, 1, 1, 1];
         for (let i = 0; i < dispatchToDim.length; ++i) {
           workDim[dispatchToDim[i]] = args[numBufferOrPodArgs + i];
         }

         // get around 65535 restriction of blockIdx.x
         if (workDim[2] != 1) {
           throw Error("WebGPU: blockIdx.z is reserved for internal use");
         }
         const packDimX = workDim[0];
         // spread thinsg out into blockIdx.z
         if (workDim[0] >= (1 << 16)) {
           let wl_x = workDim[0];
           let wl_z = workDim[2];

           while (wl_x >= (1 << 16)) {
             if (wl_x % 2 == 0) {
               wl_x = wl_x / 2;
             } else {
               // pad up
               wl_x = (wl_x + 1) / 2;
             }
             wl_z *= 2;
           }
           workDim[0] = wl_x;
           workDim[2] = wl_z;
           assert(wl_x * wl_z >= packDimX);
         }

         for (let i = 0; i < bufferArgIndices.length; ++i) {
           bindGroupEntries.push({
             binding: i,
             resource: {
               buffer: this.gpuBufferFromPtr(args[bufferArgIndices[i]])
             }
           });
         }

         // push pod buffer
         const sizeOfI32 = 4;
         const podArgBuffer = this.getPodArgsBuffer((podArgIndices.length + 1) * sizeOfI32);
         const i32View = new Int32Array(podArgIndices.length + 1);
         const u32View = new Uint32Array(i32View.buffer);
         const f32View = new Float32Array(i32View.buffer);

         for (let i = 0; i < podArgIndices.length; ++i) {
           const value = args[podArgIndices[i]];
           const dtype = finfo.arg_types[podArgIndices[i]];
           if (dtype.startsWith("int")) {
             i32View[i] = value;
           } else if (dtype.startsWith("uint")) {
             u32View[i] = value;
           } else if (dtype.startsWith("float")) {
             f32View[i] = value;
           } else {
             throw Error("Unknown pod dtype " + dtype);
           }
         }
         // always pass in dim z launching grid size in
         u32View[podArgIndices.length] = packDimX;
         this.device.queue.writeBuffer(podArgBuffer, 0, i32View.buffer);

         bindGroupEntries.push({
           binding: bufferArgIndices.length,
           resource: {
             buffer: podArgBuffer,
             size: i32View.buffer.byteLength
           }
         });

         compute.setBindGroup(0, this.device.createBindGroup({
           layout: bindGroupLayout,
           entries: bindGroupEntries
         }));

         compute.dispatchWorkgroups(workDim[0], workDim[1], workDim[2])
         compute.end()
         const command = commandEncoder.finish();
         this.device.queue.submit([command]);

         if (this.debugLogFinish) {
           const currCounter = this.shaderSubmitCounter;
           this.device.queue.onSubmittedWorkDone().then(() => {
             console.log("[" + currCounter + "][Debug] finish shader" + finfo.name);
           });
         }
         this.shaderSubmitCounter += 1;
       };
       return submitShader;
     };

     const shaderModule = this.device.createShaderModule({
       code: code,
       compilationHints: [
         {
           entryPoint: "main",
           layout: pipelineLayout
         }
       ]
     });

     if (asyncMode) {
       return this.device.createComputePipelineAsync({
         layout: pipelineLayout,
         compute: {
           module: shaderModule,
           entryPoint: finfo.name
         }
       }).then((pipeline: GPUComputePipeline) => {
         return createShaderFunc(pipeline);
       });
     } else {
       const pipeline = this.device.createComputePipeline({
         layout: pipelineLayout,
         compute: {
           module: shaderModule,
           entryPoint: finfo.name
         }
       });
       return createShaderFunc(pipeline);
     }
   }

   /**
    * Get the device API according to its name
    * @param The name of the API.
    * @returns The corresponding device api.
    */
   getDeviceAPI(name: string): Function {
     if (name == "deviceAllocDataSpace") {
       return (nbytes: number): GPUPointer => {
         return this.deviceAllocDataSpace(nbytes);
       };
     } else if (name == "deviceFreeDataSpace") {
       return (ptr: GPUPointer): void => {
         return this.deviceFreeDataSpace(ptr);
       };
     } else if (name == "deviceCopyToGPU") {
       return (
         from: Pointer,
         to: GPUPointer,
         toOffset: number,
         nbytes: number
       ): void => {
         this.deviceCopyToGPU(from, to, toOffset, nbytes);
       };
     } else if (name == "deviceCopyFromGPU") {
       return (
         from: GPUPointer,
         fromOffset: number,
         to: Pointer,
         nbytes: number
       ): void => {
         this.deviceCopyFromGPU(from, fromOffset, to, nbytes);
       };
     } else if (name == "deviceCopyWithinGPU") {
       return (
         from: GPUPointer,
         fromOffset: number,
         to: Pointer,
         toOffset: number,
         nbytes: number
       ): void => {
         this.deviceCopyWithinGPU(from, fromOffset, to, toOffset, nbytes);
       };
     } else {
       throw new Error("Unknown DeviceAPI function " + name);
     }
   }

   // DeviceAPI
   private deviceAllocDataSpace(nbytes: number): GPUPointer {
     // allocate 0 bytes buffer as 1 bytes buffer.
     if (nbytes == 0) {
       nbytes = 1;
     }
     const buffer = tryCreateBuffer(this.device, {
       size: nbytes,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
     });
     this.currAllocatedBytes += nbytes;
     this.allAllocatedBytes += nbytes;
     if (this.currAllocatedBytes > this.peakAllocatedBytes) {
       this.peakAllocatedBytes = this.currAllocatedBytes;
     }
     const ptr = this.attachToBufferTable(buffer);
     return ptr;
   }

   private deviceFreeDataSpace(ptr: GPUPointer): void {
     const idx = ptr;
     const buffer = this.bufferTable[idx];
     this.bufferTable[idx] = undefined;
     assert(buffer !== undefined);
     this.bufferTableFreeId.push(idx);
     this.currAllocatedBytes -= buffer.size;
     buffer.destroy();
   }

   private deviceCopyToGPU(
     from: Pointer,
     to: GPUPointer,
     toOffset: number,
     nbytes: number
   ): void {
     // Perhaps it would be more useful to use a staging buffer?
     let rawBytes = this.memory.loadRawBytes(from, nbytes);
     if (rawBytes.length % 4 !== 0) {
       // writeBuffer requires length to be multiples of 4, so we pad here
       const toPad = 4 - rawBytes.length % 4;
       rawBytes = new Uint8Array(rawBytes.length + toPad);
       rawBytes.set(rawBytes);
       nbytes = nbytes + toPad;
     }
     this.device.queue.writeBuffer(
       this.gpuBufferFromPtr(to),
       toOffset,
       rawBytes,
       0,
       nbytes
     );
   }

   private deviceCopyFromGPU(
     from: GPUPointer,
     fromOffset: number,
     to: Pointer,
     nbytes: number
   ): void {
     // Perhaps it would be more useful to resuse a staging buffer?
     const gpuTemp = tryCreateBuffer(this.device, {
       size: nbytes,
       usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
     });

     const copyEncoder = this.device.createCommandEncoder();
     copyEncoder.copyBufferToBuffer(
       this.gpuBufferFromPtr(from),
       fromOffset,
       gpuTemp,
       0,
       nbytes
     );
     const copyCommands = copyEncoder.finish();
     this.device.queue.submit([copyCommands]);

     gpuTemp.mapAsync(GPUMapMode.READ).then(() => {
       const data = gpuTemp.getMappedRange();
       this.memory.storeRawBytes(to, new Uint8Array(data));
       gpuTemp.destroy();
     });
   }

   private deviceCopyWithinGPU(
     from: GPUPointer,
     fromOffset: number,
     to: Pointer,
     toOffset: number,
     nbytes: number
   ): void {
     const copyEncoder = this.device.createCommandEncoder();
     copyEncoder.copyBufferToBuffer(
       this.gpuBufferFromPtr(from),
       fromOffset,
       this.gpuBufferFromPtr(to),
       toOffset,
       nbytes
     );
     const copyCommands = copyEncoder.finish();
     this.device.queue.submit([copyCommands]);
   }

   private gpuBufferFromPtr(ptr: GPUPointer): GPUBuffer {
     const buffer = this.bufferTable[ptr];
     assert(buffer !== undefined);
     return buffer;
   }

   private attachToBufferTable(buffer: GPUBuffer): GPUPointer {
     if (this.bufferTableFreeId.length != 0) {
       const idx = this.bufferTableFreeId.pop() as number;
       this.bufferTable[idx] = buffer;
       return idx;
     } else {
       const idx = this.bufferTable.length;
       this.bufferTable.push(buffer);
       return idx;
     }
   }
 }