blob: 1c38ce39ee5b43b2fb3aa0ccf91fa1fd037ef7b7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// Scratch views sharing one 4-byte buffer: writing a float32 through
// `float32View` lets us read its raw IEEE 754 bit pattern via `int32View`.
const float32View = new Float32Array(1);
const int32View = new Int32Array(float32View.buffer);
/**
 * Convert a number to the bit pattern of an IEEE 754 half-precision
 * (binary16) float: 1 sign bit, 5 exponent bits, 10 mantissa bits.
 *
 * The value is first narrowed to float32 (by storing it in a Float32Array),
 * then re-packed. Mantissa bits that do not fit are truncated, not rounded.
 *
 * @param value The number to convert.
 * @returns The 16-bit pattern as a non-negative integer in [0, 0xffff].
 */
export function toFloat16(value: number): number {
  float32View[0] = value;
  const floatValue = int32View[0];
  const sign = (floatValue >>> 16) & 0x8000; // sign bit moved to bit 15
  const exponent = ((floatValue >>> 23) & 0xff) - 127; // unbiased float32 exponent
  const significand = floatValue & 0x7fffff; // 23-bit float32 mantissa
  if (exponent === 128) {
    // NaN or Infinity: keep NaN-ness by setting a mantissa bit when needed.
    return sign | 0x7c00 | (significand !== 0 ? 0x0200 : 0);
  }
  if (exponent > 15) {
    // Too large for half precision: overflow to Infinity.
    return sign | 0x7c00;
  }
  if (exponent < -24) {
    // Below the smallest half subnormal (2^-24): flush to signed zero.
    // This guard is required because the shift below would be >= 32 for
    // exponent <= -33, and JavaScript masks shift counts to 5 bits, which
    // would wrap the shift and leak significand bits into the result.
    return sign;
  }
  if (exponent < -14) {
    // Half-precision subnormal: restore the implicit leading 1 and shift it
    // into place. shift amount = 13 - 14 - exponent = -1 - exponent (14..23).
    return sign | ((significand | 0x800000) >> (-1 - exponent));
  }
  // Normal number: re-bias the exponent (15) and keep the top 10 mantissa bits.
  return sign | ((exponent + 15) << 10) | (significand >> 13);
}
// Scratch views over a shared 4-byte buffer, used to read the raw bits of a
// float32 as an unsigned 32-bit integer.
const float32ViewBf = new Float32Array(1);
const uint32ViewBf = new Uint32Array(float32ViewBf.buffer);
/**
 * Pack a number into bfloat16 bits (1 sign, 8 exponent, 7 mantissa).
 *
 * The value is narrowed to float32 and its upper 16 bits are kept, with the
 * discarded lower 16 bits rounded to nearest, ties to even. Infinity and NaN
 * inputs skip rounding and are truncated directly.
 *
 * @param value The number to convert.
 * @returns The 16-bit pattern as an integer in [0, 0xffff].
 */
export function toBFloat16(value: number): number {
  float32ViewBf[0] = value;
  const raw = uint32ViewBf[0];

  // Exponent field all ones => Infinity or NaN: no rounding, just truncate.
  const isInfOrNaN = ((raw >> 23) & 0xff) === 255;
  if (isInfOrNaN) {
    return (raw >> 16) & 0xffff;
  }

  // Adding half of the discarded range rounds to nearest (ties go up).
  let rounded = (raw + 0x8000) >> 16;

  // Low 17 bits equal to 0x8000 means: exactly halfway AND the kept LSB was
  // even. Rounding up made it odd, so step back down to the even neighbour.
  const halfwayFromEven = (raw & 0x1ffff) === 0x8000;
  if (halfwayFromEven && (rounded & 1) === 1) {
    rounded -= 1;
  }

  return rounded & 0xffff;
}