2025-08-21 17:09.18: New job: test ahrefs/ocannl https://github.com/ahrefs/ocannl.git#refs/heads/master (dbc50df8e2dc8257128cda47f8b3ca669707a340) (linux-x86_64:(lint-fmt))Base: ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47aocamlformat version: version 0.27.0 (from opam)To reproduce locally:git clone --recursive "https://github.com/ahrefs/ocannl.git" -b "master" && cd "ocannl" && git reset --hard dbc50df8cat > Dockerfile <<'END-OF-DOCKERFILE'FROM ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47aUSER 1000:1000RUN cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -uRUN opam depext -i duneWORKDIR /srcRUN opam depext -i ocamlformat=0.27.0COPY --chown=1000:1000 . /src/RUN opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)END-OF-DOCKERFILEdocker build .END-REPRO-BLOCK2025-08-21 17:09.18: Using cache hint "ahrefs/ocannl-ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a-debian-12-4.08_opam-2.4-ocamlformat-b8021439f8c57ba6435bc2263f6596671f4f4466"2025-08-21 17:09.18: Using OBuilder spec:((from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)(user (uid 1000) (gid 1000))(run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))(run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "opam depext -i dune"))(workdir /src)(run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "opam depext -i ocamlformat=0.27.0"))(copy (src .) (dst /src/))(run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)")))2025-08-21 17:09.18: Waiting for resource in pool OCluster2025-08-21 17:09.18: Waiting for worker…2025-08-21 17:10.35: Got resource from pool OClusterBuilding on eumacheHEAD is now at 9afb059a Customize `@claude` -> `@claude-opus` and set up the build environment for Claude mentions Note: the Claude PR review action not updated here.HEAD is now at dbc50df8 Give the Claude action for @-mentions very broad permissions(from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)Unable to find image 'ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a' locallydocker.io/ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a: Pulling from ocaml/opam2b437dab448b: Already exists348b54b4f842: Already existsd67c1d522a4e: Already existsdea856d1a4d8: Already existsb580a9d7b5b1: Already existsDigest: sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47aStatus: Downloaded newer image for ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a2025-08-21 17:11.20 ---> saved as "1d0024db739bd078f91b2384c47919652a4b72a425e3e24ce24cfd1f6debdfbc"/: (user (uid 1000) (gid 1000))/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))b8021439f8 Merge pull request #28261 from kit-ty-kate/deploy-fix-25819<><> Updating package repositories ><><><><><><><><><><><><><><><><><><><><><><>[default] Initialiseddefault (at git+file:///home/opam/opam-repository):[INFO] opam 2.1 and 2.2 include many performance and security improvements over 2.0; please consider upgrading (https://opam.ocaml.org/doc/Install.html)Everything as up-to-date as possible (run with --verbose to show unavailable upgrades).However, you may "opam upgrade" these packages explicitly, which will ask permission to downgrade or uninstall the conflicting packages.Nothing to do.# Run eval $(opam env) to update the current shell environment2025-08-21 17:12.50 ---> saved as "76d9d96bb26da3c78200d383fd35f876d80571baf05962331a1fff5f47db0e2e"/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "opam depext -i dune"))# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian# No extra OS packages requirements found.# All required OS packages found.# Now letting opam install the packagesThe following actions will be performed:- install dune 3.19.1<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>[dune.3.19.1] found in cache<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>-> installed dune.3.19.1Done.# Run eval $(opam env) to update the current shell environment2025-08-21 17:13.52 ---> saved as "da0888a20a067de19f6183f0b497dcc2d1ea7f7036861cc50f633c662efdce4f"/: (workdir /src)/src: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))(network host)(shell "opam depext -i ocamlformat=0.27.0"))# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian# No extra OS packages requirements found.# All required OS packages found.# Now letting opam install the packagesThe following actions will be performed:- install sexplib0 v0.14.0 [required by base]- install menhirLib 20240715 [required by ocamlformat-lib]- install menhirCST 20240715 [required by menhir]- install ocamlbuild 0.16.1 [required by fpath, astring, uuseg]- install cmdliner 1.3.0 [required by ocamlformat]- install menhirSdk 20240715 [required by ocamlformat-lib]- install either 1.0.0 [required by ocamlformat-lib]- install csexp 1.5.2 [required by ocamlformat]- install dune-build-info 3.19.1 [required by ocamlformat-lib]- install camlp-streams 5.0.1 [required by ocamlformat-lib]- install seq base [required by re]- install fix 20250428 [required by ocamlformat-lib]- install ocaml-version 4.0.1 [required by ocamlformat-lib]- install ocamlfind 1.9.8 [required by ocp-indent, astring, fpath, uuseg]- install menhir 20240715 [required by ocamlformat-lib]- install dune-configurator 3.19.1 [required by base]- install re 1.11.0 [required by ocamlformat]- install topkg 1.1.0 [required by fpath, astring, uuseg]- install base-bytes base [required by ocp-indent]- install base v0.14.3 [required by ocamlformat-lib]- install uutf 1.0.4 [required by ocamlformat-lib]- install astring 0.8.5 [required by ocamlformat-lib]- install ocp-indent 1.8.1 [required by ocamlformat-lib]- install stdio v0.14.0 [required by ocamlformat-lib]- install uucp 15.0.0 [required by uuseg]- install fpath 0.7.3 [required by ocamlformat-lib]- install uuseg 15.0.0 [required by ocamlformat-lib]- install ocamlformat-lib 0.27.0 [required by ocamlformat]- install ocamlformat 0.27.0===== 29 to install =====<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>[astring.0.8.5] found in cache[base.v0.14.3] found in cache[camlp-streams.5.0.1] found in cache[cmdliner.1.3.0] found in cache[csexp.1.5.2] found in cache[dune-build-info.3.19.1] found in cache[dune-configurator.3.19.1] found in cache[either.1.0.0] found in cache[fix.20250428] found in cache[fpath.0.7.3] found in cache[menhir.20240715] found in cache[menhirCST.20240715] found in cache[menhirLib.20240715] found in cache[menhirSdk.20240715] found in cache[ocaml-version.4.0.1] found in cache[ocamlbuild.0.16.1] found in cache[ocamlfind.1.9.8] found in cache[ocamlformat.0.27.0] found in cache[ocamlformat-lib.0.27.0] found in cache[ocp-indent.1.8.1] found in cache[re.1.11.0] found in cache[sexplib0.v0.14.0] found in cache[stdio.v0.14.0] found in cache[topkg.1.1.0] found in cache[uucp.15.0.0] found in cache[uuseg.15.0.0] found in cache[uutf.1.0.4] found in cache<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>-> installed seq.base-> installed camlp-streams.5.0.1-> installed csexp.1.5.2-> installed either.1.0.0-> installed fix.20250428-> installed cmdliner.1.3.0-> installed menhirCST.20240715-> installed menhirLib.20240715-> installed menhirSdk.20240715-> installed ocaml-version.4.0.1-> installed re.1.11.0-> installed sexplib0.v0.14.0-> installed dune-build-info.3.19.1-> installed dune-configurator.3.19.1-> installed ocamlfind.1.9.8-> installed base-bytes.base-> installed ocp-indent.1.8.1-> installed ocamlbuild.0.16.1-> installed base.v0.14.3-> installed topkg.1.1.0-> installed stdio.v0.14.0-> installed uutf.1.0.4-> installed astring.0.8.5-> installed fpath.0.7.3-> installed menhir.20240715-> installed uucp.15.0.0-> installed uuseg.15.0.0-> installed ocamlformat-lib.0.27.0-> installed ocamlformat.0.27.0Done.<><> ocp-indent.1.8.1 installed successfully ><><><><><><><><><><><><><><><><><>=> This package requires additional configuration for use in editors. Install package 'user-setup', or manually:* for Emacs, add these lines to ~/.emacs:(add-to-list 'load-path "/home/opam/.opam/4.08/share/emacs/site-lisp")(require 'ocp-indent)* for Vim, add this line to ~/.vimrc:set rtp^="/home/opam/.opam/4.08/share/ocp-indent/vim"# Run eval $(opam env) to update the current shell environment2025-08-21 17:15.20 ---> saved as "86ec8dcb8046a1e5dacfb1841e8c026d30cfead67649bcb6d6a8f9ddd6fb153d"/src: (copy (src .) (dst /src/))2025-08-21 17:15.20 ---> saved as "c29a78c959e4fe2bc10dec03cfdc8e6c016c503d747e486a780452ac7eff54c5"/src: (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)"))File "arrayjit/lib/dune", line 7, characters 30-51:7 | (libraries base stdio pprint ppx_minidebug.runtime)^^^^^^^^^^^^^^^^^^^^^Error: Library "ppx_minidebug.runtime" not found.-> required by library "arrayjit.utils" in _build/default/arrayjit/lib-> required by executable ocannl_read_config in test/operations/config/dune:5-> required by _build/default/test/operations/config/ocannl_read_config.exe-> required by %{dep:ocannl_read_config.exe} at test/operations/dune:22-> required by _build/default/test/operations/config/ocannl_backend.txt-> required by %{read:config/ocannl_backend.txt} at test/operations/dune:37-> required by Computing directory contents of _build/default/test/operationsFile "arrayjit/lib/builtins_cuda.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/builtins_cuda.ml b/_build/default/arrayjit/lib/.formatted/builtins_cuda.mlindex 4dd3451..7a265ed 100644--- a/_build/default/arrayjit/lib/builtins_cuda.ml+++ b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml@@ -1,103 +1,119 @@(* CUDA builtin code split into (key, definition, dependencies) triples for filtering *)-let builtins = [- ("uint4x32_t", {|typedef struct {+let builtins =+ [+ ("uint4x32_t", {|typedef struct {unsigned int v[4];} uint4x32_t;|}, []);-- ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);- ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);- ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);- ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);- ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);- ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);- ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);- ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);-- ("uint32_to_single_uniform", {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {+ ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);+ ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);+ ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);+ ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);+ ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);+ ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);+ ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);+ ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);+ ( "uint32_to_single_uniform",+ {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {/* Use __uint2float_rn for correct rounding */return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f);-}|}, []);-- ("uint32_to_double_uniform", {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {+}|},+ [] );+ ( "uint32_to_double_uniform",+ {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {return __uint2double_rn(x) * (1.0 / 4294967296.0);-}|}, []);-- ("uint4x32_to_single_uniform", {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {+}|},+ [] );+ ( "uint4x32_to_single_uniform",+ {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {return uint32_to_single_uniform(x.v[0]);-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_double_uniform", {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_double_uniform",+ {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {unsigned long long combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0);-}|}, ["uint4x32_t"]);-- ("uint4x32_to_int32_uniform", {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_int32_uniform",+ {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {return (int)x.v[0];-}|}, ["uint4x32_t"]);-- ("uint4x32_to_i64_uniform", {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_i64_uniform",+ {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {return __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));-}|}, ["uint4x32_t"]);-- ("uint4x32_to_u32_uniform", {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_u32_uniform",+ {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {return x.v[0];-}|}, ["uint4x32_t"]);-- ("uint4x32_to_u64_uniform", {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_u64_uniform",+ {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {return (unsigned long long)__double_as_longlong(__hiloint2double(x.v[1], x.v[0]));-}|}, ["uint4x32_t"]);-- ("uint4x32_to_i8_uniform", {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_i8_uniform",+ {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {return (signed char)(x.v[0] & 0xFF);-}|}, ["uint4x32_t"]);-- ("uint4x32_to_u8_uniform", {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_u8_uniform",+ {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {return (unsigned char)(x.v[0] & 0xFF);-}|}, ["uint4x32_t"]);-- ("uint4x32_to_bfloat16_uniform", {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_bfloat16_uniform",+ {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {float f = uint32_to_single_uniform(x.v[0]);return (unsigned short)(__float_as_uint(f) >> 16);-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_half_uniform", {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_half_uniform",+ {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {float f = uint32_to_single_uniform(x.v[0]);return __float2half(f);-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_single_uniform_vec", {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_single_uniform_vec",+ {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {float4_t result;#pragma unrollfor (int i = 0; i < 4; i++) {result.v[i] = uint32_to_single_uniform(x.v[i]);}return result;-}|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_double_uniform_vec", {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_double_uniform_vec",+ {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {double2_t result;result.v[0] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[1], x.v[0]))) * (1.0 / 18446744073709551616.0);result.v[1] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[3], x.v[2]))) * (1.0 / 18446744073709551616.0);return result;-}|}, ["uint4x32_t"; "double2_t"]);-- ("uint4x32_to_int32_uniform_vec", {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "double2_t" ] );+ ( "uint4x32_to_int32_uniform_vec",+ {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {int32x4_t result;#pragma unrollfor (int i = 0; i < 4; i++) {result.v[i] = (int)x.v[i];}return result;-}|}, ["uint4x32_t"; "int32x4_t"]);-- ("uint4x32_to_i64_uniform_vec", {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "int32x4_t" ] );+ ( "uint4x32_to_i64_uniform_vec",+ {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {int64x2_t result;result.v[0] = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));result.v[1] = __double_as_longlong(__hiloint2double(x.v[3], x.v[2]));return result;-}|}, ["uint4x32_t"; "int64x2_t"]);-- ("uint4x32_to_i8_uniform_vec", {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "int64x2_t" ] );+ ( "uint4x32_to_i8_uniform_vec",+ {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {int8x16_t result;#pragma unrollfor (int i = 0; i < 4; i++) {@@ -107,9 +123,10 @@ let builtins = [result.v[i*4 + 3] = (signed char)((x.v[i] >> 24) & 0xFF);}return result;-}|}, ["uint4x32_t"; "int8x16_t"]);-- ("uint4x32_to_u16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "int8x16_t" ] );+ ( "uint4x32_to_u16_uniform_vec",+ {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {uint16x8_t result;#pragma unrollfor (int i = 0; i < 4; i++) {@@ -117,9 +134,10 @@ let builtins = [result.v[i*2 + 1] = (unsigned short)((x.v[i] >> 16) & 0xFFFF);}return result;-}|}, ["uint4x32_t"; "uint16x8_t"]);-- ("uint4x32_to_bfloat16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "uint16x8_t" ] );+ ( "uint4x32_to_bfloat16_uniform_vec",+ {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {uint16x8_t result;#pragma unrollfor (int i = 0; i < 4; i++) {@@ -130,9 +148,10 @@ let builtins = [result.v[i*2 + 1] = (unsigned short)(__float_as_uint(f2) >> 16);}return result;-}|}, ["uint4x32_t"; "uint16x8_t"]);-- ("uint4x32_to_half_uniform_vec", {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "uint16x8_t" ] );+ ( "uint4x32_to_half_uniform_vec",+ {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {half8_t result;#pragma unrollfor (int i = 0; i < 4; i++) {@@ -142,9 +161,10 @@ let builtins = [result.v[i*2 + 1] = __float2half(f2);}return result;-}|}, ["uint4x32_t"; "half8_t"]);-- ("uint4x32_to_u8_uniform_vec", {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {+}|},+ [ "uint4x32_t"; "half8_t" ] );+ ( "uint4x32_to_u8_uniform_vec",+ {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {uint8x16_t result;#pragma unrollfor (int i = 0; i < 4; i++) {@@ -154,70 +174,81 @@ let builtins = [result.v[i*4 + 3] = (unsigned char)((x.v[i] >> 24) & 0xFF);}return result;-}|}, ["uint4x32_t"; "uint8x16_t"]);-- ("single_to_uint4x32", {|__device__ uint4x32_t single_to_uint4x32(float x) {+}|},+ [ "uint4x32_t"; "uint8x16_t" ] );+ ( "single_to_uint4x32",+ {|__device__ uint4x32_t single_to_uint4x32(float x) {unsigned int bits = __float_as_uint(x);uint4x32_t result = {{bits, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("double_to_uint4x32", {|__device__ uint4x32_t double_to_uint4x32(double x) {+}|},+ [ "uint4x32_t" ] );+ ( "double_to_uint4x32",+ {|__device__ uint4x32_t double_to_uint4x32(double x) {unsigned long long bits = __double_as_longlong(x);uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("int32_to_uint4x32", {|__device__ uint4x32_t int32_to_uint4x32(int x) {+}|},+ [ "uint4x32_t" ] );+ ( "int32_to_uint4x32",+ {|__device__ uint4x32_t int32_to_uint4x32(int x) {uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("int64_to_uint4x32", {|__device__ uint4x32_t int64_to_uint4x32(long long x) {+}|},+ [ "uint4x32_t" ] );+ ( "int64_to_uint4x32",+ {|__device__ uint4x32_t int64_to_uint4x32(long long x) {unsigned long long bits = (unsigned long long)x;uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("uint32_to_uint4x32", {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint32_to_uint4x32",+ {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {uint4x32_t result = {{x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("uint64_to_uint4x32", {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint64_to_uint4x32",+ {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {uint4x32_t result = {{(unsigned int)(x & 0xFFFFFFFF), (unsigned int)(x >> 32), 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("byte_to_uint4x32", {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {+}|},+ [ "uint4x32_t" ] );+ ( "byte_to_uint4x32",+ {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("uint16_to_uint4x32", {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {+}|},+ [ "uint4x32_t" ] );+ ( "uint16_to_uint4x32",+ {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("bfloat16_to_uint4x32", {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {+}|},+ [ "uint4x32_t" ] );+ ( "bfloat16_to_uint4x32",+ {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("half_to_uint4x32", {|__device__ uint4x32_t half_to_uint4x32(__half x) {+}|},+ [ "uint4x32_t" ] );+ ( "half_to_uint4x32",+ {|__device__ uint4x32_t half_to_uint4x32(__half x) {unsigned short bits = __half_as_ushort(x);uint4x32_t result = {{(unsigned int)bits, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("fp8_to_uint4x32", {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {+}|},+ [ "uint4x32_t" ] );+ ( "fp8_to_uint4x32",+ {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};return result;-}|}, ["uint4x32_t"]);-- ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);-- ("THREEFRY_ROTATION", {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {+}|},+ [ "uint4x32_t" ] );+ ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);+ ( "THREEFRY_ROTATION",+ {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {{13, 15, 26, 6},{17, 29, 16, 24},{13, 15, 26, 6},@@ -226,13 +257,15 @@ let builtins = [{17, 29, 16, 24},{13, 15, 26, 6},{17, 29, 16, 24}-};|}, []);-- ("rotl32", {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {+};|},+ [] );+ ( "rotl32",+ {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {return __funnelshift_l(x, x, n);-}|}, []);-- ("threefry_round", {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {+}|},+ [] );+ ( "threefry_round",+ {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;@@ -246,9 +279,10 @@ let builtins = [tmp = x.y;x.y = x.w;x.w = tmp;-}|}, ["rotl32"]);-- ("arrayjit_threefry4x32", {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {+}|},+ [ "rotl32" ] );+ ( "arrayjit_threefry4x32",+ {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]);uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]);@@ -310,5 +344,6 @@ let builtins = [result.v[2] = x.z;result.v[3] = x.w;return result;-}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]);-]\ No newline at end of file+}|},+ [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] );+ ]File "arrayjit/lib/builtins_cc.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/builtins_cc.ml b/_build/default/arrayjit/lib/.formatted/builtins_cc.mlindex d4d6f9c..a66ed26 100644--- a/_build/default/arrayjit/lib/builtins_cc.ml+++ b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml@@ -1,4 +1,5 @@-let includes = {|+let includes =+ {|#include <stdio.h>#include <math.h>#include <stdint.h>@@ -9,74 +10,84 @@ let includes = {||}(* Each entry is (key, definition, dependencies) *)-let builtins = [- (* Float16 feature detection and type definitions *)- ("HAS_NATIVE_FLOAT16", {|+let builtins =+ [+ (* Float16 feature detection and type definitions *)+ ( "HAS_NATIVE_FLOAT16",+ {|#ifdef __FLT16_MAX__#define HAS_NATIVE_FLOAT16 1#else#define HAS_NATIVE_FLOAT16 0#endif-|}, []);-- ("HALF_T", {|+|},+ [] );+ ( "HALF_T",+ {|#if HAS_NATIVE_FLOAT16#define HALF_T _Float16#else#define HALF_T uint16_t#endif-|}, ["HAS_NATIVE_FLOAT16"]);-- ("HALF_TO_FP", {|+|},+ [ "HAS_NATIVE_FLOAT16" ] );+ ( "HALF_TO_FP",+ {|#if HAS_NATIVE_FLOAT16#define HALF_TO_FP(x) (x) /* Identity - already in floating point */#else#define HALF_TO_FP(x) half_to_float_emulated(x) /* Convert to float for computation */#endif-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);-- ("FP_TO_HALF", {|+|},+ [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );+ ( "FP_TO_HALF",+ {|#if HAS_NATIVE_FLOAT16#define FP_TO_HALF(x) (x) /* Identity - already half precision */#else#define FP_TO_HALF(x) float_to_half_emulated(x) /* Convert back from float */#endif-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);-- ("HALF_TO_FLOAT", {|+|},+ [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );+ ( "HALF_TO_FLOAT",+ {|#if HAS_NATIVE_FLOAT16#define HALF_TO_FLOAT(x) ((float)(x))#else#define HALF_TO_FLOAT(x) half_to_float_emulated(x)#endif-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);-- ("FLOAT_TO_HALF", {|+|},+ [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );+ ( "FLOAT_TO_HALF",+ {|#if HAS_NATIVE_FLOAT16#define FLOAT_TO_HALF(x) ((_Float16)(x))#else#define FLOAT_TO_HALF(x) float_to_half_emulated(x)#endif-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);-- ("HALF_TO_UINT16", {|+|},+ [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );+ ( "HALF_TO_UINT16",+ {|#if HAS_NATIVE_FLOAT16#define HALF_TO_UINT16(x) ({ _Float16 _h = (x); uint16_t _r; memcpy(&_r, &_h, 2); _r; })#else#define HALF_TO_UINT16(x) (x)#endif-|}, ["HAS_NATIVE_FLOAT16"]);-- ("UINT16_TO_HALF", {|+|},+ [ "HAS_NATIVE_FLOAT16" ] );+ ( "UINT16_TO_HALF",+ {|#if HAS_NATIVE_FLOAT16#define UINT16_TO_HALF(x) ({ uint16_t _u = (x); _Float16 _h; memcpy(&_h, &_u, 2); _h; })#else#define UINT16_TO_HALF(x) (x)#endif-|}, ["HAS_NATIVE_FLOAT16"]);-- (* Float16 emulation functions *)- ("half_to_float_emulated", {|+|},+ [ "HAS_NATIVE_FLOAT16" ] );+ (* Float16 emulation functions *)+ ( "half_to_float_emulated",+ {|#if !HAS_NATIVE_FLOAT16/* Convert IEEE 754 half precision (stored as uint16_t) to float */static float half_to_float_emulated(uint16_t h) {@@ -108,9 +119,10 @@ static float half_to_float_emulated(uint16_t h) {}}#endif-|}, ["HAS_NATIVE_FLOAT16"]);-- ("float_to_half_emulated", {|+|},+ [ "HAS_NATIVE_FLOAT16" ] );+ ( "float_to_half_emulated",+ {|#if !HAS_NATIVE_FLOAT16/* Convert float to IEEE 754 half precision (stored as uint16_t) */static uint16_t float_to_half_emulated(float f) {@@ -197,16 +209,16 @@ static uint16_t float_to_half_emulated(float f) {}}#endif-|}, ["HAS_NATIVE_FLOAT16"]);-- (* Threefry4x32 types and complete implementation *)- ("uint4x32_t", {|+|},+ [ "HAS_NATIVE_FLOAT16" ] );+ (* Threefry4x32 types and complete implementation *)+ ("uint4x32_t", {|typedef struct {uint32_t v[4];} uint4x32_t;|}, []);-- ("arrayjit_threefry4x32", {|+ ( "arrayjit_threefry4x32",+ {|/* Threefry4x32 constants */const uint32_t THREEFRY_C240 = 0x1BD11BDA;@@ -329,75 +341,72 @@ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {result.v[3] = x[3];return result;}-|}, ["uint4x32_t"]);-- (* Vector types with half precision *)- ("half8_t", {|+|},+ [ "uint4x32_t" ] );+ (* Vector types with half precision *)+ ("half8_t", {|typedef struct { HALF_T v[8]; } half8_t;-|}, ["HALF_T"]);-- ("float4_t", {|+|}, [ "HALF_T" ]);+ ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);-- ("double2_t", {|+ ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);-- ("int32x4_t", {|+ ("int32x4_t", {|typedef struct { int32_t v[4]; } int32x4_t;|}, []);-- ("int64x2_t", {|+ ("int64x2_t", {|typedef struct { int64_t v[2]; } int64x2_t;|}, []);-- ("int8x16_t", {|+ ("int8x16_t", {|typedef struct { int8_t v[16]; } int8x16_t;|}, []);-- ("uint16x8_t", {|+ ("uint16x8_t", {|typedef struct { uint16_t v[8]; } uint16x8_t;|}, []);-- ("uint8x16_t", {|+ ("uint8x16_t", {|typedef struct { uint8_t v[16]; } uint8x16_t;|}, []);-- (* Basic conversion functions *)- ("uint32_to_single_uniform", {|+ (* Basic conversion functions *)+ ( "uint32_to_single_uniform",+ {|/* Convert to float in [0, 1) */float uint32_to_single_uniform(uint32_t x) {/* Use upper 24 bits for float mantissa (23 bits + implicit 1) */return (x >> 8) * (1.0f / 16777216.0f);}-|}, []);-- ("uint32_to_double_uniform", {|+|},+ [] );+ ( "uint32_to_double_uniform",+ {|/* Convert to double in [0, 1) */double uint32_to_double_uniform(uint32_t x) {return x * (1.0 / 4294967296.0);}-|}, []);-- (* Conversion functions with dependencies *)- ("uint4x32_to_single_uniform", {|+|},+ [] );+ (* Conversion functions with dependencies *)+ ( "uint4x32_to_single_uniform",+ {|/* Uint4x32 to float32 uniform - uses first 32 bits */float uint4x32_to_single_uniform(uint4x32_t x) {return uint32_to_single_uniform(x.v[0]);}-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_half_uniform", {|+|},+ [ "uint4x32_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_half_uniform",+ {|/* Uint4x32 to float16 uniform - uses first 16 bits */uint16_t uint4x32_to_half_uniform(uint4x32_t x) {/* Convert through float for consistent behavior */float f = (x.v[0] & 0xFFFF) * (1.0f / 65536.0f);return FLOAT_TO_HALF(f);}-|}, ["uint4x32_t"; "FLOAT_TO_HALF"]);-- ("uint4x32_to_half_uniform_vec", {|+|},+ [ "uint4x32_t"; "FLOAT_TO_HALF" ] );+ ( "uint4x32_to_half_uniform_vec",+ {|/* Convert uint4x32 to 8 float16s uniform */half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {half8_t result;@@ -412,10 +421,11 @@ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "half8_t"; "FLOAT_TO_HALF"]);-- (* Pure C conversion functions *)- ("bfloat16_to_single", {|+|},+ [ "uint4x32_t"; "half8_t"; "FLOAT_TO_HALF" ] );+ (* Pure C conversion functions *)+ ( "bfloat16_to_single",+ {|/* BFloat16 to Float conversion (C function) */float bfloat16_to_single(uint16_t bf16){@@ -424,9 +434,10 @@ float bfloat16_to_single(uint16_t bf16)uint32_t f32 = ((uint32_t)bf16) << 16;return *((float *)&f32);}-|}, []);-- ("single_to_bfloat16", {|+|},+ [] );+ ( "single_to_bfloat16",+ {|/* Float to BFloat16 conversion (C function) */uint16_t single_to_bfloat16(float f){@@ -436,27 +447,30 @@ uint16_t single_to_bfloat16(float f)uint32_t rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);return (uint16_t)(rounded >> 16);}-|}, []);-- ("half_to_single", {|+|},+ [] );+ ( "half_to_single",+ {|/* Half (Float16) to Float conversion (C function) */float half_to_single(uint16_t h){HALF_T half_val = UINT16_TO_HALF(h);return HALF_TO_FLOAT(half_val);}-|}, ["HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT"]);-- ("single_to_half", {|+|},+ [ "HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT" ] );+ ( "single_to_half",+ {|/* Float to Half (Float16) conversion (C function) */uint16_t single_to_half(float f){HALF_T half_val = FLOAT_TO_HALF(f);return HALF_TO_UINT16(half_val);}-|}, ["HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16"]);-- ("fp8_to_single", {|+|},+ [ "HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16" ] );+ ( "fp8_to_single",+ {|/* FP8 E5M2 format to Float conversion (C function)Format: 1 sign bit, 5 exponent bits, 2 mantissa bits */float fp8_to_single(uint8_t fp8)@@ -500,9 +514,10 @@ float fp8_to_single(uint8_t fp8)return result;}-|}, []);-- ("single_to_fp8", {|+|},+ [] );+ ( "single_to_fp8",+ {|/* Float to FP8 E5M2 conversion (C function) */uint8_t single_to_fp8(float f){@@ -560,143 +575,162 @@ uint8_t single_to_fp8(float f)return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));}-|}, []);-- (* Conversion functions from various precisions to uint4x32_t *)- ("int32_to_uint4x32", {|+|},+ [] );+ (* Conversion functions from various precisions to uint4x32_t *)+ ( "int32_to_uint4x32",+ {|uint4x32_t int32_to_uint4x32(int32_t x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("int64_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "int64_to_uint4x32",+ {|uint4x32_t int64_to_uint4x32(int64_t x) {uint64_t bits = (uint64_t)x;uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("uint32_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "uint32_to_uint4x32",+ {|uint4x32_t uint32_to_uint4x32(uint32_t x) {uint4x32_t result = {{x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("uint64_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "uint64_to_uint4x32",+ {|uint4x32_t uint64_to_uint4x32(uint64_t x) {uint4x32_t result = {{(uint32_t)(x & 0xFFFFFFFF), (uint32_t)(x >> 32), 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("single_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "single_to_uint4x32",+ {|uint4x32_t single_to_uint4x32(float x) {uint32_t bits;memcpy(&bits, &x, sizeof(float));uint4x32_t result = {{bits, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("double_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "double_to_uint4x32",+ {|uint4x32_t double_to_uint4x32(double x) {uint64_t bits;memcpy(&bits, &x, sizeof(double));uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("byte_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "byte_to_uint4x32",+ {|uint4x32_t byte_to_uint4x32(unsigned char x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("uint16_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "uint16_to_uint4x32",+ {|uint4x32_t uint16_to_uint4x32(uint16_t x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("bfloat16_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "bfloat16_to_uint4x32",+ {|uint4x32_t bfloat16_to_uint4x32(uint16_t x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("half_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "half_to_uint4x32",+ {|uint4x32_t half_to_uint4x32(uint16_t x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- ("fp8_to_uint4x32", {|+|},+ [ "uint4x32_t" ] );+ ( "fp8_to_uint4x32",+ {|uint4x32_t fp8_to_uint4x32(uint8_t x) {uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};return result;}-|}, ["uint4x32_t"]);-- (* More uint4x32 to various precision conversion functions *)- ("uint4x32_to_double_uniform", {|+|},+ [ "uint4x32_t" ] );+ (* More uint4x32 to various precision conversion functions *)+ ( "uint4x32_to_double_uniform",+ {|/* Uint4x32 to float64 uniform - uses first 64 bits */double uint4x32_to_double_uniform(uint4x32_t x) {uint64_t combined = ((uint64_t)x.v[1] << 32) | x.v[0];return combined * (1.0 / 18446744073709551616.0);}-|}, ["uint4x32_t"]);-- ("uint4x32_to_int32_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_int32_uniform",+ {|/* Uint4x32 to int32 uniform - full range */int32_t uint4x32_to_int32_uniform(uint4x32_t x) {return (int32_t)x.v[0];}-|}, ["uint4x32_t"]);-- ("uint4x32_to_int64_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_int64_uniform",+ {|/* Uint4x32 to int64 uniform - full range */int64_t uint4x32_to_int64_uniform(uint4x32_t x) {return (int64_t)(((uint64_t)x.v[1] << 32) | x.v[0]);}-|}, ["uint4x32_t"]);-- ("uint4x32_to_uint32_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_uint32_uniform",+ {|/* Uint4x32 to uint32 uniform - full range */uint32_t uint4x32_to_uint32_uniform(uint4x32_t x) {return x.v[0];}-|}, ["uint4x32_t"]);-- ("uint4x32_to_uint64_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_uint64_uniform",+ {|/* Uint4x32 to uint64 uniform - full range */uint64_t uint4x32_to_uint64_uniform(uint4x32_t x) {return ((uint64_t)x.v[1] << 32) | x.v[0];}-|}, ["uint4x32_t"]);-- ("uint4x32_to_byte_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_byte_uniform",+ {|/* Uint4x32 to int8 uniform - full range */int8_t uint4x32_to_byte_uniform(uint4x32_t x) {return (int8_t)(x.v[0] & 0xFF);}-|}, ["uint4x32_t"]);-- ("uint4x32_to_uint16_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_uint16_uniform",+ {|/* Uint4x32 to uint16 uniform - full range */uint16_t uint4x32_to_uint16_uniform(uint4x32_t x) {return (uint16_t)(x.v[0] & 0xFFFF);}-|}, ["uint4x32_t"]);-- ("uint4x32_to_bfloat16_uniform", {|+|},+ [ "uint4x32_t" ] );+ ( "uint4x32_to_bfloat16_uniform",+ {|/* Uint4x32 to bfloat16 uniform - uses first 16 bits */uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {/* Convert to float first, then to bfloat16 */@@ -708,17 +742,19 @@ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {if ((bits & 0x8000) && ((bits & 0x7FFF) || (bf & 1))) bf++;return bf;}-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_fp8_uniform", {|+|},+ [ "uint4x32_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_fp8_uniform",+ {|/* Uint4x32 to fp8 uniform - uses first 8 bits */uint8_t uint4x32_to_fp8_uniform(uint4x32_t x) {return (uint8_t)(x.v[0] & 0xFF);}-|}, ["uint4x32_t"]);-- (* Vectorized conversion functions *)- ("uint4x32_to_single_uniform_vec", {|+|},+ [ "uint4x32_t" ] );+ (* Vectorized conversion functions *)+ ( "uint4x32_to_single_uniform_vec",+ {|/* Convert uint4x32 to 4 floats in [0, 1) */float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {float4_t result;@@ -727,9 +763,10 @@ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_double_uniform_vec", {|+|},+ [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_double_uniform_vec",+ {|/* Convert uint4x32 to 2 doubles in [0, 1) */double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {double2_t result;@@ -739,9 +776,10 @@ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {result.v[1] = combined2 * (1.0 / 18446744073709551616.0);return result;}-|}, ["uint4x32_t"; "double2_t"]);-- ("uint4x32_to_int32_uniform_vec", {|+|},+ [ "uint4x32_t"; "double2_t" ] );+ ( "uint4x32_to_int32_uniform_vec",+ {|/* Convert uint4x32 to 4 int32s - full range */int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {int32x4_t result;@@ -750,9 +788,10 @@ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "int32x4_t"]);-- ("uint4x32_to_int64_uniform_vec", {|+|},+ [ "uint4x32_t"; "int32x4_t" ] );+ ( "uint4x32_to_int64_uniform_vec",+ {|/* Convert uint4x32 to 2 int64s - full range */int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {int64x2_t result;@@ -760,9 +799,10 @@ int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {result.v[1] = (int64_t)(((uint64_t)x.v[3] << 32) | x.v[2]);return result;}-|}, ["uint4x32_t"; "int64x2_t"]);-- ("uint4x32_to_byte_uniform_vec", {|+|},+ [ "uint4x32_t"; "int64x2_t" ] );+ ( "uint4x32_to_byte_uniform_vec",+ {|/* Convert uint4x32 to 16 int8s - full range */int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {int8x16_t result;@@ -774,9 +814,10 @@ int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "int8x16_t"]);-- ("uint4x32_to_uint16_uniform_vec", {|+|},+ [ "uint4x32_t"; "int8x16_t" ] );+ ( "uint4x32_to_uint16_uniform_vec",+ {|/* Convert uint4x32 to 8 uint16s - full range */uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {uint16x8_t result;@@ -786,9 +827,10 @@ uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "uint16x8_t"]);-- ("uint4x32_to_bfloat16_uniform_vec", {|+|},+ [ "uint4x32_t"; "uint16x8_t" ] );+ ( "uint4x32_to_bfloat16_uniform_vec",+ {|/* Convert uint4x32 to 8 bfloat16s uniform */uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {uint16x8_t result;@@ -810,9 +852,10 @@ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "uint16x8_t"]);-- ("uint4x32_to_fp8_uniform_vec", {|+|},+ [ "uint4x32_t"; "uint16x8_t" ] );+ ( "uint4x32_to_fp8_uniform_vec",+ {|/* Convert uint4x32 to 16 fp8s uniform */uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {uint8x16_t result;@@ -824,7 +867,8 @@ uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {}return result;}-|}, ["uint4x32_t"; "uint8x16_t"]);-]+|},+ [ "uint4x32_t"; "uint8x16_t" ] );+ ]-let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)\ No newline at end of file+let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)File "arrayjit/lib/builtins_metal.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/builtins_metal.ml b/_build/default/arrayjit/lib/.formatted/builtins_metal.mlindex 7d2de15..0514d95 100644--- a/_build/default/arrayjit/lib/builtins_metal.ml+++ b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml@@ -1,24 +1,22 @@(* Metal builtin code split into (key, definition, dependencies) triples for filtering *)-let builtins = [- ("METAL_HEADERS", {|#include <metal_stdlib>+let builtins =+ [+ ("METAL_HEADERS", {|#include <metal_stdlib>using namespace metal;|}, []);-- ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);-- ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);- ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);- ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);- ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);- ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);- ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);- ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);- ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);-- ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {+ ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);+ ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);+ ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);+ ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);+ ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);+ ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);+ ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);+ ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);+ ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);+ ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {return rotate(x, n);}|}, []);-- ("threefry_round", {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {+ ( "threefry_round",+ {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;@@ -32,9 +30,10 @@ using namespace metal;|}, []);tmp = x.y;x.y = x.w;x.w = tmp;-}|}, ["rotl32"]);-- ("arrayjit_threefry4x32", {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {+}|},+ [ "rotl32" ] );+ ( "arrayjit_threefry4x32",+ {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {uint4 x = counter;uint4 k = key;@@ -119,115 +118,142 @@ using namespace metal;|}, []);x.w += 5;return x;-}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1";- "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0";- "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]);-- ("float4_t", {|struct float4_t { float4 v; };|}, []);- ("float2_t", {|struct float2_t { float2 v; };|}, []);- ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);- ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);- ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);- ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);- ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);- ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);- ("half8_t", {|struct half8_t { half v[8]; };|}, []);-- ("uint32_to_single_uniform", {|inline float uint32_to_single_uniform(uint32_t x) {+}|},+ [+ "THREEFRY_C240";+ "threefry_round";+ "THREEFRY_ROTATION_0_0";+ "THREEFRY_ROTATION_0_1";+ "THREEFRY_ROTATION_0_2";+ "THREEFRY_ROTATION_0_3";+ "THREEFRY_ROTATION_1_0";+ "THREEFRY_ROTATION_1_1";+ "THREEFRY_ROTATION_1_2";+ "THREEFRY_ROTATION_1_3";+ ] );+ ("float4_t", {|struct float4_t { float4 v; };|}, []);+ ("float2_t", {|struct float2_t { float2 v; };|}, []);+ ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);+ ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);+ ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);+ ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);+ ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);+ ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);+ ("half8_t", {|struct half8_t { half v[8]; };|}, []);+ ( "uint32_to_single_uniform",+ {|inline float uint32_to_single_uniform(uint32_t x) {return (x >> 8) * (1.0f / 16777216.0f);-}|}, []);-- ("uint4x32_to_single_uniform", {|float uint4x32_to_single_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_single_uniform",+ {|float uint4x32_to_single_uniform(uint4 x) {return uint32_to_single_uniform(x.x);-}|}, ["uint32_to_single_uniform"]);-- ("uint4x32_to_double_uniform", {|float uint4x32_to_double_uniform(uint4 x) {+}|},+ [ "uint32_to_single_uniform" ] );+ ( "uint4x32_to_double_uniform",+ {|float uint4x32_to_double_uniform(uint4 x) {/* Fallback to float precision */uint64_t combined = (uint64_t(x.y) << 32) | x.x;return float(combined) * (1.0f / 18446744073709551616.0f);-}|}, []);-- ("uint4x32_to_int32_uniform", {|int32_t uint4x32_to_int32_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_int32_uniform",+ {|int32_t uint4x32_to_int32_uniform(uint4 x) {return int32_t(x.x);-}|}, []);-- ("uint4x32_to_int64_uniform", {|int64_t uint4x32_to_int64_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_int64_uniform",+ {|int64_t uint4x32_to_int64_uniform(uint4 x) {return int64_t((uint64_t(x.y) << 32) | x.x);-}|}, []);-- ("uint4x32_to_uint32_uniform", {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_uint32_uniform",+ {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {return x.x;-}|}, []);-- ("uint4x32_to_uint64_uniform", {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_uint64_uniform",+ {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {return (uint64_t(x.y) << 32) | x.x;-}|}, []);-- ("uint4x32_to_byte_uniform", {|int8_t uint4x32_to_byte_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_byte_uniform",+ {|int8_t uint4x32_to_byte_uniform(uint4 x) {return int8_t(x.x & 0xFF);-}|}, []);-- ("uint4x32_to_uint16_uniform", {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_uint16_uniform",+ {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {return uint16_t(x.x & 0xFFFF);-}|}, []);-- ("uint4x32_to_bfloat16_uniform", {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_bfloat16_uniform",+ {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {float f = uint32_to_single_uniform(x.x);return uint16_t(as_type<uint32_t>(f) >> 16);-}|}, ["uint32_to_single_uniform"]);-- ("uint4x32_to_half_uniform", {|half uint4x32_to_half_uniform(uint4 x) {+}|},+ [ "uint32_to_single_uniform" ] );+ ( "uint4x32_to_half_uniform",+ {|half uint4x32_to_half_uniform(uint4 x) {float f = uint32_to_single_uniform(x.x);return half(f);-}|}, ["uint32_to_single_uniform"]);-- ("uint4x32_to_fp8_uniform", {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {+}|},+ [ "uint32_to_single_uniform" ] );+ ( "uint4x32_to_fp8_uniform",+ {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {return uint8_t(x.x & 0xFF);-}|}, []);-- ("uint4x32_to_single_uniform_vec", {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_single_uniform_vec",+ {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {float4_t result;result.v.x = uint32_to_single_uniform(x.x);result.v.y = uint32_to_single_uniform(x.y);result.v.z = uint32_to_single_uniform(x.z);result.v.w = uint32_to_single_uniform(x.w);return result;-}|}, ["float4_t"; "uint32_to_single_uniform"]);-- ("uint4x32_to_double_uniform_vec", {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {+}|},+ [ "float4_t"; "uint32_to_single_uniform" ] );+ ( "uint4x32_to_double_uniform_vec",+ {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {float2_t result;uint64_t combined1 = (uint64_t(x.y) << 32) | x.x;uint64_t combined2 = (uint64_t(x.w) << 32) | x.z;result.v.x = float(combined1) * (1.0f / 18446744073709551616.0f);result.v.y = float(combined2) * (1.0f / 18446744073709551616.0f);return result;-}|}, ["float2_t"]);-- ("uint4x32_to_int32_uniform_vec", {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {+}|},+ [ "float2_t" ] );+ ( "uint4x32_to_int32_uniform_vec",+ {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {int32x4_t result;result.v = int4(x);return result;-}|}, ["int32x4_t"]);-- ("uint4x32_to_int64_uniform_vec", {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {+}|},+ [ "int32x4_t" ] );+ ( "uint4x32_to_int64_uniform_vec",+ {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {int64x2_t result;result.v[0] = (int64_t(x.y) << 32) | x.x;result.v[1] = (int64_t(x.w) << 32) | x.z;return result;-}|}, ["int64x2_t"]);-- ("uint4x32_to_uint32_uniform_vec", {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {+}|},+ [ "int64x2_t" ] );+ ( "uint4x32_to_uint32_uniform_vec",+ {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {return x;-}|}, []);-- ("uint4x32_to_uint64_uniform_vec", {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {+}|},+ [] );+ ( "uint4x32_to_uint64_uniform_vec",+ {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {uint64x2_t result;result.v[0] = (uint64_t(x.y) << 32) | x.x;result.v[1] = (uint64_t(x.w) << 32) | x.z;return result;-}|}, ["uint64x2_t"]);-- ("uint4x32_to_byte_uniform_vec", {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {+}|},+ [ "uint64x2_t" ] );+ ( "uint4x32_to_byte_uniform_vec",+ {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {int8x16_t result;uint4 v = x;for (int i = 0; i < 4; i++) {@@ -238,9 +264,10 @@ using namespace metal;|}, []);result.v[i*4 + 3] = int8_t((val >> 24) & 0xFF);}return result;-}|}, ["int8x16_t"]);-- ("uint4x32_to_uint16_uniform_vec", {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {+}|},+ [ "int8x16_t" ] );+ ( "uint4x32_to_uint16_uniform_vec",+ {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {uint16x8_t result;uint4 v = x;for (int i = 0; i < 4; i++) {@@ -249,9 +276,10 @@ using namespace metal;|}, []);result.v[i*2 + 1] = uint16_t((val >> 16) & 0xFFFF);}return result;-}|}, ["uint16x8_t"]);-- ("uint4x32_to_bfloat16_uniform_vec", {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {+}|},+ [ "uint16x8_t" ] );+ ( "uint4x32_to_bfloat16_uniform_vec",+ {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {uint16x8_t result;uint4 v = x;for (int i = 0; i < 4; i++) {@@ -262,9 +290,10 @@ using namespace metal;|}, []);result.v[i*2 + 1] = uint16_t(as_type<uint32_t>(f2) >> 16);}return result;-}|}, ["uint16x8_t"]);-- ("uint4x32_to_half_uniform_vec", {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {+}|},+ [ "uint16x8_t" ] );+ ( "uint4x32_to_half_uniform_vec",+ {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {half8_t result;uint4 v = x;for (int i = 0; i < 4; i++) {@@ -275,9 +304,10 @@ using namespace metal;|}, []);result.v[i*2 + 1] = half(f2);}return result;-}|}, ["half8_t"]);-- ("uint4x32_to_fp8_uniform_vec", {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {+}|},+ [ "half8_t" ] );+ ( "uint4x32_to_fp8_uniform_vec",+ {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {uint8x16_t result;uint4 v = x;for (int i = 0; i < 4; i++) {@@ -288,53 +318,65 @@ using namespace metal;|}, []);result.v[i*4 + 3] = uint8_t((val >> 24) & 0xFF);}return result;-}|}, ["uint8x16_t"]);-- ("single_to_uint4x32", {|uint4 single_to_uint4x32(float x) {+}|},+ [ "uint8x16_t" ] );+ ( "single_to_uint4x32",+ {|uint4 single_to_uint4x32(float x) {uint32_t bits = as_type<uint32_t>(x);return uint4(bits, 0, 0, 0);-}|}, []);-- ("double_to_uint4x32", {|uint4 double_to_uint4x32(float x) {+}|},+ [] );+ ( "double_to_uint4x32",+ {|uint4 double_to_uint4x32(float x) {/* Metal doesn't have native double support, use float fallback */uint32_t bits = as_type<uint32_t>(x);return uint4(bits, 0, 0, 0);-}|}, []);-- ("int32_to_uint4x32", {|uint4 int32_to_uint4x32(int32_t x) {+}|},+ [] );+ ( "int32_to_uint4x32",+ {|uint4 int32_to_uint4x32(int32_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-- ("int64_to_uint4x32", {|uint4 int64_to_uint4x32(int64_t x) {+}|},+ [] );+ ( "int64_to_uint4x32",+ {|uint4 int64_to_uint4x32(int64_t x) {uint64_t bits = uint64_t(x);return uint4(uint32_t(bits & 0xFFFFFFFF), uint32_t(bits >> 32), 0, 0);-}|}, []);-- ("uint32_to_uint4x32", {|uint4 uint32_to_uint4x32(uint32_t x) {+}|},+ [] );+ ( "uint32_to_uint4x32",+ {|uint4 uint32_to_uint4x32(uint32_t x) {return uint4(x, 0, 0, 0);-}|}, []);-- ("uint64_to_uint4x32", {|uint4 uint64_to_uint4x32(uint64_t x) {+}|},+ [] );+ ( "uint64_to_uint4x32",+ {|uint4 uint64_to_uint4x32(uint64_t x) {return uint4(uint32_t(x & 0xFFFFFFFF), uint32_t(x >> 32), 0, 0);-}|}, []);-- ("byte_to_uint4x32", {|uint4 byte_to_uint4x32(int8_t x) {+}|},+ [] );+ ( "byte_to_uint4x32",+ {|uint4 byte_to_uint4x32(int8_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-- ("uint16_to_uint4x32", {|uint4 uint16_to_uint4x32(uint16_t x) {+}|},+ [] );+ ( "uint16_to_uint4x32",+ {|uint4 uint16_to_uint4x32(uint16_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-- ("bfloat16_to_uint4x32", {|uint4 bfloat16_to_uint4x32(uint16_t x) {+}|},+ [] );+ ( "bfloat16_to_uint4x32",+ {|uint4 bfloat16_to_uint4x32(uint16_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-- ("half_to_uint4x32", {|uint4 half_to_uint4x32(uint16_t x) {+}|},+ [] );+ ( "half_to_uint4x32",+ {|uint4 half_to_uint4x32(uint16_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-- ("fp8_to_uint4x32", {|uint4 fp8_to_uint4x32(uint8_t x) {+}|},+ [] );+ ( "fp8_to_uint4x32",+ {|uint4 fp8_to_uint4x32(uint8_t x) {return uint4(uint32_t(x), 0, 0, 0);-}|}, []);-]+}|},+ [] );+ ]File "arrayjit/lib/cc_backend.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/cc_backend.ml b/_build/default/arrayjit/lib/.formatted/cc_backend.mlindex a23dbdf..9847c1a 100644--- a/_build/default/arrayjit/lib/cc_backend.ml+++ b/_build/default/arrayjit/lib/.formatted/cc_backend.ml@@ -306,10 +306,10 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)let idx_params = Indexing.bound_symbols bindings inlet build_file = Utils.open_build_file ~base_name:name ~extension:".c" inlet params, proc_doc = Syntax.compile_proc ~name idx_params lowered in- let filtered_code = Syntax.filter_and_prepend_builtins- ~includes:Builtins_cc.includes- ~builtins:Builtins_cc.builtins- ~proc_doc in+ let filtered_code =+ Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins+ ~proc_doc+ in(* Use ribbon = 1.0 for usual code formatting, width 110 *)Out_channel.output_string build_file.oc filtered_code;build_file.finalize ();@@ -338,10 +338,10 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized optinlet all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) inlet combined_proc_doc = PPrint.separate PPrint.hardline all_proc_docs in- let filtered_code = Syntax.filter_and_prepend_builtins- ~includes:Builtins_cc.includes- ~builtins:Builtins_cc.builtins- ~proc_doc:combined_proc_doc in+ let filtered_code =+ Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins+ ~proc_doc:combined_proc_doc+ inOut_channel.output_string build_file.oc filtered_code;build_file.finalize ();let result_library = c_compile_and_load ~f_path:build_file.f_path inFile "arrayjit/lib/metal_backend.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/metal_backend.ml b/_build/default/arrayjit/lib/.formatted/metal_backend.mlindex 2b4a85b..367b1c9 100644--- a/_build/default/arrayjit/lib/metal_backend.ml+++ b/_build/default/arrayjit/lib/.formatted/metal_backend.ml@@ -655,8 +655,10 @@ end) : Ir.Backend_impl.Lowered_backend = structlet params, proc_doc = Syntax.compile_proc ~name idx_params lowered inlet metal_includes = {|#include <metal_stdlib>using namespace metal;|} in- let source = Syntax.filter_and_prepend_builtins- ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc in+ let source =+ Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins+ ~proc_doc+ in{metal_source = source;compiled_code = Array.create ~len:num_devs None;@@ -683,8 +685,10 @@ using namespace metal;|} inlet final_doc = PPrint.(separate hardline all_proc_docs) inlet metal_includes = {|#include <metal_stdlib>using namespace metal;|} in- let source = Syntax.filter_and_prepend_builtins- ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc:final_doc in+ let source =+ Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins+ ~proc_doc:final_doc+ inlet traced_stores = Array.map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store)) inlet funcs = Array.map funcs_and_docs ~f:(Option.map ~f:fst) in{File "arrayjit/lib/c_syntax.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/c_syntax.ml b/_build/default/arrayjit/lib/.formatted/c_syntax.mlindex be842b3..aeff3ae 100644--- a/_build/default/arrayjit/lib/c_syntax.ml+++ b/_build/default/arrayjit/lib/.formatted/c_syntax.ml@@ -239,13 +239,13 @@ module C_syntax (B : C_syntax_config) = structlet result_buffer = Buffer.create 4096 inBuffer.add_string result_buffer includes;Buffer.add_string result_buffer "\n";-+(* Collect all needed keys, including dependencies *)let needed_keys = ref (Set.empty (module String)) inList.iter builtins ~f:(fun (key, _, _) ->- if String.is_substring doc_string ~substring:key then- needed_keys := Set.add !needed_keys key);-+ if String.is_substring doc_string ~substring:key then+ needed_keys := Set.add !needed_keys key);+(* Add dependencies recursively *)let processed_keys = ref (Set.empty (module String)) inlet rec add_dependencies key =@@ -254,17 +254,15 @@ module C_syntax (B : C_syntax_config) = structneeded_keys := Set.add !needed_keys key;match List.find builtins ~f:(fun (k, _, _) -> String.equal k key) with| Some (_, _, deps) -> List.iter deps ~f:add_dependencies- | None -> ()- )+ | None -> ())inSet.iter !needed_keys ~f:add_dependencies;-+(* Add the builtins in order *)List.iter builtins ~f:(fun (key, definition, _) ->- if Set.mem !needed_keys key then (- Buffer.add_string result_buffer definition;- Buffer.add_string result_buffer "\n";- ));+ if Set.mem !needed_keys key then (+ Buffer.add_string result_buffer definition;+ Buffer.add_string result_buffer "\n"));Buffer.add_string result_buffer doc_string;Buffer.contents result_bufferFile "arrayjit/lib/cuda_backend.ml", line 1, characters 0-0:diff --git a/_build/default/arrayjit/lib/cuda_backend.ml b/_build/default/arrayjit/lib/.formatted/cuda_backend.mlindex 16df79a..2a59b1a 100644--- a/_build/default/arrayjit/lib/cuda_backend.ml+++ b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml@@ -764,7 +764,6 @@ end) : Ir.Backend_impl.Lowered_backend = struct^^ rparen ^^ semiend-let%diagn2_sexp compile ~name bindings ({ Low_level.traced_store; _ } as lowered) =(* TODO: The following link seems to claim it's better to expand into loops than use memset.https://stackoverflow.com/questions/23712558/how-do-i-best-initialize-a-local-memory-array-to-0 *)@@ -773,11 +772,18 @@ end) : Ir.Backend_impl.Lowered_backend = structend)) inlet idx_params = Indexing.bound_symbols bindings inlet params, proc_doc = Syntax.compile_proc ~name idx_params lowered in- let cuda_includes = {|#include <cuda_fp16.h>-#include <cuda_bf16.h>|} ^- (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in- let source = Syntax.filter_and_prepend_builtins- ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc in+ let cuda_includes =+ {|#include <cuda_fp16.h>+#include <cuda_bf16.h>|}+ ^+ if Utils.debug_log_from_routines () then+ "\n__device__ int printf (const char * format, ... );"+ else ""+ in+ let source =+ Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins+ ~proc_doc+ inlet ptx = cuda_to_ptx ~name source in{ traced_store; ptx; params; bindings; name }@@ -795,11 +801,18 @@ end) : Ir.Backend_impl.Lowered_backend = structinlet all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) inlet final_doc = PPrint.(separate hardline all_proc_docs) in- let cuda_includes = {|#include <cuda_fp16.h>-#include <cuda_bf16.h>|} ^- (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in- let source = Syntax.filter_and_prepend_builtins- ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc:final_doc in+ let cuda_includes =+ {|#include <cuda_fp16.h>+#include <cuda_bf16.h>|}+ ^+ if Utils.debug_log_from_routines () then+ "\n__device__ int printf (const char * format, ... );"+ else ""+ in+ let source =+ Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins+ ~proc_doc:final_doc+ inlet name : string =String.(dune build @fmt failed"/usr/bin/env" "bash" "-c" "opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)" failed with exit status 22025-08-21 17:15.22: Job failed: Failed: Build failed