Organisationsahrefsocannl7c81b8 ()(lint-fmt)

(lint-fmt)

Link Copied
Code Copied

Logs

2025-08-23 11:25.17: New job: test ahrefs/ocannl https://github.com/ahrefs/ocannl.git#refs/heads/master (7c81b8cd5153b938b30001f49d5582cceaaa1ec1) (linux-x86_64:(lint-fmt))
Base: ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
ocamlformat version: version 0.27.0 (from opam)


To reproduce locally:


git clone --recursive "https://github.com/ahrefs/ocannl.git" -b "master" && cd "ocannl" && git reset --hard 7c81b8cd
cat > Dockerfile <<'END-OF-DOCKERFILE'
FROM ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
USER 1000:1000
RUN cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u
RUN opam depext -i dune
WORKDIR /src
RUN opam depext -i ocamlformat=0.27.0
COPY --chown=1000:1000 . /src/
RUN opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)


END-OF-DOCKERFILE
docker build .
END-REPRO-BLOCK


2025-08-23 11:25.17: Using cache hint "ahrefs/ocannl-ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a-debian-12-4.08_opam-2.4-ocamlformat-b8021439f8c57ba6435bc2263f6596671f4f4466"
2025-08-23 11:25.17: Using OBuilder spec:
((from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)
(user (uid 1000) (gid 1000))
(run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))
(run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "opam depext -i dune"))
(workdir /src)
(run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "opam depext -i ocamlformat=0.27.0"))
(copy (src .) (dst /src/))
(run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)"))
)


2025-08-23 11:25.17: Waiting for resource in pool OCluster
2025-08-23 11:25.17: Waiting for worker…
2025-08-23 11:25.17: Got resource from pool OCluster
Building on toxis.caelum.ci.dev
All commits already cached
HEAD is now at 7c81b8cd Merge pull request #365 from ahrefs/buffer-allocation-refactor


(from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)
Unable to find image 'ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a' locally
docker.io/ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a: Pulling from ocaml/opam
2b437dab448b: Pulling fs layer
348b54b4f842: Pulling fs layer
d67c1d522a4e: Pulling fs layer
dea856d1a4d8: Pulling fs layer
b580a9d7b5b1: Pulling fs layer
b580a9d7b5b1: Waiting
dea856d1a4d8: Waiting
d67c1d522a4e: Verifying Checksum
d67c1d522a4e: Download complete
dea856d1a4d8: Download complete
b580a9d7b5b1: Verifying Checksum
b580a9d7b5b1: Download complete
2b437dab448b: Verifying Checksum
2b437dab448b: Download complete
348b54b4f842: Verifying Checksum
348b54b4f842: Download complete
2b437dab448b: Pull complete
348b54b4f842: Pull complete
d67c1d522a4e: Pull complete
dea856d1a4d8: Pull complete
b580a9d7b5b1: Pull complete
Digest: sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
Status: Downloaded newer image for ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
2025-08-23 11:26.50 ---> saved as "1d0024db739bd078f91b2384c47919652a4b72a425e3e24ce24cfd1f6debdfbc"


/: (user (uid 1000) (gid 1000))


/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))
b8021439f8 Merge pull request #28261 from kit-ty-kate/deploy-fix-25819


<><> Updating package repositories ><><><><><><><><><><><><><><><><><><><><><><>
[default] Initialised
default (at git+file:///home/opam/opam-repository):
[INFO] opam 2.1 and 2.2 include many performance and security improvements over 2.0; please consider upgrading (https://opam.ocaml.org/doc/Install.html)


Everything as up-to-date as possible (run with --verbose to show unavailable upgrades).
However, you may "opam upgrade" these packages explicitly, which will ask permission to downgrade or uninstall the conflicting packages.
Nothing to do.
# Run eval $(opam env) to update the current shell environment
2025-08-23 11:27.55 ---> saved as "76d9d96bb26da3c78200d383fd35f876d80571baf05962331a1fff5f47db0e2e"


/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "opam depext -i dune"))
# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian
# No extra OS packages requirements found.
# All required OS packages found.
# Now letting opam install the packages
The following actions will be performed:
- install dune 3.19.1


<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[dune.3.19.1] found in cache


<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-> installed dune.3.19.1
Done.
# Run eval $(opam env) to update the current shell environment
2025-08-23 11:28.23 ---> saved as "da0888a20a067de19f6183f0b497dcc2d1ea7f7036861cc50f633c662efdce4f"


/: (workdir /src)


/src: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
(network host)
(shell "opam depext -i ocamlformat=0.27.0"))
# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian
# No extra OS packages requirements found.
# All required OS packages found.
# Now letting opam install the packages
The following actions will be performed:
- install sexplib0          v0.14.0  [required by base]
- install menhirLib         20240715 [required by ocamlformat-lib]
- install menhirCST         20240715 [required by menhir]
- install ocamlbuild        0.16.1   [required by fpath, astring, uuseg]
- install cmdliner          1.3.0    [required by ocamlformat]
- install menhirSdk         20240715 [required by ocamlformat-lib]
- install either            1.0.0    [required by ocamlformat-lib]
- install csexp             1.5.2    [required by ocamlformat]
- install dune-build-info   3.19.1   [required by ocamlformat-lib]
- install camlp-streams     5.0.1    [required by ocamlformat-lib]
- install seq               base     [required by re]
- install fix               20250428 [required by ocamlformat-lib]
- install ocaml-version     4.0.1    [required by ocamlformat-lib]
- install ocamlfind         1.9.8    [required by ocp-indent, astring, fpath, uuseg]
- install menhir            20240715 [required by ocamlformat-lib]
- install dune-configurator 3.19.1   [required by base]
- install re                1.11.0   [required by ocamlformat]
- install topkg             1.1.0    [required by fpath, astring, uuseg]
- install base-bytes        base     [required by ocp-indent]
- install base              v0.14.3  [required by ocamlformat-lib]
- install uutf              1.0.4    [required by ocamlformat-lib]
- install astring           0.8.5    [required by ocamlformat-lib]
- install ocp-indent        1.8.1    [required by ocamlformat-lib]
- install stdio             v0.14.0  [required by ocamlformat-lib]
- install uucp              15.0.0   [required by uuseg]
- install fpath             0.7.3    [required by ocamlformat-lib]
- install uuseg             15.0.0   [required by ocamlformat-lib]
- install ocamlformat-lib   0.27.0   [required by ocamlformat]
- install ocamlformat       0.27.0
===== 29 to install =====


<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[astring.0.8.5] found in cache
[base.v0.14.3] found in cache
[camlp-streams.5.0.1] found in cache
[cmdliner.1.3.0] found in cache
[csexp.1.5.2] found in cache
[dune-build-info.3.19.1] found in cache
[dune-configurator.3.19.1] found in cache
[either.1.0.0] found in cache
[fix.20250428] found in cache
[fpath.0.7.3] found in cache
[menhir.20240715] found in cache
[menhirCST.20240715] found in cache
[menhirLib.20240715] found in cache
[menhirSdk.20240715] found in cache
[ocaml-version.4.0.1] found in cache
[ocamlbuild.0.16.1] found in cache
[ocamlfind.1.9.8] found in cache
[ocamlformat.0.27.0] found in cache
[ocamlformat-lib.0.27.0] found in cache
[ocp-indent.1.8.1] found in cache
[re.1.11.0] found in cache
[sexplib0.v0.14.0] found in cache
[stdio.v0.14.0] found in cache
[topkg.1.1.0] found in cache
[uucp.15.0.0] found in cache
[uuseg.15.0.0] found in cache
[uutf.1.0.4] found in cache


<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-> installed seq.base
-> installed camlp-streams.5.0.1
-> installed csexp.1.5.2
-> installed either.1.0.0
-> installed fix.20250428
-> installed cmdliner.1.3.0
-> installed menhirCST.20240715
-> installed menhirLib.20240715
-> installed menhirSdk.20240715
-> installed ocaml-version.4.0.1
-> installed re.1.11.0
-> installed sexplib0.v0.14.0
-> installed dune-build-info.3.19.1
-> installed dune-configurator.3.19.1
-> installed ocamlfind.1.9.8
-> installed base-bytes.base
-> installed ocp-indent.1.8.1
-> installed ocamlbuild.0.16.1
-> installed base.v0.14.3
-> installed stdio.v0.14.0
-> installed topkg.1.1.0
-> installed uutf.1.0.4
-> installed astring.0.8.5
-> installed fpath.0.7.3
-> installed menhir.20240715
-> installed uucp.15.0.0
-> installed uuseg.15.0.0
-> installed ocamlformat-lib.0.27.0
-> installed ocamlformat.0.27.0
Done.


<><> ocp-indent.1.8.1 installed successfully ><><><><><><><><><><><><><><><><><>
=> This package requires additional configuration for use in editors. Install package 'user-setup', or manually:


* for Emacs, add these lines to ~/.emacs:
(add-to-list 'load-path "/home/opam/.opam/4.08/share/emacs/site-lisp")
(require 'ocp-indent)


* for Vim, add this line to ~/.vimrc:
set rtp^="/home/opam/.opam/4.08/share/ocp-indent/vim"
# Run eval $(opam env) to update the current shell environment
2025-08-23 11:29.19 ---> saved as "86ec8dcb8046a1e5dacfb1841e8c026d30cfead67649bcb6d6a8f9ddd6fb153d"


/src: (copy (src .) (dst /src/))
2025-08-23 11:29.20 ---> saved as "99cbfa4483311ee4c9795793824baf21b680c9267bc3979708e89c86a46a2389"


/src: (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)"))
File "arrayjit/lib/dune", line 7, characters 30-51:
7 |  (libraries base stdio pprint ppx_minidebug.runtime)
^^^^^^^^^^^^^^^^^^^^^
Error: Library "ppx_minidebug.runtime" not found.
-> required by library "arrayjit.utils" in _build/default/arrayjit/lib
-> required by executable ocannl_read_config in test/operations/config/dune:5
-> required by _build/default/test/operations/config/ocannl_read_config.exe
-> required by %{dep:ocannl_read_config.exe} at test/operations/dune:22
-> required by _build/default/test/operations/config/ocannl_backend.txt
-> required by %{read:config/ocannl_backend.txt} at test/operations/dune:37
-> required by Computing directory contents of _build/default/test/operations
File "arrayjit/lib/no_device_backend_missing.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/no_device_backend_missing.ml b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml
index 0e04309..890e1c2 100644
--- a/_build/default/arrayjit/lib/no_device_backend_missing.ml
+++ b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml
@@ -35,12 +35,8 @@ struct
let alloc_buffer ?old_buffer:_ ~size_in_bytes:_ () =
failwith @@ "Backend " ^ Config.name ^ " missing (no device)"


-  let alloc_array _prec ~dims:_ () =
-    failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
-
-  let alloc_zeros _prec ~dims:_ () =
-    failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
-
+  let alloc_array _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
+  let alloc_zeros _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
let free_buffer = None
let get_used_memory () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"


File "arrayjit/lib/builtins_cuda.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_cuda.ml b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml
index 4dd3451..7a265ed 100644
--- a/_build/default/arrayjit/lib/builtins_cuda.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml
@@ -1,103 +1,119 @@
(* CUDA builtin code split into (key, definition, dependencies) triples for filtering *)
-let builtins = [
-  ("uint4x32_t", {|typedef struct {
+let builtins =
+  [
+    ("uint4x32_t", {|typedef struct {
unsigned int v[4];
} uint4x32_t;|}, []);
-
-  ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);
-  ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);
-  ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);
-  ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);
-  ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);
-  ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);
-  ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);
-  ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);
-
-  ("uint32_to_single_uniform", {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {
+    ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);
+    ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);
+    ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);
+    ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);
+    ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);
+    ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);
+    ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);
+    ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);
+    ( "uint32_to_single_uniform",
+      {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {
/* Use __uint2float_rn for correct rounding */
return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f);
-}|}, []);
-
-  ("uint32_to_double_uniform", {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {
+}|},
+      [] );
+    ( "uint32_to_double_uniform",
+      {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {
return __uint2double_rn(x) * (1.0 / 4294967296.0);
-}|}, []);
-
-  ("uint4x32_to_single_uniform", {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform",
+      {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
return uint32_to_single_uniform(x.v[0]);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform", {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform",
+      {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
unsigned long long combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int32_uniform", {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int32_uniform",
+      {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {
return (int)x.v[0];
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_i64_uniform", {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_i64_uniform",
+      {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {
return __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u32_uniform", {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u32_uniform",
+      {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {
return x.v[0];
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u64_uniform", {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u64_uniform",
+      {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {
return (unsigned long long)__double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_i8_uniform", {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_i8_uniform",
+      {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {
return (signed char)(x.v[0] & 0xFF);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u8_uniform", {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u8_uniform",
+      {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {
return (unsigned char)(x.v[0] & 0xFF);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_bfloat16_uniform", {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {
float f = uint32_to_single_uniform(x.v[0]);
return (unsigned short)(__float_as_uint(f) >> 16);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
float f = uint32_to_single_uniform(x.v[0]);
return __float2half(f);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_single_uniform_vec", {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_single_uniform_vec",
+      {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
float4_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
result.v[i] = uint32_to_single_uniform(x.v[i]);
}
return result;
-}|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
double2_t result;
result.v[0] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[1], x.v[0]))) * (1.0 / 18446744073709551616.0);
result.v[1] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[3], x.v[2]))) * (1.0 / 18446744073709551616.0);
return result;
-}|}, ["uint4x32_t"; "double2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "double2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
int32x4_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
result.v[i] = (int)x.v[i];
}
return result;
-}|}, ["uint4x32_t"; "int32x4_t"]);
-
-  ("uint4x32_to_i64_uniform_vec", {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int32x4_t" ] );
+    ( "uint4x32_to_i64_uniform_vec",
+      {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {
int64x2_t result;
result.v[0] = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
result.v[1] = __double_as_longlong(__hiloint2double(x.v[3], x.v[2]));
return result;
-}|}, ["uint4x32_t"; "int64x2_t"]);
-
-  ("uint4x32_to_i8_uniform_vec", {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int64x2_t" ] );
+    ( "uint4x32_to_i8_uniform_vec",
+      {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {
int8x16_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
@@ -107,9 +123,10 @@ let builtins = [
result.v[i*4 + 3] = (signed char)((x.v[i] >> 24) & 0xFF);
}
return result;
-}|}, ["uint4x32_t"; "int8x16_t"]);
-
-  ("uint4x32_to_u16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int8x16_t" ] );
+    ( "uint4x32_to_u16_uniform_vec",
+      {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {
uint16x8_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
@@ -117,9 +134,10 @@ let builtins = [
result.v[i*2 + 1] = (unsigned short)((x.v[i] >> 16) & 0xFFFF);
}
return result;
-}|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
uint16x8_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
@@ -130,9 +148,10 @@ let builtins = [
result.v[i*2 + 1] = (unsigned short)(__float_as_uint(f2) >> 16);
}
return result;
-}|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_half_uniform_vec", {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
half8_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
@@ -142,9 +161,10 @@ let builtins = [
result.v[i*2 + 1] = __float2half(f2);
}
return result;
-}|}, ["uint4x32_t"; "half8_t"]);
-
-  ("uint4x32_to_u8_uniform_vec", {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "half8_t" ] );
+    ( "uint4x32_to_u8_uniform_vec",
+      {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {
uint8x16_t result;
#pragma unroll
for (int i = 0; i < 4; i++) {
@@ -154,70 +174,81 @@ let builtins = [
result.v[i*4 + 3] = (unsigned char)((x.v[i] >> 24) & 0xFF);
}
return result;
-}|}, ["uint4x32_t"; "uint8x16_t"]);
-
-  ("single_to_uint4x32", {|__device__ uint4x32_t single_to_uint4x32(float x) {
+}|},
+      [ "uint4x32_t"; "uint8x16_t" ] );
+    ( "single_to_uint4x32",
+      {|__device__ uint4x32_t single_to_uint4x32(float x) {
unsigned int bits = __float_as_uint(x);
uint4x32_t result = {{bits, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("double_to_uint4x32", {|__device__ uint4x32_t double_to_uint4x32(double x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "double_to_uint4x32",
+      {|__device__ uint4x32_t double_to_uint4x32(double x) {
unsigned long long bits = __double_as_longlong(x);
uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("int32_to_uint4x32", {|__device__ uint4x32_t int32_to_uint4x32(int x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "int32_to_uint4x32",
+      {|__device__ uint4x32_t int32_to_uint4x32(int x) {
uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("int64_to_uint4x32", {|__device__ uint4x32_t int64_to_uint4x32(long long x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "int64_to_uint4x32",
+      {|__device__ uint4x32_t int64_to_uint4x32(long long x) {
unsigned long long bits = (unsigned long long)x;
uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint32_to_uint4x32", {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint32_to_uint4x32",
+      {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {
uint4x32_t result = {{x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint64_to_uint4x32", {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint64_to_uint4x32",
+      {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {
uint4x32_t result = {{(unsigned int)(x & 0xFFFFFFFF), (unsigned int)(x >> 32), 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("byte_to_uint4x32", {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "byte_to_uint4x32",
+      {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {
uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint16_to_uint4x32", {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint16_to_uint4x32",
+      {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {
uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("bfloat16_to_uint4x32", {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "bfloat16_to_uint4x32",
+      {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {
uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("half_to_uint4x32", {|__device__ uint4x32_t half_to_uint4x32(__half x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "half_to_uint4x32",
+      {|__device__ uint4x32_t half_to_uint4x32(__half x) {
unsigned short bits = __half_as_ushort(x);
uint4x32_t result = {{(unsigned int)bits, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("fp8_to_uint4x32", {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "fp8_to_uint4x32",
+      {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {
uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
return result;
-}|}, ["uint4x32_t"]);
-
-  ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);
-
-  ("THREEFRY_ROTATION", {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {
+}|},
+      [ "uint4x32_t" ] );
+    ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);
+    ( "THREEFRY_ROTATION",
+      {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {
{13, 15, 26, 6},
{17, 29, 16, 24},
{13, 15, 26, 6},
@@ -226,13 +257,15 @@ let builtins = [
{17, 29, 16, 24},
{13, 15, 26, 6},
{17, 29, 16, 24}
-};|}, []);
-
-  ("rotl32", {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {
+};|},
+      [] );
+    ( "rotl32",
+      {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {
return __funnelshift_l(x, x, n);
-}|}, []);
-
-  ("threefry_round", {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {
+}|},
+      [] );
+    ( "threefry_round",
+      {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {
x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;
x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;


@@ -246,9 +279,10 @@ let builtins = [
tmp = x.y;
x.y = x.w;
x.w = tmp;
-}|}, ["rotl32"]);
-
-  ("arrayjit_threefry4x32", {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
+}|},
+      [ "rotl32" ] );
+    ( "arrayjit_threefry4x32",
+      {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]);
uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]);


@@ -310,5 +344,6 @@ let builtins = [
result.v[2] = x.z;
result.v[3] = x.w;
return result;
-}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]);
-]
\ No newline at end of file
+}|},
+      [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] );
+  ]
File "arrayjit/lib/builtins_metal.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_metal.ml b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml
index 7d2de15..0514d95 100644
--- a/_build/default/arrayjit/lib/builtins_metal.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml
@@ -1,24 +1,22 @@
(* Metal builtin code split into (key, definition, dependencies) triples for filtering *)
-let builtins = [
-  ("METAL_HEADERS", {|#include <metal_stdlib>
+let builtins =
+  [
+    ("METAL_HEADERS", {|#include <metal_stdlib>
using namespace metal;|}, []);
-
-  ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);
-
-  ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);
-  ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);
-  ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);
-  ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);
-  ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);
-  ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);
-  ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);
-  ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);
-
-  ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {
+    ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);
+    ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);
+    ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);
+    ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);
+    ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);
+    ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);
+    ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);
+    ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);
+    ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);
+    ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {
return rotate(x, n);
}|}, []);
-
-  ("threefry_round", {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {
+    ( "threefry_round",
+      {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {
x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;
x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;


@@ -32,9 +30,10 @@ using namespace metal;|}, []);
tmp = x.y;
x.y = x.w;
x.w = tmp;
-}|}, ["rotl32"]);
-
-  ("arrayjit_threefry4x32", {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {
+}|},
+      [ "rotl32" ] );
+    ( "arrayjit_threefry4x32",
+      {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {
uint4 x = counter;
uint4 k = key;


@@ -119,115 +118,142 @@ using namespace metal;|}, []);
x.w += 5;


return x;
-}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1";
-      "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0";
-      "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]);
-
-  ("float4_t", {|struct float4_t { float4 v; };|}, []);
-  ("float2_t", {|struct float2_t { float2 v; };|}, []);
-  ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);
-  ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);
-  ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);
-  ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);
-  ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);
-  ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);
-  ("half8_t", {|struct half8_t { half v[8]; };|}, []);
-
-  ("uint32_to_single_uniform", {|inline float uint32_to_single_uniform(uint32_t x) {
+}|},
+      [
+        "THREEFRY_C240";
+        "threefry_round";
+        "THREEFRY_ROTATION_0_0";
+        "THREEFRY_ROTATION_0_1";
+        "THREEFRY_ROTATION_0_2";
+        "THREEFRY_ROTATION_0_3";
+        "THREEFRY_ROTATION_1_0";
+        "THREEFRY_ROTATION_1_1";
+        "THREEFRY_ROTATION_1_2";
+        "THREEFRY_ROTATION_1_3";
+      ] );
+    ("float4_t", {|struct float4_t { float4 v; };|}, []);
+    ("float2_t", {|struct float2_t { float2 v; };|}, []);
+    ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);
+    ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);
+    ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);
+    ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);
+    ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);
+    ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);
+    ("half8_t", {|struct half8_t { half v[8]; };|}, []);
+    ( "uint32_to_single_uniform",
+      {|inline float uint32_to_single_uniform(uint32_t x) {
return (x >> 8) * (1.0f / 16777216.0f);
-}|}, []);
-
-  ("uint4x32_to_single_uniform", {|float uint4x32_to_single_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform",
+      {|float uint4x32_to_single_uniform(uint4 x) {
return uint32_to_single_uniform(x.x);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform", {|float uint4x32_to_double_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform",
+      {|float uint4x32_to_double_uniform(uint4 x) {
/* Fallback to float precision */
uint64_t combined = (uint64_t(x.y) << 32) | x.x;
return float(combined) * (1.0f / 18446744073709551616.0f);
-}|}, []);
-
-  ("uint4x32_to_int32_uniform", {|int32_t uint4x32_to_int32_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_int32_uniform",
+      {|int32_t uint4x32_to_int32_uniform(uint4 x) {
return int32_t(x.x);
-}|}, []);
-
-  ("uint4x32_to_int64_uniform", {|int64_t uint4x32_to_int64_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_int64_uniform",
+      {|int64_t uint4x32_to_int64_uniform(uint4 x) {
return int64_t((uint64_t(x.y) << 32) | x.x);
-}|}, []);
-
-  ("uint4x32_to_uint32_uniform", {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint32_uniform",
+      {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {
return x.x;
-}|}, []);
-
-  ("uint4x32_to_uint64_uniform", {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint64_uniform",
+      {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {
return (uint64_t(x.y) << 32) | x.x;
-}|}, []);
-
-  ("uint4x32_to_byte_uniform", {|int8_t uint4x32_to_byte_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_byte_uniform",
+      {|int8_t uint4x32_to_byte_uniform(uint4 x) {
return int8_t(x.x & 0xFF);
-}|}, []);
-
-  ("uint4x32_to_uint16_uniform", {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint16_uniform",
+      {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {
return uint16_t(x.x & 0xFFFF);
-}|}, []);
-
-  ("uint4x32_to_bfloat16_uniform", {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {
float f = uint32_to_single_uniform(x.x);
return uint16_t(as_type<uint32_t>(f) >> 16);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|half uint4x32_to_half_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|half uint4x32_to_half_uniform(uint4 x) {
float f = uint32_to_single_uniform(x.x);
return half(f);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_fp8_uniform", {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_fp8_uniform",
+      {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {
return uint8_t(x.x & 0xFF);
-}|}, []);
-
-  ("uint4x32_to_single_uniform_vec", {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform_vec",
+      {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {
float4_t result;
result.v.x = uint32_to_single_uniform(x.x);
result.v.y = uint32_to_single_uniform(x.y);
result.v.z = uint32_to_single_uniform(x.z);
result.v.w = uint32_to_single_uniform(x.w);
return result;
-}|}, ["float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {
+}|},
+      [ "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {
float2_t result;
uint64_t combined1 = (uint64_t(x.y) << 32) | x.x;
uint64_t combined2 = (uint64_t(x.w) << 32) | x.z;
result.v.x = float(combined1) * (1.0f / 18446744073709551616.0f);
result.v.y = float(combined2) * (1.0f / 18446744073709551616.0f);
return result;
-}|}, ["float2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {
+}|},
+      [ "float2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {
int32x4_t result;
result.v = int4(x);
return result;
-}|}, ["int32x4_t"]);
-
-  ("uint4x32_to_int64_uniform_vec", {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {
+}|},
+      [ "int32x4_t" ] );
+    ( "uint4x32_to_int64_uniform_vec",
+      {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {
int64x2_t result;
result.v[0] = (int64_t(x.y) << 32) | x.x;
result.v[1] = (int64_t(x.w) << 32) | x.z;
return result;
-}|}, ["int64x2_t"]);
-
-  ("uint4x32_to_uint32_uniform_vec", {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {
+}|},
+      [ "int64x2_t" ] );
+    ( "uint4x32_to_uint32_uniform_vec",
+      {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {
return x;
-}|}, []);
-
-  ("uint4x32_to_uint64_uniform_vec", {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint64_uniform_vec",
+      {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {
uint64x2_t result;
result.v[0] = (uint64_t(x.y) << 32) | x.x;
result.v[1] = (uint64_t(x.w) << 32) | x.z;
return result;
-}|}, ["uint64x2_t"]);
-
-  ("uint4x32_to_byte_uniform_vec", {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {
+}|},
+      [ "uint64x2_t" ] );
+    ( "uint4x32_to_byte_uniform_vec",
+      {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {
int8x16_t result;
uint4 v = x;
for (int i = 0; i < 4; i++) {
@@ -238,9 +264,10 @@ using namespace metal;|}, []);
result.v[i*4 + 3] = int8_t((val >> 24) & 0xFF);
}
return result;
-}|}, ["int8x16_t"]);
-
-  ("uint4x32_to_uint16_uniform_vec", {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {
+}|},
+      [ "int8x16_t" ] );
+    ( "uint4x32_to_uint16_uniform_vec",
+      {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {
uint16x8_t result;
uint4 v = x;
for (int i = 0; i < 4; i++) {
@@ -249,9 +276,10 @@ using namespace metal;|}, []);
result.v[i*2 + 1] = uint16_t((val >> 16) & 0xFFFF);
}
return result;
-}|}, ["uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {
+}|},
+      [ "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {
uint16x8_t result;
uint4 v = x;
for (int i = 0; i < 4; i++) {
@@ -262,9 +290,10 @@ using namespace metal;|}, []);
result.v[i*2 + 1] = uint16_t(as_type<uint32_t>(f2) >> 16);
}
return result;
-}|}, ["uint16x8_t"]);
-
-  ("uint4x32_to_half_uniform_vec", {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {
+}|},
+      [ "uint16x8_t" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {
half8_t result;
uint4 v = x;
for (int i = 0; i < 4; i++) {
@@ -275,9 +304,10 @@ using namespace metal;|}, []);
result.v[i*2 + 1] = half(f2);
}
return result;
-}|}, ["half8_t"]);
-
-  ("uint4x32_to_fp8_uniform_vec", {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {
+}|},
+      [ "half8_t" ] );
+    ( "uint4x32_to_fp8_uniform_vec",
+      {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {
uint8x16_t result;
uint4 v = x;
for (int i = 0; i < 4; i++) {
@@ -288,53 +318,65 @@ using namespace metal;|}, []);
result.v[i*4 + 3] = uint8_t((val >> 24) & 0xFF);
}
return result;
-}|}, ["uint8x16_t"]);
-
-  ("single_to_uint4x32", {|uint4 single_to_uint4x32(float x) {
+}|},
+      [ "uint8x16_t" ] );
+    ( "single_to_uint4x32",
+      {|uint4 single_to_uint4x32(float x) {
uint32_t bits = as_type<uint32_t>(x);
return uint4(bits, 0, 0, 0);
-}|}, []);
-
-  ("double_to_uint4x32", {|uint4 double_to_uint4x32(float x) {
+}|},
+      [] );
+    ( "double_to_uint4x32",
+      {|uint4 double_to_uint4x32(float x) {
/* Metal doesn't have native double support, use float fallback */
uint32_t bits = as_type<uint32_t>(x);
return uint4(bits, 0, 0, 0);
-}|}, []);
-
-  ("int32_to_uint4x32", {|uint4 int32_to_uint4x32(int32_t x) {
+}|},
+      [] );
+    ( "int32_to_uint4x32",
+      {|uint4 int32_to_uint4x32(int32_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("int64_to_uint4x32", {|uint4 int64_to_uint4x32(int64_t x) {
+}|},
+      [] );
+    ( "int64_to_uint4x32",
+      {|uint4 int64_to_uint4x32(int64_t x) {
uint64_t bits = uint64_t(x);
return uint4(uint32_t(bits & 0xFFFFFFFF), uint32_t(bits >> 32), 0, 0);
-}|}, []);
-
-  ("uint32_to_uint4x32", {|uint4 uint32_to_uint4x32(uint32_t x) {
+}|},
+      [] );
+    ( "uint32_to_uint4x32",
+      {|uint4 uint32_to_uint4x32(uint32_t x) {
return uint4(x, 0, 0, 0);
-}|}, []);
-
-  ("uint64_to_uint4x32", {|uint4 uint64_to_uint4x32(uint64_t x) {
+}|},
+      [] );
+    ( "uint64_to_uint4x32",
+      {|uint4 uint64_to_uint4x32(uint64_t x) {
return uint4(uint32_t(x & 0xFFFFFFFF), uint32_t(x >> 32), 0, 0);
-}|}, []);
-
-  ("byte_to_uint4x32", {|uint4 byte_to_uint4x32(int8_t x) {
+}|},
+      [] );
+    ( "byte_to_uint4x32",
+      {|uint4 byte_to_uint4x32(int8_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("uint16_to_uint4x32", {|uint4 uint16_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "uint16_to_uint4x32",
+      {|uint4 uint16_to_uint4x32(uint16_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("bfloat16_to_uint4x32", {|uint4 bfloat16_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "bfloat16_to_uint4x32",
+      {|uint4 bfloat16_to_uint4x32(uint16_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("half_to_uint4x32", {|uint4 half_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "half_to_uint4x32",
+      {|uint4 half_to_uint4x32(uint16_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("fp8_to_uint4x32", {|uint4 fp8_to_uint4x32(uint8_t x) {
+}|},
+      [] );
+    ( "fp8_to_uint4x32",
+      {|uint4 fp8_to_uint4x32(uint8_t x) {
return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-]
+}|},
+      [] );
+  ]
File "arrayjit/lib/builtins_cc.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_cc.ml b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml
index d4d6f9c..a66ed26 100644
--- a/_build/default/arrayjit/lib/builtins_cc.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml
@@ -1,4 +1,5 @@
-let includes = {|
+let includes =
+  {|
#include <stdio.h>
#include <math.h>
#include <stdint.h>
@@ -9,74 +10,84 @@ let includes = {|
|}


(* Each entry is (key, definition, dependencies) *)
-let builtins = [
-  (* Float16 feature detection and type definitions *)
-  ("HAS_NATIVE_FLOAT16", {|
+let builtins =
+  [
+    (* Float16 feature detection and type definitions *)
+    ( "HAS_NATIVE_FLOAT16",
+      {|
#ifdef __FLT16_MAX__
#define HAS_NATIVE_FLOAT16 1
#else
#define HAS_NATIVE_FLOAT16 0
#endif
-|}, []);
-
-  ("HALF_T", {|
+|},
+      [] );
+    ( "HALF_T",
+      {|
#if HAS_NATIVE_FLOAT16
#define HALF_T _Float16
#else
#define HALF_T uint16_t
#endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("HALF_TO_FP", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "HALF_TO_FP",
+      {|
#if HAS_NATIVE_FLOAT16
#define HALF_TO_FP(x) (x)  /* Identity - already in floating point */
#else
#define HALF_TO_FP(x) half_to_float_emulated(x)  /* Convert to float for computation */
#endif
-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);
-
-  ("FP_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );
+    ( "FP_TO_HALF",
+      {|
#if HAS_NATIVE_FLOAT16
#define FP_TO_HALF(x) (x)  /* Identity - already half precision */
#else
#define FP_TO_HALF(x) float_to_half_emulated(x)  /* Convert back from float */
#endif
-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);
-
-  ("HALF_TO_FLOAT", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );
+    ( "HALF_TO_FLOAT",
+      {|
#if HAS_NATIVE_FLOAT16
#define HALF_TO_FLOAT(x) ((float)(x))
#else
#define HALF_TO_FLOAT(x) half_to_float_emulated(x)
#endif
-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);
-
-  ("FLOAT_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );
+    ( "FLOAT_TO_HALF",
+      {|
#if HAS_NATIVE_FLOAT16
#define FLOAT_TO_HALF(x) ((_Float16)(x))
#else
#define FLOAT_TO_HALF(x) float_to_half_emulated(x)
#endif
-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);
-
-  ("HALF_TO_UINT16", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );
+    ( "HALF_TO_UINT16",
+      {|
#if HAS_NATIVE_FLOAT16
#define HALF_TO_UINT16(x) ({ _Float16 _h = (x); uint16_t _r; memcpy(&_r, &_h, 2); _r; })
#else
#define HALF_TO_UINT16(x) (x)
#endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("UINT16_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "UINT16_TO_HALF",
+      {|
#if HAS_NATIVE_FLOAT16
#define UINT16_TO_HALF(x) ({ uint16_t _u = (x); _Float16 _h; memcpy(&_h, &_u, 2); _h; })
#else
#define UINT16_TO_HALF(x) (x)
#endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  (* Float16 emulation functions *)
-  ("half_to_float_emulated", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    (* Float16 emulation functions *)
+    ( "half_to_float_emulated",
+      {|
#if !HAS_NATIVE_FLOAT16
/* Convert IEEE 754 half precision (stored as uint16_t) to float */
static float half_to_float_emulated(uint16_t h) {
@@ -108,9 +119,10 @@ static float half_to_float_emulated(uint16_t h) {
}
}
#endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("float_to_half_emulated", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "float_to_half_emulated",
+      {|
#if !HAS_NATIVE_FLOAT16
/* Convert float to IEEE 754 half precision (stored as uint16_t) */
static uint16_t float_to_half_emulated(float f) {
@@ -197,16 +209,16 @@ static uint16_t float_to_half_emulated(float f) {
}
}
#endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  (* Threefry4x32 types and complete implementation *)
-  ("uint4x32_t", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    (* Threefry4x32 types and complete implementation *)
+    ("uint4x32_t", {|
typedef struct {
uint32_t v[4];
} uint4x32_t;
|}, []);
-
-  ("arrayjit_threefry4x32", {|
+    ( "arrayjit_threefry4x32",
+      {|
/* Threefry4x32 constants */
const uint32_t THREEFRY_C240 = 0x1BD11BDA;


@@ -329,75 +341,72 @@ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
result.v[3] = x[3];
return result;
}
-|}, ["uint4x32_t"]);
-
-  (* Vector types with half precision *)
-  ("half8_t", {|
+|},
+      [ "uint4x32_t" ] );
+    (* Vector types with half precision *)
+    ("half8_t", {|
typedef struct { HALF_T v[8]; } half8_t;
-|}, ["HALF_T"]);
-
-  ("float4_t", {|
+|}, [ "HALF_T" ]);
+    ("float4_t", {|
typedef struct { float v[4]; } float4_t;
|}, []);
-
-  ("double2_t", {|
+    ("double2_t", {|
typedef struct { double v[2]; } double2_t;
|}, []);
-
-  ("int32x4_t", {|
+    ("int32x4_t", {|
typedef struct { int32_t v[4]; } int32x4_t;
|}, []);
-
-  ("int64x2_t", {|
+    ("int64x2_t", {|
typedef struct { int64_t v[2]; } int64x2_t;
|}, []);
-
-  ("int8x16_t", {|
+    ("int8x16_t", {|
typedef struct { int8_t v[16]; } int8x16_t;
|}, []);
-
-  ("uint16x8_t", {|
+    ("uint16x8_t", {|
typedef struct { uint16_t v[8]; } uint16x8_t;
|}, []);
-
-  ("uint8x16_t", {|
+    ("uint8x16_t", {|
typedef struct { uint8_t v[16]; } uint8x16_t;
|}, []);
-
-  (* Basic conversion functions *)
-  ("uint32_to_single_uniform", {|
+    (* Basic conversion functions *)
+    ( "uint32_to_single_uniform",
+      {|
/* Convert to float in [0, 1) */
float uint32_to_single_uniform(uint32_t x) {
/* Use upper 24 bits for float mantissa (23 bits + implicit 1) */
return (x >> 8) * (1.0f / 16777216.0f);
}
-|}, []);
-
-  ("uint32_to_double_uniform", {|
+|},
+      [] );
+    ( "uint32_to_double_uniform",
+      {|
/* Convert to double in [0, 1) */
double uint32_to_double_uniform(uint32_t x) {
return x * (1.0 / 4294967296.0);
}
-|}, []);
-
-  (* Conversion functions with dependencies *)
-  ("uint4x32_to_single_uniform", {|
+|},
+      [] );
+    (* Conversion functions with dependencies *)
+    ( "uint4x32_to_single_uniform",
+      {|
/* Uint4x32 to float32 uniform - uses first 32 bits */
float uint4x32_to_single_uniform(uint4x32_t x) {
return uint32_to_single_uniform(x.v[0]);
}
-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|
+|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|
/* Uint4x32 to float16 uniform - uses first 16 bits */
uint16_t uint4x32_to_half_uniform(uint4x32_t x) {
/* Convert through float for consistent behavior */
float f = (x.v[0] & 0xFFFF) * (1.0f / 65536.0f);
return FLOAT_TO_HALF(f);
}
-|}, ["uint4x32_t"; "FLOAT_TO_HALF"]);
-
-  ("uint4x32_to_half_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "FLOAT_TO_HALF" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|
/* Convert uint4x32 to 8 float16s uniform */
half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
half8_t result;
@@ -412,10 +421,11 @@ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "half8_t"; "FLOAT_TO_HALF"]);
-
-  (* Pure C conversion functions *)
-  ("bfloat16_to_single", {|
+|},
+      [ "uint4x32_t"; "half8_t"; "FLOAT_TO_HALF" ] );
+    (* Pure C conversion functions *)
+    ( "bfloat16_to_single",
+      {|
/* BFloat16 to Float conversion (C function) */
float bfloat16_to_single(uint16_t bf16)
{
@@ -424,9 +434,10 @@ float bfloat16_to_single(uint16_t bf16)
uint32_t f32 = ((uint32_t)bf16) << 16;
return *((float *)&f32);
}
-|}, []);
-
-  ("single_to_bfloat16", {|
+|},
+      [] );
+    ( "single_to_bfloat16",
+      {|
/* Float to BFloat16 conversion (C function) */
uint16_t single_to_bfloat16(float f)
{
@@ -436,27 +447,30 @@ uint16_t single_to_bfloat16(float f)
uint32_t rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);
return (uint16_t)(rounded >> 16);
}
-|}, []);
-
-  ("half_to_single", {|
+|},
+      [] );
+    ( "half_to_single",
+      {|
/* Half (Float16) to Float conversion (C function) */
float half_to_single(uint16_t h)
{
HALF_T half_val = UINT16_TO_HALF(h);
return HALF_TO_FLOAT(half_val);
}
-|}, ["HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT"]);
-
-  ("single_to_half", {|
+|},
+      [ "HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT" ] );
+    ( "single_to_half",
+      {|
/* Float to Half (Float16) conversion (C function) */
uint16_t single_to_half(float f)
{
HALF_T half_val = FLOAT_TO_HALF(f);
return HALF_TO_UINT16(half_val);
}
-|}, ["HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16"]);
-
-  ("fp8_to_single", {|
+|},
+      [ "HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16" ] );
+    ( "fp8_to_single",
+      {|
/* FP8 E5M2 format to Float conversion (C function)
Format: 1 sign bit, 5 exponent bits, 2 mantissa bits */
float fp8_to_single(uint8_t fp8)
@@ -500,9 +514,10 @@ float fp8_to_single(uint8_t fp8)


return result;
}
-|}, []);
-
-  ("single_to_fp8", {|
+|},
+      [] );
+    ( "single_to_fp8",
+      {|
/* Float to FP8 E5M2 conversion (C function) */
uint8_t single_to_fp8(float f)
{
@@ -560,143 +575,162 @@ uint8_t single_to_fp8(float f)


return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));
}
-|}, []);
-
-  (* Conversion functions from various precisions to uint4x32_t *)
-  ("int32_to_uint4x32", {|
+|},
+      [] );
+    (* Conversion functions from various precisions to uint4x32_t *)
+    ( "int32_to_uint4x32",
+      {|
uint4x32_t int32_to_uint4x32(int32_t x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("int64_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "int64_to_uint4x32",
+      {|
uint4x32_t int64_to_uint4x32(int64_t x) {
uint64_t bits = (uint64_t)x;
uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("uint32_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint32_to_uint4x32",
+      {|
uint4x32_t uint32_to_uint4x32(uint32_t x) {
uint4x32_t result = {{x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("uint64_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint64_to_uint4x32",
+      {|
uint4x32_t uint64_to_uint4x32(uint64_t x) {
uint4x32_t result = {{(uint32_t)(x & 0xFFFFFFFF), (uint32_t)(x >> 32), 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("single_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "single_to_uint4x32",
+      {|
uint4x32_t single_to_uint4x32(float x) {
uint32_t bits;
memcpy(&bits, &x, sizeof(float));
uint4x32_t result = {{bits, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("double_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "double_to_uint4x32",
+      {|
uint4x32_t double_to_uint4x32(double x) {
uint64_t bits;
memcpy(&bits, &x, sizeof(double));
uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("byte_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "byte_to_uint4x32",
+      {|
uint4x32_t byte_to_uint4x32(unsigned char x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("uint16_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint16_to_uint4x32",
+      {|
uint4x32_t uint16_to_uint4x32(uint16_t x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("bfloat16_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "bfloat16_to_uint4x32",
+      {|
uint4x32_t bfloat16_to_uint4x32(uint16_t x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("half_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "half_to_uint4x32",
+      {|
uint4x32_t half_to_uint4x32(uint16_t x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  ("fp8_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "fp8_to_uint4x32",
+      {|
uint4x32_t fp8_to_uint4x32(uint8_t x) {
uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
return result;
}
-|}, ["uint4x32_t"]);
-
-  (* More uint4x32 to various precision conversion functions *)
-  ("uint4x32_to_double_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    (* More uint4x32 to various precision conversion functions *)
+    ( "uint4x32_to_double_uniform",
+      {|
/* Uint4x32 to float64 uniform - uses first 64 bits */
double uint4x32_to_double_uniform(uint4x32_t x) {
uint64_t combined = ((uint64_t)x.v[1] << 32) | x.v[0];
return combined * (1.0 / 18446744073709551616.0);
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int32_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int32_uniform",
+      {|
/* Uint4x32 to int32 uniform - full range */
int32_t uint4x32_to_int32_uniform(uint4x32_t x) {
return (int32_t)x.v[0];
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int64_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int64_uniform",
+      {|
/* Uint4x32 to int64 uniform - full range */
int64_t uint4x32_to_int64_uniform(uint4x32_t x) {
return (int64_t)(((uint64_t)x.v[1] << 32) | x.v[0]);
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint32_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint32_uniform",
+      {|
/* Uint4x32 to uint32 uniform - full range */
uint32_t uint4x32_to_uint32_uniform(uint4x32_t x) {
return x.v[0];
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint64_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint64_uniform",
+      {|
/* Uint4x32 to uint64 uniform - full range */
uint64_t uint4x32_to_uint64_uniform(uint4x32_t x) {
return ((uint64_t)x.v[1] << 32) | x.v[0];
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_byte_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_byte_uniform",
+      {|
/* Uint4x32 to int8 uniform - full range */
int8_t uint4x32_to_byte_uniform(uint4x32_t x) {
return (int8_t)(x.v[0] & 0xFF);
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint16_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint16_uniform",
+      {|
/* Uint4x32 to uint16 uniform - full range */
uint16_t uint4x32_to_uint16_uniform(uint4x32_t x) {
return (uint16_t)(x.v[0] & 0xFFFF);
}
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_bfloat16_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|
/* Uint4x32 to bfloat16 uniform - uses first 16 bits */
uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
/* Convert to float first, then to bfloat16 */
@@ -708,17 +742,19 @@ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
if ((bits & 0x8000) && ((bits & 0x7FFF) || (bf & 1))) bf++;
return bf;
}
-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_fp8_uniform", {|
+|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_fp8_uniform",
+      {|
/* Uint4x32 to fp8 uniform - uses first 8 bits */
uint8_t uint4x32_to_fp8_uniform(uint4x32_t x) {
return (uint8_t)(x.v[0] & 0xFF);
}
-|}, ["uint4x32_t"]);
-
-  (* Vectorized conversion functions *)
-  ("uint4x32_to_single_uniform_vec", {|
+|},
+      [ "uint4x32_t" ] );
+    (* Vectorized conversion functions *)
+    ( "uint4x32_to_single_uniform_vec",
+      {|
/* Convert uint4x32 to 4 floats in [0, 1) */
float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
float4_t result;
@@ -727,9 +763,10 @@ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|
/* Convert uint4x32 to 2 doubles in [0, 1) */
double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
double2_t result;
@@ -739,9 +776,10 @@ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
result.v[1] = combined2 * (1.0 / 18446744073709551616.0);
return result;
}
-|}, ["uint4x32_t"; "double2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "double2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|
/* Convert uint4x32 to 4 int32s - full range */
int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
int32x4_t result;
@@ -750,9 +788,10 @@ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "int32x4_t"]);
-
-  ("uint4x32_to_int64_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int32x4_t" ] );
+    ( "uint4x32_to_int64_uniform_vec",
+      {|
/* Convert uint4x32 to 2 int64s - full range */
int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {
int64x2_t result;
@@ -760,9 +799,10 @@ int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {
result.v[1] = (int64_t)(((uint64_t)x.v[3] << 32) | x.v[2]);
return result;
}
-|}, ["uint4x32_t"; "int64x2_t"]);
-
-  ("uint4x32_to_byte_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int64x2_t" ] );
+    ( "uint4x32_to_byte_uniform_vec",
+      {|
/* Convert uint4x32 to 16 int8s - full range */
int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {
int8x16_t result;
@@ -774,9 +814,10 @@ int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "int8x16_t"]);
-
-  ("uint4x32_to_uint16_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int8x16_t" ] );
+    ( "uint4x32_to_uint16_uniform_vec",
+      {|
/* Convert uint4x32 to 8 uint16s - full range */
uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {
uint16x8_t result;
@@ -786,9 +827,10 @@ uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|
/* Convert uint4x32 to 8 bfloat16s uniform */
uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
uint16x8_t result;
@@ -810,9 +852,10 @@ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_fp8_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_fp8_uniform_vec",
+      {|
/* Convert uint4x32 to 16 fp8s uniform */
uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {
uint8x16_t result;
@@ -824,7 +867,8 @@ uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {
}
return result;
}
-|}, ["uint4x32_t"; "uint8x16_t"]);
-]
+|},
+      [ "uint4x32_t"; "uint8x16_t" ] );
+  ]


-let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)
\ No newline at end of file
+let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)
File "arrayjit/lib/backend_impl.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/backend_impl.ml b/_build/default/arrayjit/lib/.formatted/backend_impl.ml
index 6d483ff..45e6bea 100644
--- a/_build/default/arrayjit/lib/backend_impl.ml
+++ b/_build/default/arrayjit/lib/.formatted/backend_impl.ml
@@ -50,13 +50,11 @@ module No_device_buffer_and_copying () :
Stdlib.Gc.finalise finalize ptr;
ptr


-  let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) :
-      buffer_ptr =
+  let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr =
let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in
alloc_impl ~size_in_bytes


-  let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) :
-      buffer_ptr =
+  let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr =
let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in
let ptr = alloc_impl ~size_in_bytes in
(* Zero-initialize the allocated memory *)
File "arrayjit/lib/cc_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/cc_backend.ml b/_build/default/arrayjit/lib/.formatted/cc_backend.ml
index a23dbdf..9847c1a 100644
--- a/_build/default/arrayjit/lib/cc_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/cc_backend.ml
@@ -306,10 +306,10 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)
let idx_params = Indexing.bound_symbols bindings in
let build_file = Utils.open_build_file ~base_name:name ~extension:".c" in
let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-  let filtered_code = Syntax.filter_and_prepend_builtins
-    ~includes:Builtins_cc.includes
-    ~builtins:Builtins_cc.builtins
-    ~proc_doc in
+  let filtered_code =
+    Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins
+      ~proc_doc
+  in
(* Use ribbon = 1.0 for usual code formatting, width 110 *)
Out_channel.output_string build_file.oc filtered_code;
build_file.finalize ();
@@ -338,10 +338,10 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized opt
in
let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
let combined_proc_doc = PPrint.separate PPrint.hardline all_proc_docs in
-  let filtered_code = Syntax.filter_and_prepend_builtins
-    ~includes:Builtins_cc.includes
-    ~builtins:Builtins_cc.builtins
-    ~proc_doc:combined_proc_doc in
+  let filtered_code =
+    Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins
+      ~proc_doc:combined_proc_doc
+  in
Out_channel.output_string build_file.oc filtered_code;
build_file.finalize ();
let result_library = c_compile_and_load ~f_path:build_file.f_path in
File "arrayjit/lib/backends.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/backends.ml b/_build/default/arrayjit/lib/.formatted/backends.ml
index a6ab05e..166603a 100644
--- a/_build/default/arrayjit/lib/backends.ml
+++ b/_build/default/arrayjit/lib/.formatted/backends.ml
@@ -474,18 +474,17 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct
[%log (key : Tnode.t)];
let default () =
let dims = Lazy.force key.dims in
-        (* Use alloc_array when zero initialization is not needed:
-           - When copying from host immediately after allocation
-           - When the node has explicit Zero_out operations in the lowered code *)
-        let will_copy_from_host =
-          Utils.settings.automatic_host_transfers && Tn.known_constant key &&
-          match key.array with | (lazy (Some _)) -> true | _ -> false
+        (* Use alloc_array when zero initialization is not needed: - When copying from host
+           immediately after allocation - When the node has explicit Zero_out operations in the
+           lowered code *)
+        let will_copy_from_host =
+          Utils.settings.automatic_host_transfers && Tn.known_constant key
+          && match key.array with (lazy (Some _)) -> true | _ -> false
in
-        let dst_ptr =
+        let dst_ptr =
if will_copy_from_host || node.Low_level.zero_initialized_by_code then
alloc_array (Lazy.force key.prec) ~dims stream
-          else
-            alloc_zeros (Lazy.force key.prec) ~dims stream
+          else alloc_zeros (Lazy.force key.prec) ~dims stream
in
(if will_copy_from_host then
match key.array with
File "arrayjit/lib/c_syntax.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/c_syntax.ml b/_build/default/arrayjit/lib/.formatted/c_syntax.ml
index c24e453..8629785 100644
--- a/_build/default/arrayjit/lib/c_syntax.ml
+++ b/_build/default/arrayjit/lib/.formatted/c_syntax.ml
@@ -239,13 +239,13 @@ module C_syntax (B : C_syntax_config) = struct
let result_buffer = Buffer.create 4096 in
Buffer.add_string result_buffer includes;
Buffer.add_string result_buffer "\n";
-
+
(* Collect all needed keys, including dependencies *)
let needed_keys = ref (Set.empty (module String)) in
List.iter builtins ~f:(fun (key, _, _) ->
-      if String.is_substring doc_string ~substring:key then
-        needed_keys := Set.add !needed_keys key);
-
+        if String.is_substring doc_string ~substring:key then
+          needed_keys := Set.add !needed_keys key);
+
(* Add dependencies recursively *)
let processed_keys = ref (Set.empty (module String)) in
let rec add_dependencies key =
@@ -254,17 +254,15 @@ module C_syntax (B : C_syntax_config) = struct
needed_keys := Set.add !needed_keys key;
match List.find builtins ~f:(fun (k, _, _) -> String.equal k key) with
| Some (_, _, deps) -> List.iter deps ~f:add_dependencies
-        | None -> ()
-      )
+        | None -> ())
in
Set.iter !needed_keys ~f:add_dependencies;
-
+
(* Add the builtins in order *)
List.iter builtins ~f:(fun (key, definition, _) ->
-      if Set.mem !needed_keys key then (
-        Buffer.add_string result_buffer definition;
-        Buffer.add_string result_buffer "\n";
-      ));
+        if Set.mem !needed_keys key then (
+          Buffer.add_string result_buffer definition;
+          Buffer.add_string result_buffer "\n"));
Buffer.add_string result_buffer doc_string;
Buffer.contents result_buffer


@@ -768,7 +766,9 @@ module C_syntax (B : C_syntax_config) = struct
let ident_doc = string (get_ident tn) in
let num_elems = Tn.num_elems tn in
let size_doc = OCaml.int num_elems in
-               let init_doc = if node.Low_level.zero_initialized_by_code then string " = {0}" else empty in
+               let init_doc =
+                 if node.Low_level.zero_initialized_by_code then string " = {0}" else empty
+               in
typ_doc ^^ space ^^ ident_doc ^^ brackets size_doc ^^ init_doc ^^ semi ^^ hardline
else empty)
(Hashtbl.to_alist traced_store)
File "arrayjit/lib/metal_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/metal_backend.ml b/_build/default/arrayjit/lib/.formatted/metal_backend.ml
index 1a4ed71..63d0444 100644
--- a/_build/default/arrayjit/lib/metal_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/metal_backend.ml
@@ -662,8 +662,10 @@ end) : Ir.Backend_impl.Lowered_backend = struct
let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
let metal_includes = {|#include <metal_stdlib>
using namespace metal;|} in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins
+        ~proc_doc
+    in
{
metal_source = source;
compiled_code = Array.create ~len:num_devs None;
@@ -690,8 +692,10 @@ using namespace metal;|} in
let final_doc = PPrint.(separate hardline all_proc_docs) in
let metal_includes = {|#include <metal_stdlib>
using namespace metal;|} in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc:final_doc in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins
+        ~proc_doc:final_doc
+    in
let traced_stores = Array.map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store)) in
let funcs = Array.map funcs_and_docs ~f:(Option.map ~f:fst) in
{
File "arrayjit/lib/cuda_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/cuda_backend.ml b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml
index aeb3cb0..2868831 100644
--- a/_build/default/arrayjit/lib/cuda_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml
@@ -769,7 +769,6 @@ end) : Ir.Backend_impl.Lowered_backend = struct
^^ rparen ^^ semi
end


-
let%diagn2_sexp compile ~name bindings ({ Low_level.traced_store; _ } as lowered) =
(* TODO: The following link seems to claim it's better to expand into loops than use memset.
https://stackoverflow.com/questions/23712558/how-do-i-best-initialize-a-local-memory-array-to-0 *)
@@ -778,11 +777,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct
end)) in
let idx_params = Indexing.bound_symbols bindings in
let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-    let cuda_includes = {|#include <cuda_fp16.h>
-#include <cuda_bf16.h>|} ^
-      (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc in
+    let cuda_includes =
+      {|#include <cuda_fp16.h>
+#include <cuda_bf16.h>|}
+      ^
+      if Utils.debug_log_from_routines () then
+        "\n__device__ int printf (const char * format, ... );"
+      else ""
+    in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins
+        ~proc_doc
+    in
let ptx = cuda_to_ptx ~name source in
{ traced_store; ptx; params; bindings; name }


@@ -800,11 +806,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct
in
let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
let final_doc = PPrint.(separate hardline all_proc_docs) in
-    let cuda_includes = {|#include <cuda_fp16.h>
-#include <cuda_bf16.h>|} ^
-      (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc:final_doc in
+    let cuda_includes =
+      {|#include <cuda_fp16.h>
+#include <cuda_bf16.h>|}
+      ^
+      if Utils.debug_log_from_routines () then
+        "\n__device__ int printf (const char * format, ... );"
+      else ""
+    in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins
+        ~proc_doc:final_doc
+    in


let name : string =
String.(
dune build @fmt failed
"/usr/bin/env" "bash" "-c" "opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)" failed with exit status 2
2025-08-23 11:29.21: Job failed: Failed: Build failed