Organisations ahrefs ocannl 7fe340 ()(lint-fmt)
(lint-fmt)

Created at Aug 24 09:40 +00:00
0s in queue
Finished at Aug 24 09:40 +00:00
Ran for 3s
Logs

2025-08-24 09:40.15: New job: test ahrefs/ocannl https://github.com/ahrefs/ocannl.git#refs/heads/master (7fe3406b69dea29244d16ef354c1a477c1410682) (linux-x86_64:(lint-fmt))
Base: ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
ocamlformat version: version 0.27.0 (from opam)

To reproduce locally:

git clone --recursive "https://github.com/ahrefs/ocannl.git" -b "master" && cd "ocannl" && git reset --hard 7fe3406b
cat > Dockerfile <<'END-OF-DOCKERFILE'
FROM ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
USER 1000:1000
RUN cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u
RUN opam depext -i dune
WORKDIR /src
RUN opam depext -i ocamlformat=0.27.0
COPY --chown=1000:1000 . /src/
RUN opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)

END-OF-DOCKERFILE
docker build .
END-REPRO-BLOCK

2025-08-24 09:40.15: Using cache hint "ahrefs/ocannl-ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a-debian-12-4.08_opam-2.4-ocamlformat-b8021439f8c57ba6435bc2263f6596671f4f4466"
2025-08-24 09:40.15: Using OBuilder spec:
((from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)
 (user (uid 1000) (gid 1000))
 (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
      (network host)
      (shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))
 (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
      (network host)
      (shell "opam depext -i dune"))
 (workdir /src)
 (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
      (network host)
      (shell "opam depext -i ocamlformat=0.27.0"))
 (copy (src .) (dst /src/))
 (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)"))
)

2025-08-24 09:40.15: Waiting for resource in pool OCluster
2025-08-24 09:40.15: Waiting for worker…
2025-08-24 09:40.15: Got resource from pool OCluster
Building on eumache
All commits already cached
HEAD is now at 7fe3406b Merge pull request #368 from ahrefs/feature/tensor-rootness-check

(from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a)
Unable to find image 'ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a' locally
docker.io/ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a: Pulling from ocaml/opam
2b437dab448b: Already exists
348b54b4f842: Already exists
d67c1d522a4e: Already exists
dea856d1a4d8: Already exists
b580a9d7b5b1: Already exists
Digest: sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
Status: Downloaded newer image for ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a
2025-08-24 09:40.16 ---> using "1d0024db739bd078f91b2384c47919652a4b72a425e3e24ce24cfd1f6debdfbc" from cache

/: (user (uid 1000) (gid 1000))

/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
        (network host)
        (shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u"))
b8021439f8 Merge pull request #28261 from kit-ty-kate/deploy-fix-25819

<><> Updating package repositories ><><><><><><><><><><><><><><><><><><><><><><>
[default] Initialised
default (at git+file:///home/opam/opam-repository): 
    [INFO] opam 2.1 and 2.2 include many performance and security improvements over 2.0; please consider upgrading (https://opam.ocaml.org/doc/Install.html)


Everything as up-to-date as possible (run with --verbose to show unavailable upgrades).
However, you may "opam upgrade" these packages explicitly, which will ask permission to downgrade or uninstall the conflicting packages.
Nothing to do.
# Run eval $(opam env) to update the current shell environment
2025-08-24 09:40.16 ---> using "76d9d96bb26da3c78200d383fd35f876d80571baf05962331a1fff5f47db0e2e" from cache

/: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
        (network host)
        (shell "opam depext -i dune"))
# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian
# No extra OS packages requirements found.
# All required OS packages found.
# Now letting opam install the packages
The following actions will be performed:
  - install dune 3.19.1

<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[dune.3.19.1] found in cache

<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-> installed dune.3.19.1
Done.
# Run eval $(opam env) to update the current shell environment
2025-08-24 09:40.16 ---> using "da0888a20a067de19f6183f0b497dcc2d1ea7f7036861cc50f633c662efdce4f" from cache

/: (workdir /src)

/src: (run (cache (opam-archives (target /home/opam/.opam/download-cache)))
           (network host)
           (shell "opam depext -i ocamlformat=0.27.0"))
# Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian
# No extra OS packages requirements found.
# All required OS packages found.
# Now letting opam install the packages
The following actions will be performed:
  - install sexplib0          v0.14.0  [required by base]
  - install menhirLib         20240715 [required by ocamlformat-lib]
  - install menhirCST         20240715 [required by menhir]
  - install ocamlbuild        0.16.1   [required by fpath, astring, uuseg]
  - install cmdliner          1.3.0    [required by ocamlformat]
  - install menhirSdk         20240715 [required by ocamlformat-lib]
  - install either            1.0.0    [required by ocamlformat-lib]
  - install csexp             1.5.2    [required by ocamlformat]
  - install dune-build-info   3.19.1   [required by ocamlformat-lib]
  - install camlp-streams     5.0.1    [required by ocamlformat-lib]
  - install seq               base     [required by re]
  - install fix               20250428 [required by ocamlformat-lib]
  - install ocaml-version     4.0.1    [required by ocamlformat-lib]
  - install ocamlfind         1.9.8    [required by ocp-indent, astring, fpath, uuseg]
  - install menhir            20240715 [required by ocamlformat-lib]
  - install dune-configurator 3.19.1   [required by base]
  - install re                1.11.0   [required by ocamlformat]
  - install topkg             1.1.0    [required by fpath, astring, uuseg]
  - install base-bytes        base     [required by ocp-indent]
  - install base              v0.14.3  [required by ocamlformat-lib]
  - install uutf              1.0.4    [required by ocamlformat-lib]
  - install astring           0.8.5    [required by ocamlformat-lib]
  - install ocp-indent        1.8.1    [required by ocamlformat-lib]
  - install stdio             v0.14.0  [required by ocamlformat-lib]
  - install uucp              15.0.0   [required by uuseg]
  - install fpath             0.7.3    [required by ocamlformat-lib]
  - install uuseg             15.0.0   [required by ocamlformat-lib]
  - install ocamlformat-lib   0.27.0   [required by ocamlformat]
  - install ocamlformat       0.27.0
===== 29 to install =====

<><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[astring.0.8.5] found in cache
[base.v0.14.3] found in cache
[camlp-streams.5.0.1] found in cache
[cmdliner.1.3.0] found in cache
[csexp.1.5.2] found in cache
[dune-build-info.3.19.1] found in cache
[dune-configurator.3.19.1] found in cache
[either.1.0.0] found in cache
[fix.20250428] found in cache
[fpath.0.7.3] found in cache
[menhir.20240715] found in cache
[menhirCST.20240715] found in cache
[menhirLib.20240715] found in cache
[menhirSdk.20240715] found in cache
[ocaml-version.4.0.1] found in cache
[ocamlbuild.0.16.1] found in cache
[ocamlfind.1.9.8] found in cache
[ocamlformat.0.27.0] found in cache
[ocamlformat-lib.0.27.0] found in cache
[ocp-indent.1.8.1] found in cache
[re.1.11.0] found in cache
[sexplib0.v0.14.0] found in cache
[stdio.v0.14.0] found in cache
[topkg.1.1.0] found in cache
[uucp.15.0.0] found in cache
[uuseg.15.0.0] found in cache
[uutf.1.0.4] found in cache

<><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-> installed seq.base
-> installed camlp-streams.5.0.1
-> installed csexp.1.5.2
-> installed either.1.0.0
-> installed cmdliner.1.3.0
-> installed fix.20250428
-> installed menhirCST.20240715
-> installed menhirLib.20240715
-> installed menhirSdk.20240715
-> installed ocaml-version.4.0.1
-> installed sexplib0.v0.14.0
-> installed re.1.11.0
-> installed dune-build-info.3.19.1
-> installed dune-configurator.3.19.1
-> installed ocamlfind.1.9.8
-> installed base-bytes.base
-> installed ocp-indent.1.8.1
-> installed ocamlbuild.0.16.1
-> installed base.v0.14.3
-> installed stdio.v0.14.0
-> installed topkg.1.1.0
-> installed uutf.1.0.4
-> installed astring.0.8.5
-> installed fpath.0.7.3
-> installed menhir.20240715
-> installed uucp.15.0.0
-> installed uuseg.15.0.0
-> installed ocamlformat-lib.0.27.0
-> installed ocamlformat.0.27.0
Done.

<><> ocp-indent.1.8.1 installed successfully ><><><><><><><><><><><><><><><><><>
=> This package requires additional configuration for use in editors. Install package 'user-setup', or manually:

   * for Emacs, add these lines to ~/.emacs:
     (add-to-list 'load-path "/home/opam/.opam/4.08/share/emacs/site-lisp")
     (require 'ocp-indent)

   * for Vim, add this line to ~/.vimrc:
     set rtp^="/home/opam/.opam/4.08/share/ocp-indent/vim"
# Run eval $(opam env) to update the current shell environment
2025-08-24 09:40.16 ---> using "86ec8dcb8046a1e5dacfb1841e8c026d30cfead67649bcb6d6a8f9ddd6fb153d" from cache

/src: (copy (src .) (dst /src/))
2025-08-24 09:40.17 ---> saved as "a809cbb37111f2865c2f50946bfde16e234850e0c770765ec6b6e948e09e729f"

/src: (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)"))
File "arrayjit/lib/dune", line 7, characters 30-51:
7 |  (libraries base stdio pprint ppx_minidebug.runtime)
                                  ^^^^^^^^^^^^^^^^^^^^^
Error: Library "ppx_minidebug.runtime" not found.
-> required by library "arrayjit.utils" in _build/default/arrayjit/lib
-> required by executable ocannl_read_config in test/operations/config/dune:5
-> required by _build/default/test/operations/config/ocannl_read_config.exe
-> required by %{dep:ocannl_read_config.exe} at test/operations/dune:22
-> required by _build/default/test/operations/config/ocannl_backend.txt
-> required by %{read:config/ocannl_backend.txt} at test/operations/dune:36
-> required by Computing directory contents of _build/default/test/operations
File "arrayjit/lib/no_device_backend_missing.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/no_device_backend_missing.ml b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml
index 0e04309..890e1c2 100644
--- a/_build/default/arrayjit/lib/no_device_backend_missing.ml
+++ b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml
@@ -35,12 +35,8 @@ struct
   let alloc_buffer ?old_buffer:_ ~size_in_bytes:_ () =
     failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
 
-  let alloc_array _prec ~dims:_ () =
-    failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
-
-  let alloc_zeros _prec ~dims:_ () =
-    failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
-
+  let alloc_array _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
+  let alloc_zeros _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
   let free_buffer = None
   let get_used_memory () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)"
 
File "arrayjit/lib/builtins_cuda.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_cuda.ml b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml
index c2a2e96..596c827 100644
--- a/_build/default/arrayjit/lib/builtins_cuda.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml
@@ -1,103 +1,119 @@
 (* CUDA builtin code split into (key, definition, dependencies) triples for filtering *)
-let builtins = [
-  ("uint4x32_t", {|typedef struct {
+let builtins =
+  [
+    ("uint4x32_t", {|typedef struct {
     unsigned int v[4];
 } uint4x32_t;|}, []);
-
-  ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);
-  ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);
-  ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);
-  ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);
-  ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);
-  ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);
-  ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);
-  ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);
-
-  ("uint32_to_single_uniform", {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {
+    ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []);
+    ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []);
+    ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []);
+    ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []);
+    ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []);
+    ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []);
+    ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []);
+    ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []);
+    ( "uint32_to_single_uniform",
+      {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) {
   /* Use __uint2float_rn for correct rounding */
   return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f);
-}|}, []);
-
-  ("uint32_to_double_uniform", {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {
+}|},
+      [] );
+    ( "uint32_to_double_uniform",
+      {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) {
   return __uint2double_rn(x) * (1.0 / 4294967296.0);
-}|}, []);
-
-  ("uint4x32_to_single_uniform", {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform",
+      {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
   return uint32_to_single_uniform(x.v[0]);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform", {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform",
+      {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
   unsigned long long combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
   return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int32_uniform", {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int32_uniform",
+      {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) {
   return (int)x.v[0];
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_i64_uniform", {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_i64_uniform",
+      {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) {
   return __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u32_uniform", {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u32_uniform",
+      {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) {
   return x.v[0];
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u64_uniform", {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u64_uniform",
+      {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) {
   return (unsigned long long)__double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_i8_uniform", {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_i8_uniform",
+      {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) {
   return (signed char)(x.v[0] & 0xFF);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_u8_uniform", {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_u8_uniform",
+      {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) {
   return (unsigned char)(x.v[0] & 0xFF);
-}|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_bfloat16_uniform", {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) {
   float f = uint32_to_single_uniform(x.v[0]);
   return (unsigned short)(__float_as_uint(f) >> 16);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
   float f = uint32_to_single_uniform(x.v[0]);
   return __float2half(f);
-}|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_single_uniform_vec", {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_single_uniform_vec",
+      {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
   float4_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
     result.v[i] = uint32_to_single_uniform(x.v[i]);
   }
   return result;
-}|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
   double2_t result;
   result.v[0] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[1], x.v[0]))) * (1.0 / 18446744073709551616.0);
   result.v[1] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[3], x.v[2]))) * (1.0 / 18446744073709551616.0);
   return result;
-}|}, ["uint4x32_t"; "double2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "double2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
   int32x4_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
     result.v[i] = (int)x.v[i];
   }
   return result;
-}|}, ["uint4x32_t"; "int32x4_t"]);
-
-  ("uint4x32_to_i64_uniform_vec", {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int32x4_t" ] );
+    ( "uint4x32_to_i64_uniform_vec",
+      {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) {
   int64x2_t result;
   result.v[0] = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
   result.v[1] = __double_as_longlong(__hiloint2double(x.v[3], x.v[2]));
   return result;
-}|}, ["uint4x32_t"; "int64x2_t"]);
-
-  ("uint4x32_to_i8_uniform_vec", {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int64x2_t" ] );
+    ( "uint4x32_to_i8_uniform_vec",
+      {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) {
   int8x16_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
@@ -107,9 +123,10 @@ let builtins = [
     result.v[i*4 + 3] = (signed char)((x.v[i] >> 24) & 0xFF);
   }
   return result;
-}|}, ["uint4x32_t"; "int8x16_t"]);
-
-  ("uint4x32_to_u16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "int8x16_t" ] );
+    ( "uint4x32_to_u16_uniform_vec",
+      {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) {
   uint16x8_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
@@ -117,9 +134,10 @@ let builtins = [
     result.v[i*2 + 1] = (unsigned short)((x.v[i] >> 16) & 0xFFFF);
   }
   return result;
-}|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
   uint16x8_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
@@ -130,9 +148,10 @@ let builtins = [
     result.v[i*2 + 1] = (unsigned short)(__float_as_uint(f2) >> 16);
   }
   return result;
-}|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_half_uniform_vec", {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
   half8_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
@@ -142,9 +161,10 @@ let builtins = [
     result.v[i*2 + 1] = __float2half(f2);
   }
   return result;
-}|}, ["uint4x32_t"; "half8_t"]);
-
-  ("uint4x32_to_u8_uniform_vec", {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {
+}|},
+      [ "uint4x32_t"; "half8_t" ] );
+    ( "uint4x32_to_u8_uniform_vec",
+      {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) {
   uint8x16_t result;
   #pragma unroll
   for (int i = 0; i < 4; i++) {
@@ -154,70 +174,81 @@ let builtins = [
     result.v[i*4 + 3] = (unsigned char)((x.v[i] >> 24) & 0xFF);
   }
   return result;
-}|}, ["uint4x32_t"; "uint8x16_t"]);
-
-  ("single_to_uint4x32", {|__device__ uint4x32_t single_to_uint4x32(float x) {
+}|},
+      [ "uint4x32_t"; "uint8x16_t" ] );
+    ( "single_to_uint4x32",
+      {|__device__ uint4x32_t single_to_uint4x32(float x) {
   unsigned int bits = __float_as_uint(x);
   uint4x32_t result = {{bits, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("double_to_uint4x32", {|__device__ uint4x32_t double_to_uint4x32(double x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "double_to_uint4x32",
+      {|__device__ uint4x32_t double_to_uint4x32(double x) {
   unsigned long long bits = __double_as_longlong(x);
   uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("int32_to_uint4x32", {|__device__ uint4x32_t int32_to_uint4x32(int x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "int32_to_uint4x32",
+      {|__device__ uint4x32_t int32_to_uint4x32(int x) {
   uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("int64_to_uint4x32", {|__device__ uint4x32_t int64_to_uint4x32(long long x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "int64_to_uint4x32",
+      {|__device__ uint4x32_t int64_to_uint4x32(long long x) {
   unsigned long long bits = (unsigned long long)x;
   uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint32_to_uint4x32", {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint32_to_uint4x32",
+      {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) {
   uint4x32_t result = {{x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint64_to_uint4x32", {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint64_to_uint4x32",
+      {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) {
   uint4x32_t result = {{(unsigned int)(x & 0xFFFFFFFF), (unsigned int)(x >> 32), 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("byte_to_uint4x32", {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "byte_to_uint4x32",
+      {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) {
   uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("uint16_to_uint4x32", {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "uint16_to_uint4x32",
+      {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) {
   uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("bfloat16_to_uint4x32", {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "bfloat16_to_uint4x32",
+      {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) {
   uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("half_to_uint4x32", {|__device__ uint4x32_t half_to_uint4x32(__half x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "half_to_uint4x32",
+      {|__device__ uint4x32_t half_to_uint4x32(__half x) {
   unsigned short bits = __half_as_ushort(x);
   uint4x32_t result = {{(unsigned int)bits, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("fp8_to_uint4x32", {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {
+}|},
+      [ "uint4x32_t" ] );
+    ( "fp8_to_uint4x32",
+      {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) {
   uint4x32_t result = {{(unsigned int)x, 0, 0, 0}};
   return result;
-}|}, ["uint4x32_t"]);
-
-  ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);
-
-  ("THREEFRY_ROTATION", {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {
+}|},
+      [ "uint4x32_t" ] );
+    ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []);
+    ( "THREEFRY_ROTATION",
+      {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = {
     {13, 15, 26, 6},
     {17, 29, 16, 24},
     {13, 15, 26, 6},
@@ -226,13 +257,15 @@ let builtins = [
     {17, 29, 16, 24},
     {13, 15, 26, 6},
     {17, 29, 16, 24}
-};|}, []);
-
-  ("rotl32", {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {
+};|},
+      [] );
+    ( "rotl32",
+      {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) {
     return __funnelshift_l(x, x, n);
-}|}, []);
-
-  ("threefry_round", {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {
+}|},
+      [] );
+    ( "threefry_round",
+      {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) {
     x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;
     x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;
     
@@ -246,9 +279,10 @@ let builtins = [
     tmp = x.y;
     x.y = x.w;
     x.w = tmp;
-}|}, ["rotl32"]);
-
-  ("arrayjit_threefry4x32_crypto", {|__device__ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) {
+}|},
+      [ "rotl32" ] );
+    ( "arrayjit_threefry4x32_crypto",
+      {|__device__ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) {
     uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]);
     uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]);
     
@@ -310,9 +344,10 @@ let builtins = [
     result.v[2] = x.z;
     result.v[3] = x.w;
     return result;
-}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]);
-
-  ("arrayjit_threefry4x32_light", {|__device__ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) {
+}|},
+      [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] );
+    ( "arrayjit_threefry4x32_light",
+      {|__device__ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) {
     uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]);
     uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]);
     
@@ -343,10 +378,12 @@ let builtins = [
     result.v[2] = x.z;
     result.v[3] = x.w;
     return result;
-}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]);
-
-  ("arrayjit_threefry4x32", {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
+}|},
+      [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] );
+    ( "arrayjit_threefry4x32",
+      {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
     /* Default to light version */
     return arrayjit_threefry4x32_light(key, counter);
-}|}, ["uint4x32_t"; "arrayjit_threefry4x32_light"]);
-]
\ No newline at end of file
+}|},
+      [ "uint4x32_t"; "arrayjit_threefry4x32_light" ] );
+  ]
File "arrayjit/lib/builtins_metal.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_metal.ml b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml
index b403f89..f77385e 100644
--- a/_build/default/arrayjit/lib/builtins_metal.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml
@@ -1,24 +1,22 @@
 (* Metal builtin code split into (key, definition, dependencies) triples for filtering *)
-let builtins = [
-  ("METAL_HEADERS", {|#include <metal_stdlib>
+let builtins =
+  [
+    ("METAL_HEADERS", {|#include <metal_stdlib>
 using namespace metal;|}, []);
-
-  ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);
-
-  ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);
-  ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);
-  ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);
-  ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);
-  ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);
-  ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);
-  ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);
-  ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);
-
-  ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {
+    ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []);
+    ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []);
+    ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []);
+    ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []);
+    ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []);
+    ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []);
+    ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []);
+    ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []);
+    ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []);
+    ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) {
     return rotate(x, n);
 }|}, []);
-
-  ("threefry_round", {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {
+    ( "threefry_round",
+      {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) {
     x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x;
     x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z;
     
@@ -32,9 +30,10 @@ using namespace metal;|}, []);
     tmp = x.y;
     x.y = x.w;
     x.w = tmp;
-}|}, ["rotl32"]);
-
-  ("arrayjit_threefry4x32_crypto", {|uint4 arrayjit_threefry4x32_crypto(uint4 key, uint4 counter) {
+}|},
+      [ "rotl32" ] );
+    ( "arrayjit_threefry4x32_crypto",
+      {|uint4 arrayjit_threefry4x32_crypto(uint4 key, uint4 counter) {
     uint4 x = counter;
     uint4 k = key;
     
@@ -119,11 +118,21 @@ using namespace metal;|}, []);
     x.w += 5;
     
     return x;
-}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1"; 
-      "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0"; 
-      "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]);
-
-  ("arrayjit_threefry4x32_light", {|uint4 arrayjit_threefry4x32_light(uint4 key, uint4 counter) {
+}|},
+      [
+        "THREEFRY_C240";
+        "threefry_round";
+        "THREEFRY_ROTATION_0_0";
+        "THREEFRY_ROTATION_0_1";
+        "THREEFRY_ROTATION_0_2";
+        "THREEFRY_ROTATION_0_3";
+        "THREEFRY_ROTATION_1_0";
+        "THREEFRY_ROTATION_1_1";
+        "THREEFRY_ROTATION_1_2";
+        "THREEFRY_ROTATION_1_3";
+      ] );
+    ( "arrayjit_threefry4x32_light",
+      {|uint4 arrayjit_threefry4x32_light(uint4 key, uint4 counter) {
     uint4 x = counter;
     uint4 k = key;
     
@@ -146,120 +155,148 @@ using namespace metal;|}, []);
     x.w += ks4 + 1;
     
     return x;
-}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1"; 
-      "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0"; 
-      "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]);
-
-  ("arrayjit_threefry4x32", {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {
+}|},
+      [
+        "THREEFRY_C240";
+        "threefry_round";
+        "THREEFRY_ROTATION_0_0";
+        "THREEFRY_ROTATION_0_1";
+        "THREEFRY_ROTATION_0_2";
+        "THREEFRY_ROTATION_0_3";
+        "THREEFRY_ROTATION_1_0";
+        "THREEFRY_ROTATION_1_1";
+        "THREEFRY_ROTATION_1_2";
+        "THREEFRY_ROTATION_1_3";
+      ] );
+    ( "arrayjit_threefry4x32",
+      {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) {
     /* Default to light version */
     return arrayjit_threefry4x32_light(key, counter);
-}|}, ["arrayjit_threefry4x32_light"]);
-
-  ("float4_t", {|struct float4_t { float4 v; };|}, []);
-  ("float2_t", {|struct float2_t { float2 v; };|}, []);
-  ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);
-  ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);
-  ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);
-  ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);
-  ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);
-  ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);
-  ("half8_t", {|struct half8_t { half v[8]; };|}, []);
-
-  ("uint32_to_single_uniform", {|inline float uint32_to_single_uniform(uint32_t x) {
+}|},
+      [ "arrayjit_threefry4x32_light" ] );
+    ("float4_t", {|struct float4_t { float4 v; };|}, []);
+    ("float2_t", {|struct float2_t { float2 v; };|}, []);
+    ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []);
+    ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []);
+    ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []);
+    ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []);
+    ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []);
+    ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []);
+    ("half8_t", {|struct half8_t { half v[8]; };|}, []);
+    ( "uint32_to_single_uniform",
+      {|inline float uint32_to_single_uniform(uint32_t x) {
     return (x >> 8) * (1.0f / 16777216.0f);
-}|}, []);
-
-  ("uint4x32_to_single_uniform", {|float uint4x32_to_single_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform",
+      {|float uint4x32_to_single_uniform(uint4 x) {
     return uint32_to_single_uniform(x.x);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform", {|float uint4x32_to_double_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform",
+      {|float uint4x32_to_double_uniform(uint4 x) {
     /* Fallback to float precision */
     uint64_t combined = (uint64_t(x.y) << 32) | x.x;
     return float(combined) * (1.0f / 18446744073709551616.0f);
-}|}, []);
-
-  ("uint4x32_to_int32_uniform", {|int32_t uint4x32_to_int32_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_int32_uniform",
+      {|int32_t uint4x32_to_int32_uniform(uint4 x) {
     return int32_t(x.x);
-}|}, []);
-
-  ("uint4x32_to_int64_uniform", {|int64_t uint4x32_to_int64_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_int64_uniform",
+      {|int64_t uint4x32_to_int64_uniform(uint4 x) {
     return int64_t((uint64_t(x.y) << 32) | x.x);
-}|}, []);
-
-  ("uint4x32_to_uint32_uniform", {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint32_uniform",
+      {|uint32_t uint4x32_to_uint32_uniform(uint4 x) {
     return x.x;
-}|}, []);
-
-  ("uint4x32_to_uint64_uniform", {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint64_uniform",
+      {|uint64_t uint4x32_to_uint64_uniform(uint4 x) {
     return (uint64_t(x.y) << 32) | x.x;
-}|}, []);
-
-  ("uint4x32_to_byte_uniform", {|int8_t uint4x32_to_byte_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_byte_uniform",
+      {|int8_t uint4x32_to_byte_uniform(uint4 x) {
     return int8_t(x.x & 0xFF);
-}|}, []);
-
-  ("uint4x32_to_uint16_uniform", {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint16_uniform",
+      {|uint16_t uint4x32_to_uint16_uniform(uint4 x) {
     return uint16_t(x.x & 0xFFFF);
-}|}, []);
-
-  ("uint4x32_to_bfloat16_uniform", {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) {
     float f = uint32_to_single_uniform(x.x);
     return uint16_t(as_type<uint32_t>(f) >> 16);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|half uint4x32_to_half_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|half uint4x32_to_half_uniform(uint4 x) {
     float f = uint32_to_single_uniform(x.x);
     return half(f);
-}|}, ["uint32_to_single_uniform"]);
-
-  ("uint4x32_to_fp8_uniform", {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {
+}|},
+      [ "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_fp8_uniform",
+      {|uint8_t uint4x32_to_fp8_uniform(uint4 x) {
     return uint8_t(x.x & 0xFF);
-}|}, []);
-
-  ("uint4x32_to_single_uniform_vec", {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_single_uniform_vec",
+      {|float4_t uint4x32_to_single_uniform_vec(uint4 x) {
     float4_t result;
     result.v.x = uint32_to_single_uniform(x.x);
     result.v.y = uint32_to_single_uniform(x.y);
     result.v.z = uint32_to_single_uniform(x.z);
     result.v.w = uint32_to_single_uniform(x.w);
     return result;
-}|}, ["float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {
+}|},
+      [ "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|float2_t uint4x32_to_double_uniform_vec(uint4 x) {
     float2_t result;
     uint64_t combined1 = (uint64_t(x.y) << 32) | x.x;
     uint64_t combined2 = (uint64_t(x.w) << 32) | x.z;
     result.v.x = float(combined1) * (1.0f / 18446744073709551616.0f);
     result.v.y = float(combined2) * (1.0f / 18446744073709551616.0f);
     return result;
-}|}, ["float2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {
+}|},
+      [ "float2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) {
     int32x4_t result;
     result.v = int4(x);
     return result;
-}|}, ["int32x4_t"]);
-
-  ("uint4x32_to_int64_uniform_vec", {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {
+}|},
+      [ "int32x4_t" ] );
+    ( "uint4x32_to_int64_uniform_vec",
+      {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) {
     int64x2_t result;
     result.v[0] = (int64_t(x.y) << 32) | x.x;
     result.v[1] = (int64_t(x.w) << 32) | x.z;
     return result;
-}|}, ["int64x2_t"]);
-
-  ("uint4x32_to_uint32_uniform_vec", {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {
+}|},
+      [ "int64x2_t" ] );
+    ( "uint4x32_to_uint32_uniform_vec",
+      {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) {
     return x;
-}|}, []);
-
-  ("uint4x32_to_uint64_uniform_vec", {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {
+}|},
+      [] );
+    ( "uint4x32_to_uint64_uniform_vec",
+      {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) {
     uint64x2_t result;
     result.v[0] = (uint64_t(x.y) << 32) | x.x;
     result.v[1] = (uint64_t(x.w) << 32) | x.z;
     return result;
-}|}, ["uint64x2_t"]);
-
-  ("uint4x32_to_byte_uniform_vec", {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {
+}|},
+      [ "uint64x2_t" ] );
+    ( "uint4x32_to_byte_uniform_vec",
+      {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) {
     int8x16_t result;
     uint4 v = x;
     for (int i = 0; i < 4; i++) {
@@ -270,9 +307,10 @@ using namespace metal;|}, []);
         result.v[i*4 + 3] = int8_t((val >> 24) & 0xFF);
     }
     return result;
-}|}, ["int8x16_t"]);
-
-  ("uint4x32_to_uint16_uniform_vec", {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {
+}|},
+      [ "int8x16_t" ] );
+    ( "uint4x32_to_uint16_uniform_vec",
+      {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) {
     uint16x8_t result;
     uint4 v = x;
     for (int i = 0; i < 4; i++) {
@@ -281,9 +319,10 @@ using namespace metal;|}, []);
         result.v[i*2 + 1] = uint16_t((val >> 16) & 0xFFFF);
     }
     return result;
-}|}, ["uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {
+}|},
+      [ "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) {
     uint16x8_t result;
     uint4 v = x;
     for (int i = 0; i < 4; i++) {
@@ -294,9 +333,10 @@ using namespace metal;|}, []);
         result.v[i*2 + 1] = uint16_t(as_type<uint32_t>(f2) >> 16);
     }
     return result;
-}|}, ["uint16x8_t"]);
-
-  ("uint4x32_to_half_uniform_vec", {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {
+}|},
+      [ "uint16x8_t" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|half8_t uint4x32_to_half_uniform_vec(uint4 x) {
     half8_t result;
     uint4 v = x;
     for (int i = 0; i < 4; i++) {
@@ -307,9 +347,10 @@ using namespace metal;|}, []);
         result.v[i*2 + 1] = half(f2);
     }
     return result;
-}|}, ["half8_t"]);
-
-  ("uint4x32_to_fp8_uniform_vec", {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {
+}|},
+      [ "half8_t" ] );
+    ( "uint4x32_to_fp8_uniform_vec",
+      {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) {
     uint8x16_t result;
     uint4 v = x;
     for (int i = 0; i < 4; i++) {
@@ -320,53 +361,65 @@ using namespace metal;|}, []);
         result.v[i*4 + 3] = uint8_t((val >> 24) & 0xFF);
     }
     return result;
-}|}, ["uint8x16_t"]);
-
-  ("single_to_uint4x32", {|uint4 single_to_uint4x32(float x) {
+}|},
+      [ "uint8x16_t" ] );
+    ( "single_to_uint4x32",
+      {|uint4 single_to_uint4x32(float x) {
     uint32_t bits = as_type<uint32_t>(x);
     return uint4(bits, 0, 0, 0);
-}|}, []);
-
-  ("double_to_uint4x32", {|uint4 double_to_uint4x32(float x) {
+}|},
+      [] );
+    ( "double_to_uint4x32",
+      {|uint4 double_to_uint4x32(float x) {
     /* Metal doesn't have native double support, use float fallback */
     uint32_t bits = as_type<uint32_t>(x);
     return uint4(bits, 0, 0, 0);
-}|}, []);
-
-  ("int32_to_uint4x32", {|uint4 int32_to_uint4x32(int32_t x) {
+}|},
+      [] );
+    ( "int32_to_uint4x32",
+      {|uint4 int32_to_uint4x32(int32_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("int64_to_uint4x32", {|uint4 int64_to_uint4x32(int64_t x) {
+}|},
+      [] );
+    ( "int64_to_uint4x32",
+      {|uint4 int64_to_uint4x32(int64_t x) {
     uint64_t bits = uint64_t(x);
     return uint4(uint32_t(bits & 0xFFFFFFFF), uint32_t(bits >> 32), 0, 0);
-}|}, []);
-
-  ("uint32_to_uint4x32", {|uint4 uint32_to_uint4x32(uint32_t x) {
+}|},
+      [] );
+    ( "uint32_to_uint4x32",
+      {|uint4 uint32_to_uint4x32(uint32_t x) {
     return uint4(x, 0, 0, 0);
-}|}, []);
-
-  ("uint64_to_uint4x32", {|uint4 uint64_to_uint4x32(uint64_t x) {
+}|},
+      [] );
+    ( "uint64_to_uint4x32",
+      {|uint4 uint64_to_uint4x32(uint64_t x) {
     return uint4(uint32_t(x & 0xFFFFFFFF), uint32_t(x >> 32), 0, 0);
-}|}, []);
-
-  ("byte_to_uint4x32", {|uint4 byte_to_uint4x32(int8_t x) {
+}|},
+      [] );
+    ( "byte_to_uint4x32",
+      {|uint4 byte_to_uint4x32(int8_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("uint16_to_uint4x32", {|uint4 uint16_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "uint16_to_uint4x32",
+      {|uint4 uint16_to_uint4x32(uint16_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("bfloat16_to_uint4x32", {|uint4 bfloat16_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "bfloat16_to_uint4x32",
+      {|uint4 bfloat16_to_uint4x32(uint16_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("half_to_uint4x32", {|uint4 half_to_uint4x32(uint16_t x) {
+}|},
+      [] );
+    ( "half_to_uint4x32",
+      {|uint4 half_to_uint4x32(uint16_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-
-  ("fp8_to_uint4x32", {|uint4 fp8_to_uint4x32(uint8_t x) {
+}|},
+      [] );
+    ( "fp8_to_uint4x32",
+      {|uint4 fp8_to_uint4x32(uint8_t x) {
     return uint4(uint32_t(x), 0, 0, 0);
-}|}, []);
-]
+}|},
+      [] );
+  ]
File "arrayjit/lib/builtins_cc.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/builtins_cc.ml b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml
index 6ec2a3c..b8df189 100644
--- a/_build/default/arrayjit/lib/builtins_cc.ml
+++ b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml
@@ -1,4 +1,5 @@
-let includes = {|
+let includes =
+  {|
 #include <stdio.h>
 #include <math.h>
 #include <stdint.h>
@@ -9,74 +10,84 @@ let includes = {|
 |}
 
 (* Each entry is (key, definition, dependencies) *)
-let builtins = [
-  (* Float16 feature detection and type definitions *)
-  ("HAS_NATIVE_FLOAT16", {|
+let builtins =
+  [
+    (* Float16 feature detection and type definitions *)
+    ( "HAS_NATIVE_FLOAT16",
+      {|
 #ifdef __FLT16_MAX__
   #define HAS_NATIVE_FLOAT16 1
 #else
   #define HAS_NATIVE_FLOAT16 0
 #endif
-|}, []);
-  
-  ("HALF_T", {|
+|},
+      [] );
+    ( "HALF_T",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define HALF_T _Float16
 #else
   #define HALF_T uint16_t
 #endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("HALF_TO_FP", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "HALF_TO_FP",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define HALF_TO_FP(x) (x)  /* Identity - already in floating point */
 #else
   #define HALF_TO_FP(x) half_to_float_emulated(x)  /* Convert to float for computation */
 #endif
-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);
-
-  ("FP_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );
+    ( "FP_TO_HALF",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define FP_TO_HALF(x) (x)  /* Identity - already half precision */
 #else
   #define FP_TO_HALF(x) float_to_half_emulated(x)  /* Convert back from float */
 #endif
-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);
-
-  ("HALF_TO_FLOAT", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );
+    ( "HALF_TO_FLOAT",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define HALF_TO_FLOAT(x) ((float)(x))
 #else
   #define HALF_TO_FLOAT(x) half_to_float_emulated(x)
 #endif
-|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]);
-
-  ("FLOAT_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] );
+    ( "FLOAT_TO_HALF",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define FLOAT_TO_HALF(x) ((_Float16)(x))
 #else
   #define FLOAT_TO_HALF(x) float_to_half_emulated(x)
 #endif
-|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]);
-
-  ("HALF_TO_UINT16", {|
+|},
+      [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] );
+    ( "HALF_TO_UINT16",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define HALF_TO_UINT16(x) ({ _Float16 _h = (x); uint16_t _r; memcpy(&_r, &_h, 2); _r; })
 #else
   #define HALF_TO_UINT16(x) (x)
 #endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("UINT16_TO_HALF", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "UINT16_TO_HALF",
+      {|
 #if HAS_NATIVE_FLOAT16
   #define UINT16_TO_HALF(x) ({ uint16_t _u = (x); _Float16 _h; memcpy(&_h, &_u, 2); _h; })
 #else
   #define UINT16_TO_HALF(x) (x)
 #endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  (* Float16 emulation functions *)
-  ("half_to_float_emulated", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    (* Float16 emulation functions *)
+    ( "half_to_float_emulated",
+      {|
 #if !HAS_NATIVE_FLOAT16
 /* Convert IEEE 754 half precision (stored as uint16_t) to float */
 static float half_to_float_emulated(uint16_t h) {
@@ -108,9 +119,10 @@ static float half_to_float_emulated(uint16_t h) {
     }
 }
 #endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  ("float_to_half_emulated", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    ( "float_to_half_emulated",
+      {|
 #if !HAS_NATIVE_FLOAT16
 /* Convert float to IEEE 754 half precision (stored as uint16_t) */
 static uint16_t float_to_half_emulated(float f) {
@@ -197,16 +209,16 @@ static uint16_t float_to_half_emulated(float f) {
     }
 }
 #endif
-|}, ["HAS_NATIVE_FLOAT16"]);
-
-  (* Threefry4x32 types and complete implementation *)
-  ("uint4x32_t", {|
+|},
+      [ "HAS_NATIVE_FLOAT16" ] );
+    (* Threefry4x32 types and complete implementation *)
+    ("uint4x32_t", {|
 typedef struct {
     uint32_t v[4];
 } uint4x32_t;
 |}, []);
-
-  ("threefry_common", {|
+    ( "threefry_common",
+      {|
 /* Threefry4x32 constants */
 const uint32_t THREEFRY_C240 = 0x1BD11BDA;
 
@@ -241,9 +253,10 @@ void threefry_round(uint32_t x[4], unsigned int r0, unsigned int r1, unsigned in
     x[1] = x[3];
     x[3] = tmp;
 }
-|}, ["uint4x32_t"]);
-
-  ("arrayjit_threefry4x32_crypto", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "arrayjit_threefry4x32_crypto",
+      {|
 
 /* Threefry4x32 implementation - 20 rounds (cryptographic version) */
 uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) {
@@ -332,9 +345,10 @@ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) {
     result.v[3] = x[3];
     return result;
 }
-|}, ["uint4x32_t"; "threefry_common"]);
-
-  ("arrayjit_threefry4x32_light", {|
+|},
+      [ "uint4x32_t"; "threefry_common" ] );
+    ( "arrayjit_threefry4x32_light",
+      {|
 /* Threefry4x32 implementation - 2 rounds (light version, as in JAX/XLA) */
 uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) {
     uint32_t x[4];
@@ -376,83 +390,81 @@ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) {
     result.v[3] = x[3];
     return result;
 }
-|}, ["uint4x32_t"; "threefry_common"]);
-
-  ("arrayjit_threefry4x32", {|
+|},
+      [ "uint4x32_t"; "threefry_common" ] );
+    ( "arrayjit_threefry4x32",
+      {|
 /* Default threefry4x32 function - will be configured at runtime */
 uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
     /* Default to light version */
     return arrayjit_threefry4x32_light(key, counter);
 }
-|}, ["uint4x32_t"; "arrayjit_threefry4x32_light"]);
-
-  (* Vector types with half precision *)
-  ("half8_t", {|
+|},
+      [ "uint4x32_t"; "arrayjit_threefry4x32_light" ] );
+    (* Vector types with half precision *)
+    ("half8_t", {|
 typedef struct { HALF_T v[8]; } half8_t;
-|}, ["HALF_T"]);
-
-  ("float4_t", {|
+|}, [ "HALF_T" ]);
+    ("float4_t", {|
 typedef struct { float v[4]; } float4_t;
 |}, []);
-
-  ("double2_t", {|
+    ("double2_t", {|
 typedef struct { double v[2]; } double2_t;
 |}, []);
-
-  ("int32x4_t", {|
+    ("int32x4_t", {|
 typedef struct { int32_t v[4]; } int32x4_t;
 |}, []);
-
-  ("int64x2_t", {|
+    ("int64x2_t", {|
 typedef struct { int64_t v[2]; } int64x2_t;
 |}, []);
-
-  ("int8x16_t", {|
+    ("int8x16_t", {|
 typedef struct { int8_t v[16]; } int8x16_t;
 |}, []);
-
-  ("uint16x8_t", {|
+    ("uint16x8_t", {|
 typedef struct { uint16_t v[8]; } uint16x8_t;
 |}, []);
-
-  ("uint8x16_t", {|
+    ("uint8x16_t", {|
 typedef struct { uint8_t v[16]; } uint8x16_t;
 |}, []);
-
-  (* Basic conversion functions *)
-  ("uint32_to_single_uniform", {|
+    (* Basic conversion functions *)
+    ( "uint32_to_single_uniform",
+      {|
 /* Convert to float in [0, 1) */
 float uint32_to_single_uniform(uint32_t x) {
     /* Use upper 24 bits for float mantissa (23 bits + implicit 1) */
     return (x >> 8) * (1.0f / 16777216.0f);
 }
-|}, []);
-
-  ("uint32_to_double_uniform", {|
+|},
+      [] );
+    ( "uint32_to_double_uniform",
+      {|
 /* Convert to double in [0, 1) */
 double uint32_to_double_uniform(uint32_t x) {
     return x * (1.0 / 4294967296.0);
 }
-|}, []);
-
-  (* Conversion functions with dependencies *)
-  ("uint4x32_to_single_uniform", {|
+|},
+      [] );
+    (* Conversion functions with dependencies *)
+    ( "uint4x32_to_single_uniform",
+      {|
 /* Uint4x32 to float32 uniform - uses first 32 bits */
 float uint4x32_to_single_uniform(uint4x32_t x) {
     return uint32_to_single_uniform(x.v[0]);
 }
-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_half_uniform", {|
+|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_half_uniform",
+      {|
 /* Uint4x32 to float16 uniform - uses first 16 bits */
 uint16_t uint4x32_to_half_uniform(uint4x32_t x) {
     /* Convert through float for consistent behavior */
     float f = (x.v[0] & 0xFFFF) * (1.0f / 65536.0f);
     return FLOAT_TO_HALF(f);
 }
-|}, ["uint4x32_t"; "FLOAT_TO_HALF"]);
-
-  ("uint4x32_to_half_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "FLOAT_TO_HALF" ] );
+    ( "uint4x32_to_half_uniform_vec",
+      {|
 /* Convert uint4x32 to 8 float16s uniform */
 half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
     half8_t result;
@@ -467,10 +479,11 @@ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "half8_t"; "FLOAT_TO_HALF"]);
-
-  (* Pure C conversion functions *)
-  ("bfloat16_to_single", {|
+|},
+      [ "uint4x32_t"; "half8_t"; "FLOAT_TO_HALF" ] );
+    (* Pure C conversion functions *)
+    ( "bfloat16_to_single",
+      {|
 /* BFloat16 to Float conversion (C function) */
 float bfloat16_to_single(uint16_t bf16)
 {
@@ -479,9 +492,10 @@ float bfloat16_to_single(uint16_t bf16)
   uint32_t f32 = ((uint32_t)bf16) << 16;
   return *((float *)&f32);
 }
-|}, []);
-
-  ("single_to_bfloat16", {|
+|},
+      [] );
+    ( "single_to_bfloat16",
+      {|
 /* Float to BFloat16 conversion (C function) */
 uint16_t single_to_bfloat16(float f)
 {
@@ -491,27 +505,30 @@ uint16_t single_to_bfloat16(float f)
   uint32_t rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);
   return (uint16_t)(rounded >> 16);
 }
-|}, []);
-
-  ("half_to_single", {|
+|},
+      [] );
+    ( "half_to_single",
+      {|
 /* Half (Float16) to Float conversion (C function) */
 float half_to_single(uint16_t h)
 {
   HALF_T half_val = UINT16_TO_HALF(h);
   return HALF_TO_FLOAT(half_val);
 }
-|}, ["HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT"]);
-
-  ("single_to_half", {|
+|},
+      [ "HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT" ] );
+    ( "single_to_half",
+      {|
 /* Float to Half (Float16) conversion (C function) */
 uint16_t single_to_half(float f)
 {
   HALF_T half_val = FLOAT_TO_HALF(f);
   return HALF_TO_UINT16(half_val);
 }
-|}, ["HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16"]);
-
-  ("fp8_to_single", {|
+|},
+      [ "HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16" ] );
+    ( "fp8_to_single",
+      {|
 /* FP8 E5M2 format to Float conversion (C function)
    Format: 1 sign bit, 5 exponent bits, 2 mantissa bits */
 float fp8_to_single(uint8_t fp8)
@@ -555,9 +572,10 @@ float fp8_to_single(uint8_t fp8)
 
   return result;
 }
-|}, []);
-
-  ("single_to_fp8", {|
+|},
+      [] );
+    ( "single_to_fp8",
+      {|
 /* Float to FP8 E5M2 conversion (C function) */
 uint8_t single_to_fp8(float f)
 {
@@ -615,143 +633,162 @@ uint8_t single_to_fp8(float f)
 
   return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));
 }
-|}, []);
-
-  (* Conversion functions from various precisions to uint4x32_t *)
-  ("int32_to_uint4x32", {|
+|},
+      [] );
+    (* Conversion functions from various precisions to uint4x32_t *)
+    ( "int32_to_uint4x32",
+      {|
 uint4x32_t int32_to_uint4x32(int32_t x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("int64_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "int64_to_uint4x32",
+      {|
 uint4x32_t int64_to_uint4x32(int64_t x) {
     uint64_t bits = (uint64_t)x;
     uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("uint32_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint32_to_uint4x32",
+      {|
 uint4x32_t uint32_to_uint4x32(uint32_t x) {
     uint4x32_t result = {{x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("uint64_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint64_to_uint4x32",
+      {|
 uint4x32_t uint64_to_uint4x32(uint64_t x) {
     uint4x32_t result = {{(uint32_t)(x & 0xFFFFFFFF), (uint32_t)(x >> 32), 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("single_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "single_to_uint4x32",
+      {|
 uint4x32_t single_to_uint4x32(float x) {
     uint32_t bits;
     memcpy(&bits, &x, sizeof(float));
     uint4x32_t result = {{bits, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("double_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "double_to_uint4x32",
+      {|
 uint4x32_t double_to_uint4x32(double x) {
     uint64_t bits;
     memcpy(&bits, &x, sizeof(double));
     uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("byte_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "byte_to_uint4x32",
+      {|
 uint4x32_t byte_to_uint4x32(unsigned char x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("uint16_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint16_to_uint4x32",
+      {|
 uint4x32_t uint16_to_uint4x32(uint16_t x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("bfloat16_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "bfloat16_to_uint4x32",
+      {|
 uint4x32_t bfloat16_to_uint4x32(uint16_t x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("half_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "half_to_uint4x32",
+      {|
 uint4x32_t half_to_uint4x32(uint16_t x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  ("fp8_to_uint4x32", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "fp8_to_uint4x32",
+      {|
 uint4x32_t fp8_to_uint4x32(uint8_t x) {
     uint4x32_t result = {{(uint32_t)x, 0, 0, 0}};
     return result;
 }
-|}, ["uint4x32_t"]);
-
-  (* More uint4x32 to various precision conversion functions *)
-  ("uint4x32_to_double_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    (* More uint4x32 to various precision conversion functions *)
+    ( "uint4x32_to_double_uniform",
+      {|
 /* Uint4x32 to float64 uniform - uses first 64 bits */
 double uint4x32_to_double_uniform(uint4x32_t x) {
     uint64_t combined = ((uint64_t)x.v[1] << 32) | x.v[0];
     return combined * (1.0 / 18446744073709551616.0);
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int32_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int32_uniform",
+      {|
 /* Uint4x32 to int32 uniform - full range */
 int32_t uint4x32_to_int32_uniform(uint4x32_t x) {
     return (int32_t)x.v[0];
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_int64_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_int64_uniform",
+      {|
 /* Uint4x32 to int64 uniform - full range */
 int64_t uint4x32_to_int64_uniform(uint4x32_t x) {
     return (int64_t)(((uint64_t)x.v[1] << 32) | x.v[0]);
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint32_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint32_uniform",
+      {|
 /* Uint4x32 to uint32 uniform - full range */
 uint32_t uint4x32_to_uint32_uniform(uint4x32_t x) {
     return x.v[0];
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint64_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint64_uniform",
+      {|
 /* Uint4x32 to uint64 uniform - full range */
 uint64_t uint4x32_to_uint64_uniform(uint4x32_t x) {
     return ((uint64_t)x.v[1] << 32) | x.v[0];
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_byte_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_byte_uniform",
+      {|
 /* Uint4x32 to int8 uniform - full range */
 int8_t uint4x32_to_byte_uniform(uint4x32_t x) {
     return (int8_t)(x.v[0] & 0xFF);
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_uint16_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_uint16_uniform",
+      {|
 /* Uint4x32 to uint16 uniform - full range */
 uint16_t uint4x32_to_uint16_uniform(uint4x32_t x) {
     return (uint16_t)(x.v[0] & 0xFFFF);
 }
-|}, ["uint4x32_t"]);
-
-  ("uint4x32_to_bfloat16_uniform", {|
+|},
+      [ "uint4x32_t" ] );
+    ( "uint4x32_to_bfloat16_uniform",
+      {|
 /* Uint4x32 to bfloat16 uniform - uses first 16 bits */
 uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
     /* Convert to float first, then to bfloat16 */
@@ -763,17 +800,19 @@ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
     if ((bits & 0x8000) && ((bits & 0x7FFF) || (bf & 1))) bf++;
     return bf;
 }
-|}, ["uint4x32_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_fp8_uniform", {|
+|},
+      [ "uint4x32_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_fp8_uniform",
+      {|
 /* Uint4x32 to fp8 uniform - uses first 8 bits */
 uint8_t uint4x32_to_fp8_uniform(uint4x32_t x) {
     return (uint8_t)(x.v[0] & 0xFF);
 }
-|}, ["uint4x32_t"]);
-
-  (* Vectorized conversion functions *)
-  ("uint4x32_to_single_uniform_vec", {|
+|},
+      [ "uint4x32_t" ] );
+    (* Vectorized conversion functions *)
+    ( "uint4x32_to_single_uniform_vec",
+      {|
 /* Convert uint4x32 to 4 floats in [0, 1) */
 float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
     float4_t result;
@@ -782,9 +821,10 @@ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]);
-
-  ("uint4x32_to_double_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] );
+    ( "uint4x32_to_double_uniform_vec",
+      {|
 /* Convert uint4x32 to 2 doubles in [0, 1) */
 double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
     double2_t result;
@@ -794,9 +834,10 @@ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) {
     result.v[1] = combined2 * (1.0 / 18446744073709551616.0);
     return result;
 }
-|}, ["uint4x32_t"; "double2_t"]);
-
-  ("uint4x32_to_int32_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "double2_t" ] );
+    ( "uint4x32_to_int32_uniform_vec",
+      {|
 /* Convert uint4x32 to 4 int32s - full range */
 int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
     int32x4_t result;
@@ -805,9 +846,10 @@ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "int32x4_t"]);
-
-  ("uint4x32_to_int64_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int32x4_t" ] );
+    ( "uint4x32_to_int64_uniform_vec",
+      {|
 /* Convert uint4x32 to 2 int64s - full range */
 int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {
     int64x2_t result;
@@ -815,9 +857,10 @@ int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) {
     result.v[1] = (int64_t)(((uint64_t)x.v[3] << 32) | x.v[2]);
     return result;
 }
-|}, ["uint4x32_t"; "int64x2_t"]);
-
-  ("uint4x32_to_byte_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int64x2_t" ] );
+    ( "uint4x32_to_byte_uniform_vec",
+      {|
 /* Convert uint4x32 to 16 int8s - full range */
 int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {
     int8x16_t result;
@@ -829,9 +872,10 @@ int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "int8x16_t"]);
-
-  ("uint4x32_to_uint16_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "int8x16_t" ] );
+    ( "uint4x32_to_uint16_uniform_vec",
+      {|
 /* Convert uint4x32 to 8 uint16s - full range */
 uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {
     uint16x8_t result;
@@ -841,9 +885,10 @@ uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_bfloat16_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_bfloat16_uniform_vec",
+      {|
 /* Convert uint4x32 to 8 bfloat16s uniform */
 uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
     uint16x8_t result;
@@ -865,9 +910,10 @@ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "uint16x8_t"]);
-
-  ("uint4x32_to_fp8_uniform_vec", {|
+|},
+      [ "uint4x32_t"; "uint16x8_t" ] );
+    ( "uint4x32_to_fp8_uniform_vec",
+      {|
 /* Convert uint4x32 to 16 fp8s uniform */
 uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {
     uint8x16_t result;
@@ -879,7 +925,8 @@ uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) {
     }
     return result;
 }
-|}, ["uint4x32_t"; "uint8x16_t"]);
-]
+|},
+      [ "uint4x32_t"; "uint8x16_t" ] );
+  ]
 
-let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)
\ No newline at end of file
+let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins)
File "arrayjit/lib/backend_impl.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/backend_impl.ml b/_build/default/arrayjit/lib/.formatted/backend_impl.ml
index 6d483ff..45e6bea 100644
--- a/_build/default/arrayjit/lib/backend_impl.ml
+++ b/_build/default/arrayjit/lib/.formatted/backend_impl.ml
@@ -50,13 +50,11 @@ module No_device_buffer_and_copying () :
     Stdlib.Gc.finalise finalize ptr;
     ptr
 
-  let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) :
-      buffer_ptr =
+  let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr =
     let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in
     alloc_impl ~size_in_bytes
 
-  let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) :
-      buffer_ptr =
+  let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr =
     let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in
     let ptr = alloc_impl ~size_in_bytes in
     (* Zero-initialize the allocated memory *)
File "lib/ppx_shared.ml", line 1, characters 0-0:
diff --git a/_build/default/lib/ppx_shared.ml b/_build/default/lib/.formatted/ppx_shared.ml
index 340047f..7210e1d 100644
--- a/_build/default/lib/ppx_shared.ml
+++ b/_build/default/lib/.formatted/ppx_shared.ml
@@ -167,9 +167,11 @@ let binary_ops =
       ("@-", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Min]));
       ("min", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Min]));
       ("^^^^", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto]));
-      ("threefry4x32_crypto", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto]));
+      ( "threefry4x32_crypto",
+        fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto]) );
       ("^^", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light]));
-      ("threefry4x32_light", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light]));
+      ( "threefry4x32_light",
+        fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light]) );
     ]
 
 (** Unary primitive ops. *)
File "arrayjit/lib/cc_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/cc_backend.ml b/_build/default/arrayjit/lib/.formatted/cc_backend.ml
index f5dfc7d..f3f3171 100644
--- a/_build/default/arrayjit/lib/cc_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/cc_backend.ml
@@ -220,13 +220,17 @@ struct
         | Ops.Uint4x32_prec _ ->
             let open PPrint in
             group (string "arrayjit_threefry4x32_crypto(" ^^ v1 ^^ string ", " ^^ v2 ^^ string ")")
-        | _ -> invalid_arg "CC_syntax_config.binop_syntax: Threefry4x32_crypto on non-uint4x32 precision")
+        | _ ->
+            invalid_arg
+              "CC_syntax_config.binop_syntax: Threefry4x32_crypto on non-uint4x32 precision")
     | Ops.Threefry4x32_light -> (
         match prec with
         | Ops.Uint4x32_prec _ ->
             let open PPrint in
             group (string "arrayjit_threefry4x32_light(" ^^ v1 ^^ string ", " ^^ v2 ^^ string ")")
-        | _ -> invalid_arg "CC_syntax_config.binop_syntax: Threefry4x32_light on non-uint4x32 precision")
+        | _ ->
+            invalid_arg
+              "CC_syntax_config.binop_syntax: Threefry4x32_light on non-uint4x32 precision")
     | _ -> (
         match prec with
         | Ops.Bfloat16_prec _ ->
@@ -312,10 +316,10 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)
   let idx_params = Indexing.bound_symbols bindings in
   let build_file = Utils.open_build_file ~base_name:name ~extension:".c" in
   let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-  let filtered_code = Syntax.filter_and_prepend_builtins 
-    ~includes:Builtins_cc.includes 
-    ~builtins:Builtins_cc.builtins 
-    ~proc_doc in
+  let filtered_code =
+    Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins
+      ~proc_doc
+  in
   (* Use ribbon = 1.0 for usual code formatting, width 110 *)
   Out_channel.output_string build_file.oc filtered_code;
   build_file.finalize ();
@@ -344,10 +348,10 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized opt
   in
   let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
   let combined_proc_doc = PPrint.separate PPrint.hardline all_proc_docs in
-  let filtered_code = Syntax.filter_and_prepend_builtins 
-    ~includes:Builtins_cc.includes 
-    ~builtins:Builtins_cc.builtins 
-    ~proc_doc:combined_proc_doc in
+  let filtered_code =
+    Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins
+      ~proc_doc:combined_proc_doc
+  in
   Out_channel.output_string build_file.oc filtered_code;
   build_file.finalize ();
   let result_library = c_compile_and_load ~f_path:build_file.f_path in
File "arrayjit/lib/ops.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/ops.ml b/_build/default/arrayjit/lib/.formatted/ops.ml
index 8b41458..2c24be3 100644
--- a/_build/default/arrayjit/lib/ops.ml
+++ b/_build/default/arrayjit/lib/.formatted/ops.ml
@@ -329,11 +329,11 @@ type binop =
   | Or
   | And
   | Threefry4x32_crypto
-      (** 4x32-bit Threefry PRNG, 20-round cryptographic version. Requires a 128-bit key and a 
+      (** 4x32-bit Threefry PRNG, 20-round cryptographic version. Requires a 128-bit key and a
           128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *)
   | Threefry4x32_light
-      (** 4x32-bit Threefry PRNG, 2-round light version (as in JAX/XLA). Requires a 128-bit key 
-          and a 128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *)
+      (** 4x32-bit Threefry PRNG, 2-round light version (as in JAX/XLA). Requires a 128-bit key and
+          a 128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *)
 [@@deriving sexp, compare, equal]
 
 type unop =
@@ -390,7 +390,9 @@ let neutral_elem = function
   | Min -> Float.infinity
   | And -> 1.
   | Or -> 0.
-  | Arg2 | Arg1 | Mod | Cmplt | Cmpeq | Cmpne | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) -> 0.
+  | Arg2 | Arg1 | Mod | Cmplt | Cmpeq | Cmpne | Threefry4x32_crypto
+  | Threefry4x32_light (* | Shl | Shr *) ->
+      0.
 
 let interpret_binop op v1 v2 =
   let open Float in
@@ -415,7 +417,7 @@ let interpret_binop op v1 v2 =
   (* | Shr -> v1 / (int_pow 2. @@ to_int v2) *)
   | Or -> if v1 <> 0. || v2 <> 0. then 1. else 0.
   | And -> if v1 <> 0. && v2 <> 0. then 1. else 0.
-  | Threefry4x32_crypto | Threefry4x32_light -> 
+  | Threefry4x32_crypto | Threefry4x32_light ->
       invalid_arg "Ops.interpret_binop: Threefry4x32 operations are outside the domain of float"
 
 let interpret_unop op v =
@@ -440,7 +442,8 @@ let interpret_unop op v =
   | Tanh_approx -> tanh v
   | Not -> if v = 0. then 1. else 0.
   | Uint4x32_to_prec_uniform1 ->
-      invalid_arg "Ops.interpret_unop: Uint4x32_to_prec_uniform1 argument outside the domain of float"
+      invalid_arg
+        "Ops.interpret_unop: Uint4x32_to_prec_uniform1 argument outside the domain of float"
 
 let interpret_ternop op v1 v2 v3 =
   let open Float in
@@ -541,11 +544,12 @@ let binop_c_syntax prec v =
   | Threefry4x32_crypto, _ ->
       (* This corresponds to the pure C implementation in builtins.c. *)
       ("arrayjit_threefry4x32_crypto(", ",", ")")
-  | Threefry4x32_light, _ ->
-      ("arrayjit_threefry4x32_light(", ",", ")")
+  | Threefry4x32_light, _ -> ("arrayjit_threefry4x32_light(", ",", ")")
 
 let is_assign_op = function
-  | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne -> false
+  | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne
+    ->
+      false
   | Add | Sub | Mul | Div | ToPowOf | Relu_gate | Satur01_gate | Arg2 | Max | Min | Or | And -> true
 
 let assign_op_cd_syntax ~initialize_neutral = function
@@ -572,7 +576,8 @@ let assign_op_cd_syntax ~initialize_neutral = function
   | Min -> "=@-"
   | Or -> "=||"
   | And -> "=&&"
-  | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne ->
+  | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne
+    ->
       invalid_arg "Ops.assign_op_cd_syntax: not an assignment op"
 
 (** Note: currently we do not support unary prefix symbols. *)
@@ -756,10 +761,12 @@ external copy_with_padding_c :
   axis_padding array ->
   unit = "arrayjit_copy_with_padding"
 
-external threefry4x32_crypto : int array -> int array -> int array = "arrayjit_threefry4x32_crypto_ocaml"
+external threefry4x32_crypto : int array -> int array -> int array
+  = "arrayjit_threefry4x32_crypto_ocaml"
 (** Threefry4x32 PRNG - 20 round cryptographic version *)
 
-external threefry4x32_light : int array -> int array -> int array = "arrayjit_threefry4x32_light_ocaml"
+external threefry4x32_light : int array -> int array -> int array
+  = "arrayjit_threefry4x32_light_ocaml"
 (** Threefry4x32 PRNG - 2 round light version *)
 
 external threefry4x32 : int array -> int array -> int array = "arrayjit_threefry4x32_ocaml"
File "arrayjit/lib/backends.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/backends.ml b/_build/default/arrayjit/lib/.formatted/backends.ml
index a6ab05e..166603a 100644
--- a/_build/default/arrayjit/lib/backends.ml
+++ b/_build/default/arrayjit/lib/.formatted/backends.ml
@@ -474,18 +474,17 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct
       [%log (key : Tnode.t)];
       let default () =
         let dims = Lazy.force key.dims in
-        (* Use alloc_array when zero initialization is not needed:
-           - When copying from host immediately after allocation
-           - When the node has explicit Zero_out operations in the lowered code *)
-        let will_copy_from_host = 
-          Utils.settings.automatic_host_transfers && Tn.known_constant key &&
-          match key.array with | (lazy (Some _)) -> true | _ -> false
+        (* Use alloc_array when zero initialization is not needed: - When copying from host
+           immediately after allocation - When the node has explicit Zero_out operations in the
+           lowered code *)
+        let will_copy_from_host =
+          Utils.settings.automatic_host_transfers && Tn.known_constant key
+          && match key.array with (lazy (Some _)) -> true | _ -> false
         in
-        let dst_ptr = 
+        let dst_ptr =
           if will_copy_from_host || node.Low_level.zero_initialized_by_code then
             alloc_array (Lazy.force key.prec) ~dims stream
-          else
-            alloc_zeros (Lazy.force key.prec) ~dims stream
+          else alloc_zeros (Lazy.force key.prec) ~dims stream
         in
         (if will_copy_from_host then
            match key.array with
File "arrayjit/lib/metal_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/metal_backend.ml b/_build/default/arrayjit/lib/.formatted/metal_backend.ml
index 7752213..f4b4bed 100644
--- a/_build/default/arrayjit/lib/metal_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/metal_backend.ml
@@ -547,8 +547,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
               raise
               @@ Utils.User_error
                    (Printf.sprintf
-                      "Metal backend: Threefry4x32_crypto requires target precision to be uint4x32, but \
-                       got %s"
+                      "Metal backend: Threefry4x32_crypto requires target precision to be \
+                       uint4x32, but got %s"
                       (Ops.prec_string prec)))
       | Threefry4x32_light, _ -> (
           (* Threefry4x32_light must output to uint4x32 precision *)
@@ -558,8 +558,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
               raise
               @@ Utils.User_error
                    (Printf.sprintf
-                      "Metal backend: Threefry4x32_light requires target precision to be uint4x32, but \
-                       got %s"
+                      "Metal backend: Threefry4x32_light requires target precision to be uint4x32, \
+                       but got %s"
                       (Ops.prec_string prec)))
       | Arg1, _ | Arg2, _ -> invalid_arg "Metal C_syntax_config: Arg1/Arg2 not operators"
 
@@ -675,8 +675,10 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
     let metal_includes = {|#include <metal_stdlib>
 using namespace metal;|} in
-    let source = Syntax.filter_and_prepend_builtins 
-      ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins
+        ~proc_doc
+    in
     {
       metal_source = source;
       compiled_code = Array.create ~len:num_devs None;
@@ -703,8 +705,10 @@ using namespace metal;|} in
     let final_doc = PPrint.(separate hardline all_proc_docs) in
     let metal_includes = {|#include <metal_stdlib>
 using namespace metal;|} in
-    let source = Syntax.filter_and_prepend_builtins 
-      ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc:final_doc in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins
+        ~proc_doc:final_doc
+    in
     let traced_stores = Array.map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store)) in
     let funcs = Array.map funcs_and_docs ~f:(Option.map ~f:fst) in
     {
Warning: Invalid documentation comment:
File "lib/operation.ml", line 407, characters 63-63:
End of text is not allowed in '[...]' (code).
File "lib/operation.ml", line 1, characters 0-0:
diff --git a/_build/default/lib/operation.ml b/_build/default/lib/.formatted/operation.ml
index c99465c..cd34f05 100644
--- a/_build/default/lib/operation.ml
+++ b/_build/default/lib/.formatted/operation.ml
@@ -375,13 +375,10 @@ let threefry4x32_light =
       result
 
 let threefry4x32 =
-  (* Select based on configuration *)
-  fun t1 t2 ->
-    let variant = Utils.settings.default_prng_variant in
-    if String.equal variant "crypto" then
-      threefry4x32_crypto t1 t2
-    else
-      threefry4x32_light t1 t2
+ (* Select based on configuration *)
+ fun t1 t2 ->
+  let variant = Utils.settings.default_prng_variant in
+  if String.equal variant "crypto" then threefry4x32_crypto t1 t2 else threefry4x32_light t1 t2
 
 let fma ?(label = []) ~grad_spec t1 t2 t3 =
   let module NTDSL = Initial_NTDSL in
File "arrayjit/lib/c_syntax.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/c_syntax.ml b/_build/default/arrayjit/lib/.formatted/c_syntax.ml
index ab0c03e..87b9a27 100644
--- a/_build/default/arrayjit/lib/c_syntax.ml
+++ b/_build/default/arrayjit/lib/.formatted/c_syntax.ml
@@ -240,13 +240,13 @@ module C_syntax (B : C_syntax_config) = struct
     let result_buffer = Buffer.create 4096 in
     Buffer.add_string result_buffer includes;
     Buffer.add_string result_buffer "\n";
-    
+
     (* Collect all needed keys, including dependencies *)
     let needed_keys = ref (Set.empty (module String)) in
     List.iter builtins ~f:(fun (key, _, _) ->
-      if String.is_substring doc_string ~substring:key then
-        needed_keys := Set.add !needed_keys key);
-    
+        if String.is_substring doc_string ~substring:key then
+          needed_keys := Set.add !needed_keys key);
+
     (* Add dependencies recursively *)
     let processed_keys = ref (Set.empty (module String)) in
     let rec add_dependencies key =
@@ -255,17 +255,15 @@ module C_syntax (B : C_syntax_config) = struct
         needed_keys := Set.add !needed_keys key;
         match List.find builtins ~f:(fun (k, _, _) -> String.equal k key) with
         | Some (_, _, deps) -> List.iter deps ~f:add_dependencies
-        | None -> ()
-      )
+        | None -> ())
     in
     Set.iter !needed_keys ~f:add_dependencies;
-    
+
     (* Add the builtins in order *)
     List.iter builtins ~f:(fun (key, definition, _) ->
-      if Set.mem !needed_keys key then (
-        Buffer.add_string result_buffer definition;
-        Buffer.add_string result_buffer "\n";
-      ));
+        if Set.mem !needed_keys key then (
+          Buffer.add_string result_buffer definition;
+          Buffer.add_string result_buffer "\n"));
     Buffer.add_string result_buffer doc_string;
     Buffer.contents result_buffer
 
@@ -582,13 +580,13 @@ module C_syntax (B : C_syntax_config) = struct
         let expr = group (B.binop_syntax prec op e1 e2) in
         (defs, expr)
     | Unop (op, v) ->
-        let arg_prec = 
+        let arg_prec =
           match op with
-          | Ops.Uint4x32_to_prec_uniform1 -> 
-            (* The argument to Uint4x32_to_prec_uniform1 must be evaluated with uint4x32 precision,
-               regardless of the target precision. This handles the case where the operation is
-               inlined as part of a scalar expression. *)
-            Ops.uint4x32
+          | Ops.Uint4x32_to_prec_uniform1 ->
+              (* The argument to Uint4x32_to_prec_uniform1 must be evaluated with uint4x32
+                 precision, regardless of the target precision. This handles the case where the
+                 operation is inlined as part of a scalar expression. *)
+              Ops.uint4x32
           | _ -> prec
         in
         let defs, expr_v = pp_scalar arg_prec v in
@@ -785,7 +783,9 @@ module C_syntax (B : C_syntax_config) = struct
                let ident_doc = string (get_ident tn) in
                let num_elems = Tn.num_elems tn in
                let size_doc = OCaml.int num_elems in
-               let init_doc = if node.Low_level.zero_initialized_by_code then string " = {0}" else empty in
+               let init_doc =
+                 if node.Low_level.zero_initialized_by_code then string " = {0}" else empty
+               in
                typ_doc ^^ space ^^ ident_doc ^^ brackets size_doc ^^ init_doc ^^ semi ^^ hardline
              else empty)
            (Hashtbl.to_alist traced_store)
File "arrayjit/lib/utils.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/utils.ml b/_build/default/arrayjit/lib/.formatted/utils.ml
index 3783503..9adf0a6 100644
--- a/_build/default/arrayjit/lib/utils.ml
+++ b/_build/default/arrayjit/lib/.formatted/utils.ml
@@ -50,8 +50,8 @@ type settings = {
             routine's context if the host array was not yet transfered since its creation or most
             recent modification. *)
   mutable default_prng_variant : string;
-      (** The default variant of threefry4x32 PRNG to use. Options: "crypto" (20 rounds) or "light" (2 rounds).
-          Defaults to "light" for better performance. *)
+      (** The default variant of threefry4x32 PRNG to use. Options: "crypto" (20 rounds) or "light"
+          (2 rounds). Defaults to "light" for better performance. *)
 }
 [@@deriving sexp]
 
@@ -465,8 +465,7 @@ let restore_settings () =
     @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0";
   settings.automatic_host_transfers <-
     get_global_flag ~default:true ~arg_name:"automatic_host_transfers";
-  settings.default_prng_variant <-
-    get_global_arg ~default:"light" ~arg_name:"default_prng_variant"
+  settings.default_prng_variant <- get_global_arg ~default:"light" ~arg_name:"default_prng_variant"
 
 let () = restore_settings ()
 let with_runtime_debug () = settings.output_debug_files_in_build_directory && settings.log_level > 1
File "arrayjit/lib/cuda_backend.ml", line 1, characters 0-0:
diff --git a/_build/default/arrayjit/lib/cuda_backend.ml b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml
index 25b5253..58be075 100644
--- a/_build/default/arrayjit/lib/cuda_backend.ml
+++ b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml
@@ -633,8 +633,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
               raise
               @@ Utils.User_error
                    (Printf.sprintf
-                      "CUDA backend: Threefry4x32_crypto requires target precision to be uint4x32, but \
-                       got %s"
+                      "CUDA backend: Threefry4x32_crypto requires target precision to be uint4x32, \
+                       but got %s"
                       (Ops.prec_string prec)))
       | Threefry4x32_light, _ -> (
           (* Threefry4x32_light must output to uint4x32 precision *)
@@ -644,8 +644,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
               raise
               @@ Utils.User_error
                    (Printf.sprintf
-                      "CUDA backend: Threefry4x32_light requires target precision to be uint4x32, but \
-                       got %s"
+                      "CUDA backend: Threefry4x32_light requires target precision to be uint4x32, \
+                       but got %s"
                       (Ops.prec_string prec)))
 
     let unop_syntax prec v =
@@ -780,7 +780,6 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       ^^ rparen ^^ semi
   end
 
-
   let%diagn2_sexp compile ~name bindings ({ Low_level.traced_store; _ } as lowered) =
     (* TODO: The following link seems to claim it's better to expand into loops than use memset.
        https://stackoverflow.com/questions/23712558/how-do-i-best-initialize-a-local-memory-array-to-0 *)
@@ -789,11 +788,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     end)) in
     let idx_params = Indexing.bound_symbols bindings in
     let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-    let cuda_includes = {|#include <cuda_fp16.h>
-#include <cuda_bf16.h>|} ^
-      (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc in
+    let cuda_includes =
+      {|#include <cuda_fp16.h>
+#include <cuda_bf16.h>|}
+      ^
+      if Utils.debug_log_from_routines () then
+        "\n__device__ int printf (const char * format, ... );"
+      else ""
+    in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins
+        ~proc_doc
+    in
     let ptx = cuda_to_ptx ~name source in
     { traced_store; ptx; params; bindings; name }
 
@@ -811,11 +817,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     in
     let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
     let final_doc = PPrint.(separate hardline all_proc_docs) in
-    let cuda_includes = {|#include <cuda_fp16.h>
-#include <cuda_bf16.h>|} ^
-      (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in
-    let source = Syntax.filter_and_prepend_builtins
-      ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc:final_doc in
+    let cuda_includes =
+      {|#include <cuda_fp16.h>
+#include <cuda_bf16.h>|}
+      ^
+      if Utils.debug_log_from_routines () then
+        "\n__device__ int printf (const char * format, ... );"
+      else ""
+    in
+    let source =
+      Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins
+        ~proc_doc:final_doc
+    in
 
     let name : string =
       String.(
dune build @fmt failed
"/usr/bin/env" "bash" "-c" "opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)" failed with exit status 2
2025-08-24 09:40.18: Job failed: Failed: Build failed