2025-08-24 09:40.15: New job: test ahrefs/ocannl https://github.com/ahrefs/ocannl.git#refs/heads/master (7fe3406b69dea29244d16ef354c1a477c1410682) (linux-x86_64:(lint-fmt)) Base: ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a ocamlformat version: version 0.27.0 (from opam) To reproduce locally: git clone --recursive "https://github.com/ahrefs/ocannl.git" -b "master" && cd "ocannl" && git reset --hard 7fe3406b cat > Dockerfile <<'END-OF-DOCKERFILE' FROM ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a USER 1000:1000 RUN cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u RUN opam depext -i dune WORKDIR /src RUN opam depext -i ocamlformat=0.27.0 COPY --chown=1000:1000 . /src/ RUN opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2) END-OF-DOCKERFILE docker build . END-REPRO-BLOCK 2025-08-24 09:40.15: Using cache hint "ahrefs/ocannl-ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a-debian-12-4.08_opam-2.4-ocamlformat-b8021439f8c57ba6435bc2263f6596671f4f4466" 2025-08-24 09:40.15: Using OBuilder spec: ((from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a) (user (uid 1000) (gid 1000)) (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u")) (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "opam depext -i dune")) (workdir /src) (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "opam depext -i ocamlformat=0.27.0")) (copy (src .) (dst /src/)) (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)")) ) 2025-08-24 09:40.15: Waiting for resource in pool OCluster 2025-08-24 09:40.15: Waiting for worker… 2025-08-24 09:40.15: Got resource from pool OCluster Building on eumache All commits already cached HEAD is now at 7fe3406b Merge pull request #368 from ahrefs/feature/tensor-rootness-check (from ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a) Unable to find image 'ocaml/opam:debian-12-ocaml-4.08@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a' locally docker.io/ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a: Pulling from ocaml/opam 2b437dab448b: Already exists 348b54b4f842: Already exists d67c1d522a4e: Already exists dea856d1a4d8: Already exists b580a9d7b5b1: Already exists Digest: sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a Status: Downloaded newer image for ocaml/opam@sha256:474656ea1593a299054f8966c700443fa0944c9534de3da94ca6dfab4a44c47a 2025-08-24 09:40.16 ---> using "1d0024db739bd078f91b2384c47919652a4b72a425e3e24ce24cfd1f6debdfbc" from cache /: (user (uid 1000) (gid 1000)) /: (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "cd ~/opam-repository && (git cat-file -e b8021439f8c57ba6435bc2263f6596671f4f4466 || git fetch origin master) && git reset -q --hard b8021439f8c57ba6435bc2263f6596671f4f4466 && git log --no-decorate -n1 --oneline && opam update -u")) b8021439f8 Merge pull request #28261 from kit-ty-kate/deploy-fix-25819 <><> Updating package repositories ><><><><><><><><><><><><><><><><><><><><><><> [default] Initialised default (at git+file:///home/opam/opam-repository): [INFO] opam 2.1 and 2.2 include many performance and security improvements over 2.0; please consider upgrading (https://opam.ocaml.org/doc/Install.html) Everything as up-to-date as possible (run with --verbose to show unavailable upgrades). However, you may "opam upgrade" these packages explicitly, which will ask permission to downgrade or uninstall the conflicting packages. Nothing to do. # Run eval $(opam env) to update the current shell environment 2025-08-24 09:40.16 ---> using "76d9d96bb26da3c78200d383fd35f876d80571baf05962331a1fff5f47db0e2e" from cache /: (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "opam depext -i dune")) # Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian # No extra OS packages requirements found. # All required OS packages found. # Now letting opam install the packages The following actions will be performed: - install dune 3.19.1 <><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><> [dune.3.19.1] found in cache <><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><> -> installed dune.3.19.1 Done. # Run eval $(opam env) to update the current shell environment 2025-08-24 09:40.16 ---> using "da0888a20a067de19f6183f0b497dcc2d1ea7f7036861cc50f633c662efdce4f" from cache /: (workdir /src) /src: (run (cache (opam-archives (target /home/opam/.opam/download-cache))) (network host) (shell "opam depext -i ocamlformat=0.27.0")) # Detecting depexts using vars: arch=x86_64, os=linux, os-distribution=debian, os-family=debian # No extra OS packages requirements found. # All required OS packages found. # Now letting opam install the packages The following actions will be performed: - install sexplib0 v0.14.0 [required by base] - install menhirLib 20240715 [required by ocamlformat-lib] - install menhirCST 20240715 [required by menhir] - install ocamlbuild 0.16.1 [required by fpath, astring, uuseg] - install cmdliner 1.3.0 [required by ocamlformat] - install menhirSdk 20240715 [required by ocamlformat-lib] - install either 1.0.0 [required by ocamlformat-lib] - install csexp 1.5.2 [required by ocamlformat] - install dune-build-info 3.19.1 [required by ocamlformat-lib] - install camlp-streams 5.0.1 [required by ocamlformat-lib] - install seq base [required by re] - install fix 20250428 [required by ocamlformat-lib] - install ocaml-version 4.0.1 [required by ocamlformat-lib] - install ocamlfind 1.9.8 [required by ocp-indent, astring, fpath, uuseg] - install menhir 20240715 [required by ocamlformat-lib] - install dune-configurator 3.19.1 [required by base] - install re 1.11.0 [required by ocamlformat] - install topkg 1.1.0 [required by fpath, astring, uuseg] - install base-bytes base [required by ocp-indent] - install base v0.14.3 [required by ocamlformat-lib] - install uutf 1.0.4 [required by ocamlformat-lib] - install astring 0.8.5 [required by ocamlformat-lib] - install ocp-indent 1.8.1 [required by ocamlformat-lib] - install stdio v0.14.0 [required by ocamlformat-lib] - install uucp 15.0.0 [required by uuseg] - install fpath 0.7.3 [required by ocamlformat-lib] - install uuseg 15.0.0 [required by ocamlformat-lib] - install ocamlformat-lib 0.27.0 [required by ocamlformat] - install ocamlformat 0.27.0 ===== 29 to install ===== <><> Gathering sources ><><><><><><><><><><><><><><><><><><><><><><><><><><><><> [astring.0.8.5] found in cache [base.v0.14.3] found in cache [camlp-streams.5.0.1] found in cache [cmdliner.1.3.0] found in cache [csexp.1.5.2] found in cache [dune-build-info.3.19.1] found in cache [dune-configurator.3.19.1] found in cache [either.1.0.0] found in cache [fix.20250428] found in cache [fpath.0.7.3] found in cache [menhir.20240715] found in cache [menhirCST.20240715] found in cache [menhirLib.20240715] found in cache [menhirSdk.20240715] found in cache [ocaml-version.4.0.1] found in cache [ocamlbuild.0.16.1] found in cache [ocamlfind.1.9.8] found in cache [ocamlformat.0.27.0] found in cache [ocamlformat-lib.0.27.0] found in cache [ocp-indent.1.8.1] found in cache [re.1.11.0] found in cache [sexplib0.v0.14.0] found in cache [stdio.v0.14.0] found in cache [topkg.1.1.0] found in cache [uucp.15.0.0] found in cache [uuseg.15.0.0] found in cache [uutf.1.0.4] found in cache <><> Processing actions <><><><><><><><><><><><><><><><><><><><><><><><><><><><> -> installed seq.base -> installed camlp-streams.5.0.1 -> installed csexp.1.5.2 -> installed either.1.0.0 -> installed cmdliner.1.3.0 -> installed fix.20250428 -> installed menhirCST.20240715 -> installed menhirLib.20240715 -> installed menhirSdk.20240715 -> installed ocaml-version.4.0.1 -> installed sexplib0.v0.14.0 -> installed re.1.11.0 -> installed dune-build-info.3.19.1 -> installed dune-configurator.3.19.1 -> installed ocamlfind.1.9.8 -> installed base-bytes.base -> installed ocp-indent.1.8.1 -> installed ocamlbuild.0.16.1 -> installed base.v0.14.3 -> installed stdio.v0.14.0 -> installed topkg.1.1.0 -> installed uutf.1.0.4 -> installed astring.0.8.5 -> installed fpath.0.7.3 -> installed menhir.20240715 -> installed uucp.15.0.0 -> installed uuseg.15.0.0 -> installed ocamlformat-lib.0.27.0 -> installed ocamlformat.0.27.0 Done. <><> ocp-indent.1.8.1 installed successfully ><><><><><><><><><><><><><><><><><> => This package requires additional configuration for use in editors. Install package 'user-setup', or manually: * for Emacs, add these lines to ~/.emacs: (add-to-list 'load-path "/home/opam/.opam/4.08/share/emacs/site-lisp") (require 'ocp-indent) * for Vim, add this line to ~/.vimrc: set rtp^="/home/opam/.opam/4.08/share/ocp-indent/vim" # Run eval $(opam env) to update the current shell environment 2025-08-24 09:40.16 ---> using "86ec8dcb8046a1e5dacfb1841e8c026d30cfead67649bcb6d6a8f9ddd6fb153d" from cache /src: (copy (src .) (dst /src/)) 2025-08-24 09:40.17 ---> saved as "a809cbb37111f2865c2f50946bfde16e234850e0c770765ec6b6e948e09e729f" /src: (run (shell "opam exec -- dune build @fmt --ignore-promoted-rules || (echo \"dune build @fmt failed\"; exit 2)")) File "arrayjit/lib/dune", line 7, characters 30-51: 7 | (libraries base stdio pprint ppx_minidebug.runtime) ^^^^^^^^^^^^^^^^^^^^^ Error: Library "ppx_minidebug.runtime" not found. -> required by library "arrayjit.utils" in _build/default/arrayjit/lib -> required by executable ocannl_read_config in test/operations/config/dune:5 -> required by _build/default/test/operations/config/ocannl_read_config.exe -> required by %{dep:ocannl_read_config.exe} at test/operations/dune:22 -> required by _build/default/test/operations/config/ocannl_backend.txt -> required by %{read:config/ocannl_backend.txt} at test/operations/dune:36 -> required by Computing directory contents of _build/default/test/operations File "arrayjit/lib/no_device_backend_missing.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/no_device_backend_missing.ml b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml index 0e04309..890e1c2 100644 --- a/_build/default/arrayjit/lib/no_device_backend_missing.ml +++ b/_build/default/arrayjit/lib/.formatted/no_device_backend_missing.ml @@ -35,12 +35,8 @@ struct let alloc_buffer ?old_buffer:_ ~size_in_bytes:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)" - let alloc_array _prec ~dims:_ () = - failwith @@ "Backend " ^ Config.name ^ " missing (no device)" - - let alloc_zeros _prec ~dims:_ () = - failwith @@ "Backend " ^ Config.name ^ " missing (no device)" - + let alloc_array _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)" + let alloc_zeros _prec ~dims:_ () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)" let free_buffer = None let get_used_memory () = failwith @@ "Backend " ^ Config.name ^ " missing (no device)" File "arrayjit/lib/builtins_cuda.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/builtins_cuda.ml b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml index c2a2e96..596c827 100644 --- a/_build/default/arrayjit/lib/builtins_cuda.ml +++ b/_build/default/arrayjit/lib/.formatted/builtins_cuda.ml @@ -1,103 +1,119 @@ (* CUDA builtin code split into (key, definition, dependencies) triples for filtering *) -let builtins = [ - ("uint4x32_t", {|typedef struct { +let builtins = + [ + ("uint4x32_t", {|typedef struct { unsigned int v[4]; } uint4x32_t;|}, []); - - ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []); - ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []); - ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []); - ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []); - ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []); - ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []); - ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []); - ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []); - - ("uint32_to_single_uniform", {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) { + ("float4_t", {|typedef struct { float v[4]; } float4_t;|}, []); + ("double2_t", {|typedef struct { double v[2]; } double2_t;|}, []); + ("int32x4_t", {|typedef struct { int v[4]; } int32x4_t;|}, []); + ("int64x2_t", {|typedef struct { long long v[2]; } int64x2_t;|}, []); + ("int8x16_t", {|typedef struct { signed char v[16]; } int8x16_t;|}, []); + ("uint16x8_t", {|typedef struct { unsigned short v[8]; } uint16x8_t;|}, []); + ("uint8x16_t", {|typedef struct { unsigned char v[16]; } uint8x16_t;|}, []); + ("half8_t", {|typedef struct { __half v[8]; } half8_t;|}, []); + ( "uint32_to_single_uniform", + {|__device__ __forceinline__ float uint32_to_single_uniform(unsigned int x) { /* Use __uint2float_rn for correct rounding */ return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f); -}|}, []); - - ("uint32_to_double_uniform", {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) { +}|}, + [] ); + ( "uint32_to_double_uniform", + {|__device__ __forceinline__ double uint32_to_double_uniform(unsigned int x) { return __uint2double_rn(x) * (1.0 / 4294967296.0); -}|}, []); - - ("uint4x32_to_single_uniform", {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) { +}|}, + [] ); + ( "uint4x32_to_single_uniform", + {|__device__ float uint4x32_to_single_uniform(uint4x32_t x) { return uint32_to_single_uniform(x.v[0]); -}|}, ["uint4x32_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_double_uniform", {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_double_uniform", + {|__device__ double uint4x32_to_double_uniform(uint4x32_t x) { unsigned long long combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0])); return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0); -}|}, ["uint4x32_t"]); - - ("uint4x32_to_int32_uniform", {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_int32_uniform", + {|__device__ int uint4x32_to_int32_uniform(uint4x32_t x) { return (int)x.v[0]; -}|}, ["uint4x32_t"]); - - ("uint4x32_to_i64_uniform", {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_i64_uniform", + {|__device__ long long uint4x32_to_i64_uniform(uint4x32_t x) { return __double_as_longlong(__hiloint2double(x.v[1], x.v[0])); -}|}, ["uint4x32_t"]); - - ("uint4x32_to_u32_uniform", {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_u32_uniform", + {|__device__ unsigned int uint4x32_to_u32_uniform(uint4x32_t x) { return x.v[0]; -}|}, ["uint4x32_t"]); - - ("uint4x32_to_u64_uniform", {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_u64_uniform", + {|__device__ unsigned long long uint4x32_to_u64_uniform(uint4x32_t x) { return (unsigned long long)__double_as_longlong(__hiloint2double(x.v[1], x.v[0])); -}|}, ["uint4x32_t"]); - - ("uint4x32_to_i8_uniform", {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_i8_uniform", + {|__device__ signed char uint4x32_to_i8_uniform(uint4x32_t x) { return (signed char)(x.v[0] & 0xFF); -}|}, ["uint4x32_t"]); - - ("uint4x32_to_u8_uniform", {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_u8_uniform", + {|__device__ unsigned char uint4x32_to_u8_uniform(uint4x32_t x) { return (unsigned char)(x.v[0] & 0xFF); -}|}, ["uint4x32_t"]); - - ("uint4x32_to_bfloat16_uniform", {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_bfloat16_uniform", + {|__device__ unsigned short uint4x32_to_bfloat16_uniform(uint4x32_t x) { float f = uint32_to_single_uniform(x.v[0]); return (unsigned short)(__float_as_uint(f) >> 16); -}|}, ["uint4x32_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_half_uniform", {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_half_uniform", + {|__device__ __half uint4x32_to_half_uniform(uint4x32_t x) { float f = uint32_to_single_uniform(x.v[0]); return __float2half(f); -}|}, ["uint4x32_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_single_uniform_vec", {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_single_uniform_vec", + {|__device__ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) { float4_t result; #pragma unroll for (int i = 0; i < 4; i++) { result.v[i] = uint32_to_single_uniform(x.v[i]); } return result; -}|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_double_uniform_vec", {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_double_uniform_vec", + {|__device__ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) { double2_t result; result.v[0] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[1], x.v[0]))) * (1.0 / 18446744073709551616.0); result.v[1] = __longlong_as_double(__double_as_longlong(__hiloint2double(x.v[3], x.v[2]))) * (1.0 / 18446744073709551616.0); return result; -}|}, ["uint4x32_t"; "double2_t"]); - - ("uint4x32_to_int32_uniform_vec", {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "double2_t" ] ); + ( "uint4x32_to_int32_uniform_vec", + {|__device__ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) { int32x4_t result; #pragma unroll for (int i = 0; i < 4; i++) { result.v[i] = (int)x.v[i]; } return result; -}|}, ["uint4x32_t"; "int32x4_t"]); - - ("uint4x32_to_i64_uniform_vec", {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "int32x4_t" ] ); + ( "uint4x32_to_i64_uniform_vec", + {|__device__ int64x2_t uint4x32_to_i64_uniform_vec(uint4x32_t x) { int64x2_t result; result.v[0] = __double_as_longlong(__hiloint2double(x.v[1], x.v[0])); result.v[1] = __double_as_longlong(__hiloint2double(x.v[3], x.v[2])); return result; -}|}, ["uint4x32_t"; "int64x2_t"]); - - ("uint4x32_to_i8_uniform_vec", {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "int64x2_t" ] ); + ( "uint4x32_to_i8_uniform_vec", + {|__device__ int8x16_t uint4x32_to_i8_uniform_vec(uint4x32_t x) { int8x16_t result; #pragma unroll for (int i = 0; i < 4; i++) { @@ -107,9 +123,10 @@ let builtins = [ result.v[i*4 + 3] = (signed char)((x.v[i] >> 24) & 0xFF); } return result; -}|}, ["uint4x32_t"; "int8x16_t"]); - - ("uint4x32_to_u16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "int8x16_t" ] ); + ( "uint4x32_to_u16_uniform_vec", + {|__device__ uint16x8_t uint4x32_to_u16_uniform_vec(uint4x32_t x) { uint16x8_t result; #pragma unroll for (int i = 0; i < 4; i++) { @@ -117,9 +134,10 @@ let builtins = [ result.v[i*2 + 1] = (unsigned short)((x.v[i] >> 16) & 0xFFFF); } return result; -}|}, ["uint4x32_t"; "uint16x8_t"]); - - ("uint4x32_to_bfloat16_uniform_vec", {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "uint16x8_t" ] ); + ( "uint4x32_to_bfloat16_uniform_vec", + {|__device__ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) { uint16x8_t result; #pragma unroll for (int i = 0; i < 4; i++) { @@ -130,9 +148,10 @@ let builtins = [ result.v[i*2 + 1] = (unsigned short)(__float_as_uint(f2) >> 16); } return result; -}|}, ["uint4x32_t"; "uint16x8_t"]); - - ("uint4x32_to_half_uniform_vec", {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "uint16x8_t" ] ); + ( "uint4x32_to_half_uniform_vec", + {|__device__ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) { half8_t result; #pragma unroll for (int i = 0; i < 4; i++) { @@ -142,9 +161,10 @@ let builtins = [ result.v[i*2 + 1] = __float2half(f2); } return result; -}|}, ["uint4x32_t"; "half8_t"]); - - ("uint4x32_to_u8_uniform_vec", {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) { +}|}, + [ "uint4x32_t"; "half8_t" ] ); + ( "uint4x32_to_u8_uniform_vec", + {|__device__ uint8x16_t uint4x32_to_u8_uniform_vec(uint4x32_t x) { uint8x16_t result; #pragma unroll for (int i = 0; i < 4; i++) { @@ -154,70 +174,81 @@ let builtins = [ result.v[i*4 + 3] = (unsigned char)((x.v[i] >> 24) & 0xFF); } return result; -}|}, ["uint4x32_t"; "uint8x16_t"]); - - ("single_to_uint4x32", {|__device__ uint4x32_t single_to_uint4x32(float x) { +}|}, + [ "uint4x32_t"; "uint8x16_t" ] ); + ( "single_to_uint4x32", + {|__device__ uint4x32_t single_to_uint4x32(float x) { unsigned int bits = __float_as_uint(x); uint4x32_t result = {{bits, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("double_to_uint4x32", {|__device__ uint4x32_t double_to_uint4x32(double x) { +}|}, + [ "uint4x32_t" ] ); + ( "double_to_uint4x32", + {|__device__ uint4x32_t double_to_uint4x32(double x) { unsigned long long bits = __double_as_longlong(x); uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("int32_to_uint4x32", {|__device__ uint4x32_t int32_to_uint4x32(int x) { +}|}, + [ "uint4x32_t" ] ); + ( "int32_to_uint4x32", + {|__device__ uint4x32_t int32_to_uint4x32(int x) { uint4x32_t result = {{(unsigned int)x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("int64_to_uint4x32", {|__device__ uint4x32_t int64_to_uint4x32(long long x) { +}|}, + [ "uint4x32_t" ] ); + ( "int64_to_uint4x32", + {|__device__ uint4x32_t int64_to_uint4x32(long long x) { unsigned long long bits = (unsigned long long)x; uint4x32_t result = {{(unsigned int)(bits & 0xFFFFFFFF), (unsigned int)(bits >> 32), 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("uint32_to_uint4x32", {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint32_to_uint4x32", + {|__device__ uint4x32_t uint32_to_uint4x32(unsigned int x) { uint4x32_t result = {{x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("uint64_to_uint4x32", {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint64_to_uint4x32", + {|__device__ uint4x32_t uint64_to_uint4x32(unsigned long long x) { uint4x32_t result = {{(unsigned int)(x & 0xFFFFFFFF), (unsigned int)(x >> 32), 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("byte_to_uint4x32", {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) { +}|}, + [ "uint4x32_t" ] ); + ( "byte_to_uint4x32", + {|__device__ uint4x32_t byte_to_uint4x32(unsigned char x) { uint4x32_t result = {{(unsigned int)x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("uint16_to_uint4x32", {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) { +}|}, + [ "uint4x32_t" ] ); + ( "uint16_to_uint4x32", + {|__device__ uint4x32_t uint16_to_uint4x32(unsigned short x) { uint4x32_t result = {{(unsigned int)x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("bfloat16_to_uint4x32", {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) { +}|}, + [ "uint4x32_t" ] ); + ( "bfloat16_to_uint4x32", + {|__device__ uint4x32_t bfloat16_to_uint4x32(unsigned short x) { uint4x32_t result = {{(unsigned int)x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("half_to_uint4x32", {|__device__ uint4x32_t half_to_uint4x32(__half x) { +}|}, + [ "uint4x32_t" ] ); + ( "half_to_uint4x32", + {|__device__ uint4x32_t half_to_uint4x32(__half x) { unsigned short bits = __half_as_ushort(x); uint4x32_t result = {{(unsigned int)bits, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("fp8_to_uint4x32", {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) { +}|}, + [ "uint4x32_t" ] ); + ( "fp8_to_uint4x32", + {|__device__ uint4x32_t fp8_to_uint4x32(unsigned char x) { uint4x32_t result = {{(unsigned int)x, 0, 0, 0}}; return result; -}|}, ["uint4x32_t"]); - - ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []); - - ("THREEFRY_ROTATION", {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = { +}|}, + [ "uint4x32_t" ] ); + ("THREEFRY_C240", {|__device__ __constant__ unsigned int THREEFRY_C240 = 0x1BD11BDA;|}, []); + ( "THREEFRY_ROTATION", + {|__device__ __constant__ unsigned int THREEFRY_ROTATION[8][4] = { {13, 15, 26, 6}, {17, 29, 16, 24}, {13, 15, 26, 6}, @@ -226,13 +257,15 @@ let builtins = [ {17, 29, 16, 24}, {13, 15, 26, 6}, {17, 29, 16, 24} -};|}, []); - - ("rotl32", {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) { +};|}, + [] ); + ( "rotl32", + {|__device__ __forceinline__ unsigned int rotl32(unsigned int x, unsigned int n) { return __funnelshift_l(x, x, n); -}|}, []); - - ("threefry_round", {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) { +}|}, + [] ); + ( "threefry_round", + {|__device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) { x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x; x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z; @@ -246,9 +279,10 @@ let builtins = [ tmp = x.y; x.y = x.w; x.w = tmp; -}|}, ["rotl32"]); - - ("arrayjit_threefry4x32_crypto", {|__device__ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) { +}|}, + [ "rotl32" ] ); + ( "arrayjit_threefry4x32_crypto", + {|__device__ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) { uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]); uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]); @@ -310,9 +344,10 @@ let builtins = [ result.v[2] = x.z; result.v[3] = x.w; return result; -}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]); - - ("arrayjit_threefry4x32_light", {|__device__ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) { +}|}, + [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] ); + ( "arrayjit_threefry4x32_light", + {|__device__ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) { uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]); uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]); @@ -343,10 +378,12 @@ let builtins = [ result.v[2] = x.z; result.v[3] = x.w; return result; -}|}, ["uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION"]); - - ("arrayjit_threefry4x32", {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) { +}|}, + [ "uint4x32_t"; "THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION" ] ); + ( "arrayjit_threefry4x32", + {|__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) { /* Default to light version */ return arrayjit_threefry4x32_light(key, counter); -}|}, ["uint4x32_t"; "arrayjit_threefry4x32_light"]); -] \ No newline at end of file +}|}, + [ "uint4x32_t"; "arrayjit_threefry4x32_light" ] ); + ] File "arrayjit/lib/builtins_metal.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/builtins_metal.ml b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml index b403f89..f77385e 100644 --- a/_build/default/arrayjit/lib/builtins_metal.ml +++ b/_build/default/arrayjit/lib/.formatted/builtins_metal.ml @@ -1,24 +1,22 @@ (* Metal builtin code split into (key, definition, dependencies) triples for filtering *) -let builtins = [ - ("METAL_HEADERS", {|#include <metal_stdlib> +let builtins = + [ + ("METAL_HEADERS", {|#include <metal_stdlib> using namespace metal;|}, []); - - ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []); - - ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []); - ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []); - ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []); - ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []); - ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []); - ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []); - ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []); - ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []); - - ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) { + ("THREEFRY_C240", {|constant uint32_t THREEFRY_C240 = 0x1BD11BDA;|}, []); + ("THREEFRY_ROTATION_0_0", {|constant uint THREEFRY_ROTATION_0_0 = 13;|}, []); + ("THREEFRY_ROTATION_0_1", {|constant uint THREEFRY_ROTATION_0_1 = 15;|}, []); + ("THREEFRY_ROTATION_0_2", {|constant uint THREEFRY_ROTATION_0_2 = 26;|}, []); + ("THREEFRY_ROTATION_0_3", {|constant uint THREEFRY_ROTATION_0_3 = 6;|}, []); + ("THREEFRY_ROTATION_1_0", {|constant uint THREEFRY_ROTATION_1_0 = 17;|}, []); + ("THREEFRY_ROTATION_1_1", {|constant uint THREEFRY_ROTATION_1_1 = 29;|}, []); + ("THREEFRY_ROTATION_1_2", {|constant uint THREEFRY_ROTATION_1_2 = 16;|}, []); + ("THREEFRY_ROTATION_1_3", {|constant uint THREEFRY_ROTATION_1_3 = 24;|}, []); + ("rotl32", {|inline uint32_t rotl32(uint32_t x, uint n) { return rotate(x, n); }|}, []); - - ("threefry_round", {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) { + ( "threefry_round", + {|inline void threefry_round(thread uint4 &x, uint r0, uint r1, uint r2, uint r3) { x.x += x.y; x.y = rotl32(x.y, r0); x.y ^= x.x; x.z += x.w; x.w = rotl32(x.w, r1); x.w ^= x.z; @@ -32,9 +30,10 @@ using namespace metal;|}, []); tmp = x.y; x.y = x.w; x.w = tmp; -}|}, ["rotl32"]); - - ("arrayjit_threefry4x32_crypto", {|uint4 arrayjit_threefry4x32_crypto(uint4 key, uint4 counter) { +}|}, + [ "rotl32" ] ); + ( "arrayjit_threefry4x32_crypto", + {|uint4 arrayjit_threefry4x32_crypto(uint4 key, uint4 counter) { uint4 x = counter; uint4 k = key; @@ -119,11 +118,21 @@ using namespace metal;|}, []); x.w += 5; return x; -}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1"; - "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0"; - "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]); - - ("arrayjit_threefry4x32_light", {|uint4 arrayjit_threefry4x32_light(uint4 key, uint4 counter) { +}|}, + [ + "THREEFRY_C240"; + "threefry_round"; + "THREEFRY_ROTATION_0_0"; + "THREEFRY_ROTATION_0_1"; + "THREEFRY_ROTATION_0_2"; + "THREEFRY_ROTATION_0_3"; + "THREEFRY_ROTATION_1_0"; + "THREEFRY_ROTATION_1_1"; + "THREEFRY_ROTATION_1_2"; + "THREEFRY_ROTATION_1_3"; + ] ); + ( "arrayjit_threefry4x32_light", + {|uint4 arrayjit_threefry4x32_light(uint4 key, uint4 counter) { uint4 x = counter; uint4 k = key; @@ -146,120 +155,148 @@ using namespace metal;|}, []); x.w += ks4 + 1; return x; -}|}, ["THREEFRY_C240"; "threefry_round"; "THREEFRY_ROTATION_0_0"; "THREEFRY_ROTATION_0_1"; - "THREEFRY_ROTATION_0_2"; "THREEFRY_ROTATION_0_3"; "THREEFRY_ROTATION_1_0"; - "THREEFRY_ROTATION_1_1"; "THREEFRY_ROTATION_1_2"; "THREEFRY_ROTATION_1_3"]); - - ("arrayjit_threefry4x32", {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) { +}|}, + [ + "THREEFRY_C240"; + "threefry_round"; + "THREEFRY_ROTATION_0_0"; + "THREEFRY_ROTATION_0_1"; + "THREEFRY_ROTATION_0_2"; + "THREEFRY_ROTATION_0_3"; + "THREEFRY_ROTATION_1_0"; + "THREEFRY_ROTATION_1_1"; + "THREEFRY_ROTATION_1_2"; + "THREEFRY_ROTATION_1_3"; + ] ); + ( "arrayjit_threefry4x32", + {|uint4 arrayjit_threefry4x32(uint4 key, uint4 counter) { /* Default to light version */ return arrayjit_threefry4x32_light(key, counter); -}|}, ["arrayjit_threefry4x32_light"]); - - ("float4_t", {|struct float4_t { float4 v; };|}, []); - ("float2_t", {|struct float2_t { float2 v; };|}, []); - ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []); - ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []); - ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []); - ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []); - ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []); - ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []); - ("half8_t", {|struct half8_t { half v[8]; };|}, []); - - ("uint32_to_single_uniform", {|inline float uint32_to_single_uniform(uint32_t x) { +}|}, + [ "arrayjit_threefry4x32_light" ] ); + ("float4_t", {|struct float4_t { float4 v; };|}, []); + ("float2_t", {|struct float2_t { float2 v; };|}, []); + ("int32x4_t", {|struct int32x4_t { int4 v; };|}, []); + ("int64x2_t", {|struct int64x2_t { int64_t v[2]; };|}, []); + ("uint64x2_t", {|struct uint64x2_t { uint64_t v[2]; };|}, []); + ("int8x16_t", {|struct int8x16_t { int8_t v[16]; };|}, []); + ("uint16x8_t", {|struct uint16x8_t { uint16_t v[8]; };|}, []); + ("uint8x16_t", {|struct uint8x16_t { uint8_t v[16]; };|}, []); + ("half8_t", {|struct half8_t { half v[8]; };|}, []); + ( "uint32_to_single_uniform", + {|inline float uint32_to_single_uniform(uint32_t x) { return (x >> 8) * (1.0f / 16777216.0f); -}|}, []); - - ("uint4x32_to_single_uniform", {|float uint4x32_to_single_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_single_uniform", + {|float uint4x32_to_single_uniform(uint4 x) { return uint32_to_single_uniform(x.x); -}|}, ["uint32_to_single_uniform"]); - - ("uint4x32_to_double_uniform", {|float uint4x32_to_double_uniform(uint4 x) { +}|}, + [ "uint32_to_single_uniform" ] ); + ( "uint4x32_to_double_uniform", + {|float uint4x32_to_double_uniform(uint4 x) { /* Fallback to float precision */ uint64_t combined = (uint64_t(x.y) << 32) | x.x; return float(combined) * (1.0f / 18446744073709551616.0f); -}|}, []); - - ("uint4x32_to_int32_uniform", {|int32_t uint4x32_to_int32_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_int32_uniform", + {|int32_t uint4x32_to_int32_uniform(uint4 x) { return int32_t(x.x); -}|}, []); - - ("uint4x32_to_int64_uniform", {|int64_t uint4x32_to_int64_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_int64_uniform", + {|int64_t uint4x32_to_int64_uniform(uint4 x) { return int64_t((uint64_t(x.y) << 32) | x.x); -}|}, []); - - ("uint4x32_to_uint32_uniform", {|uint32_t uint4x32_to_uint32_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_uint32_uniform", + {|uint32_t uint4x32_to_uint32_uniform(uint4 x) { return x.x; -}|}, []); - - ("uint4x32_to_uint64_uniform", {|uint64_t uint4x32_to_uint64_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_uint64_uniform", + {|uint64_t uint4x32_to_uint64_uniform(uint4 x) { return (uint64_t(x.y) << 32) | x.x; -}|}, []); - - ("uint4x32_to_byte_uniform", {|int8_t uint4x32_to_byte_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_byte_uniform", + {|int8_t uint4x32_to_byte_uniform(uint4 x) { return int8_t(x.x & 0xFF); -}|}, []); - - ("uint4x32_to_uint16_uniform", {|uint16_t uint4x32_to_uint16_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_uint16_uniform", + {|uint16_t uint4x32_to_uint16_uniform(uint4 x) { return uint16_t(x.x & 0xFFFF); -}|}, []); - - ("uint4x32_to_bfloat16_uniform", {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_bfloat16_uniform", + {|uint16_t uint4x32_to_bfloat16_uniform(uint4 x) { float f = uint32_to_single_uniform(x.x); return uint16_t(as_type<uint32_t>(f) >> 16); -}|}, ["uint32_to_single_uniform"]); - - ("uint4x32_to_half_uniform", {|half uint4x32_to_half_uniform(uint4 x) { +}|}, + [ "uint32_to_single_uniform" ] ); + ( "uint4x32_to_half_uniform", + {|half uint4x32_to_half_uniform(uint4 x) { float f = uint32_to_single_uniform(x.x); return half(f); -}|}, ["uint32_to_single_uniform"]); - - ("uint4x32_to_fp8_uniform", {|uint8_t uint4x32_to_fp8_uniform(uint4 x) { +}|}, + [ "uint32_to_single_uniform" ] ); + ( "uint4x32_to_fp8_uniform", + {|uint8_t uint4x32_to_fp8_uniform(uint4 x) { return uint8_t(x.x & 0xFF); -}|}, []); - - ("uint4x32_to_single_uniform_vec", {|float4_t uint4x32_to_single_uniform_vec(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_single_uniform_vec", + {|float4_t uint4x32_to_single_uniform_vec(uint4 x) { float4_t result; result.v.x = uint32_to_single_uniform(x.x); result.v.y = uint32_to_single_uniform(x.y); result.v.z = uint32_to_single_uniform(x.z); result.v.w = uint32_to_single_uniform(x.w); return result; -}|}, ["float4_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_double_uniform_vec", {|float2_t uint4x32_to_double_uniform_vec(uint4 x) { +}|}, + [ "float4_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_double_uniform_vec", + {|float2_t uint4x32_to_double_uniform_vec(uint4 x) { float2_t result; uint64_t combined1 = (uint64_t(x.y) << 32) | x.x; uint64_t combined2 = (uint64_t(x.w) << 32) | x.z; result.v.x = float(combined1) * (1.0f / 18446744073709551616.0f); result.v.y = float(combined2) * (1.0f / 18446744073709551616.0f); return result; -}|}, ["float2_t"]); - - ("uint4x32_to_int32_uniform_vec", {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) { +}|}, + [ "float2_t" ] ); + ( "uint4x32_to_int32_uniform_vec", + {|int32x4_t uint4x32_to_int32_uniform_vec(uint4 x) { int32x4_t result; result.v = int4(x); return result; -}|}, ["int32x4_t"]); - - ("uint4x32_to_int64_uniform_vec", {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) { +}|}, + [ "int32x4_t" ] ); + ( "uint4x32_to_int64_uniform_vec", + {|int64x2_t uint4x32_to_int64_uniform_vec(uint4 x) { int64x2_t result; result.v[0] = (int64_t(x.y) << 32) | x.x; result.v[1] = (int64_t(x.w) << 32) | x.z; return result; -}|}, ["int64x2_t"]); - - ("uint4x32_to_uint32_uniform_vec", {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) { +}|}, + [ "int64x2_t" ] ); + ( "uint4x32_to_uint32_uniform_vec", + {|uint4 uint4x32_to_uint32_uniform_vec(uint4 x) { return x; -}|}, []); - - ("uint4x32_to_uint64_uniform_vec", {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) { +}|}, + [] ); + ( "uint4x32_to_uint64_uniform_vec", + {|uint64x2_t uint4x32_to_uint64_uniform_vec(uint4 x) { uint64x2_t result; result.v[0] = (uint64_t(x.y) << 32) | x.x; result.v[1] = (uint64_t(x.w) << 32) | x.z; return result; -}|}, ["uint64x2_t"]); - - ("uint4x32_to_byte_uniform_vec", {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) { +}|}, + [ "uint64x2_t" ] ); + ( "uint4x32_to_byte_uniform_vec", + {|int8x16_t uint4x32_to_byte_uniform_vec(uint4 x) { int8x16_t result; uint4 v = x; for (int i = 0; i < 4; i++) { @@ -270,9 +307,10 @@ using namespace metal;|}, []); result.v[i*4 + 3] = int8_t((val >> 24) & 0xFF); } return result; -}|}, ["int8x16_t"]); - - ("uint4x32_to_uint16_uniform_vec", {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) { +}|}, + [ "int8x16_t" ] ); + ( "uint4x32_to_uint16_uniform_vec", + {|uint16x8_t uint4x32_to_uint16_uniform_vec(uint4 x) { uint16x8_t result; uint4 v = x; for (int i = 0; i < 4; i++) { @@ -281,9 +319,10 @@ using namespace metal;|}, []); result.v[i*2 + 1] = uint16_t((val >> 16) & 0xFFFF); } return result; -}|}, ["uint16x8_t"]); - - ("uint4x32_to_bfloat16_uniform_vec", {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) { +}|}, + [ "uint16x8_t" ] ); + ( "uint4x32_to_bfloat16_uniform_vec", + {|uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4 x) { uint16x8_t result; uint4 v = x; for (int i = 0; i < 4; i++) { @@ -294,9 +333,10 @@ using namespace metal;|}, []); result.v[i*2 + 1] = uint16_t(as_type<uint32_t>(f2) >> 16); } return result; -}|}, ["uint16x8_t"]); - - ("uint4x32_to_half_uniform_vec", {|half8_t uint4x32_to_half_uniform_vec(uint4 x) { +}|}, + [ "uint16x8_t" ] ); + ( "uint4x32_to_half_uniform_vec", + {|half8_t uint4x32_to_half_uniform_vec(uint4 x) { half8_t result; uint4 v = x; for (int i = 0; i < 4; i++) { @@ -307,9 +347,10 @@ using namespace metal;|}, []); result.v[i*2 + 1] = half(f2); } return result; -}|}, ["half8_t"]); - - ("uint4x32_to_fp8_uniform_vec", {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) { +}|}, + [ "half8_t" ] ); + ( "uint4x32_to_fp8_uniform_vec", + {|uint8x16_t uint4x32_to_fp8_uniform_vec(uint4 x) { uint8x16_t result; uint4 v = x; for (int i = 0; i < 4; i++) { @@ -320,53 +361,65 @@ using namespace metal;|}, []); result.v[i*4 + 3] = uint8_t((val >> 24) & 0xFF); } return result; -}|}, ["uint8x16_t"]); - - ("single_to_uint4x32", {|uint4 single_to_uint4x32(float x) { +}|}, + [ "uint8x16_t" ] ); + ( "single_to_uint4x32", + {|uint4 single_to_uint4x32(float x) { uint32_t bits = as_type<uint32_t>(x); return uint4(bits, 0, 0, 0); -}|}, []); - - ("double_to_uint4x32", {|uint4 double_to_uint4x32(float x) { +}|}, + [] ); + ( "double_to_uint4x32", + {|uint4 double_to_uint4x32(float x) { /* Metal doesn't have native double support, use float fallback */ uint32_t bits = as_type<uint32_t>(x); return uint4(bits, 0, 0, 0); -}|}, []); - - ("int32_to_uint4x32", {|uint4 int32_to_uint4x32(int32_t x) { +}|}, + [] ); + ( "int32_to_uint4x32", + {|uint4 int32_to_uint4x32(int32_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); - - ("int64_to_uint4x32", {|uint4 int64_to_uint4x32(int64_t x) { +}|}, + [] ); + ( "int64_to_uint4x32", + {|uint4 int64_to_uint4x32(int64_t x) { uint64_t bits = uint64_t(x); return uint4(uint32_t(bits & 0xFFFFFFFF), uint32_t(bits >> 32), 0, 0); -}|}, []); - - ("uint32_to_uint4x32", {|uint4 uint32_to_uint4x32(uint32_t x) { +}|}, + [] ); + ( "uint32_to_uint4x32", + {|uint4 uint32_to_uint4x32(uint32_t x) { return uint4(x, 0, 0, 0); -}|}, []); - - ("uint64_to_uint4x32", {|uint4 uint64_to_uint4x32(uint64_t x) { +}|}, + [] ); + ( "uint64_to_uint4x32", + {|uint4 uint64_to_uint4x32(uint64_t x) { return uint4(uint32_t(x & 0xFFFFFFFF), uint32_t(x >> 32), 0, 0); -}|}, []); - - ("byte_to_uint4x32", {|uint4 byte_to_uint4x32(int8_t x) { +}|}, + [] ); + ( "byte_to_uint4x32", + {|uint4 byte_to_uint4x32(int8_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); - - ("uint16_to_uint4x32", {|uint4 uint16_to_uint4x32(uint16_t x) { +}|}, + [] ); + ( "uint16_to_uint4x32", + {|uint4 uint16_to_uint4x32(uint16_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); - - ("bfloat16_to_uint4x32", {|uint4 bfloat16_to_uint4x32(uint16_t x) { +}|}, + [] ); + ( "bfloat16_to_uint4x32", + {|uint4 bfloat16_to_uint4x32(uint16_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); - - ("half_to_uint4x32", {|uint4 half_to_uint4x32(uint16_t x) { +}|}, + [] ); + ( "half_to_uint4x32", + {|uint4 half_to_uint4x32(uint16_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); - - ("fp8_to_uint4x32", {|uint4 fp8_to_uint4x32(uint8_t x) { +}|}, + [] ); + ( "fp8_to_uint4x32", + {|uint4 fp8_to_uint4x32(uint8_t x) { return uint4(uint32_t(x), 0, 0, 0); -}|}, []); -] +}|}, + [] ); + ] File "arrayjit/lib/builtins_cc.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/builtins_cc.ml b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml index 6ec2a3c..b8df189 100644 --- a/_build/default/arrayjit/lib/builtins_cc.ml +++ b/_build/default/arrayjit/lib/.formatted/builtins_cc.ml @@ -1,4 +1,5 @@ -let includes = {| +let includes = + {| #include <stdio.h> #include <math.h> #include <stdint.h> @@ -9,74 +10,84 @@ let includes = {| |} (* Each entry is (key, definition, dependencies) *) -let builtins = [ - (* Float16 feature detection and type definitions *) - ("HAS_NATIVE_FLOAT16", {| +let builtins = + [ + (* Float16 feature detection and type definitions *) + ( "HAS_NATIVE_FLOAT16", + {| #ifdef __FLT16_MAX__ #define HAS_NATIVE_FLOAT16 1 #else #define HAS_NATIVE_FLOAT16 0 #endif -|}, []); - - ("HALF_T", {| +|}, + [] ); + ( "HALF_T", + {| #if HAS_NATIVE_FLOAT16 #define HALF_T _Float16 #else #define HALF_T uint16_t #endif -|}, ["HAS_NATIVE_FLOAT16"]); - - ("HALF_TO_FP", {| +|}, + [ "HAS_NATIVE_FLOAT16" ] ); + ( "HALF_TO_FP", + {| #if HAS_NATIVE_FLOAT16 #define HALF_TO_FP(x) (x) /* Identity - already in floating point */ #else #define HALF_TO_FP(x) half_to_float_emulated(x) /* Convert to float for computation */ #endif -|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]); - - ("FP_TO_HALF", {| +|}, + [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] ); + ( "FP_TO_HALF", + {| #if HAS_NATIVE_FLOAT16 #define FP_TO_HALF(x) (x) /* Identity - already half precision */ #else #define FP_TO_HALF(x) float_to_half_emulated(x) /* Convert back from float */ #endif -|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]); - - ("HALF_TO_FLOAT", {| +|}, + [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] ); + ( "HALF_TO_FLOAT", + {| #if HAS_NATIVE_FLOAT16 #define HALF_TO_FLOAT(x) ((float)(x)) #else #define HALF_TO_FLOAT(x) half_to_float_emulated(x) #endif -|}, ["HAS_NATIVE_FLOAT16"; "half_to_float_emulated"]); - - ("FLOAT_TO_HALF", {| +|}, + [ "HAS_NATIVE_FLOAT16"; "half_to_float_emulated" ] ); + ( "FLOAT_TO_HALF", + {| #if HAS_NATIVE_FLOAT16 #define FLOAT_TO_HALF(x) ((_Float16)(x)) #else #define FLOAT_TO_HALF(x) float_to_half_emulated(x) #endif -|}, ["HAS_NATIVE_FLOAT16"; "float_to_half_emulated"]); - - ("HALF_TO_UINT16", {| +|}, + [ "HAS_NATIVE_FLOAT16"; "float_to_half_emulated" ] ); + ( "HALF_TO_UINT16", + {| #if HAS_NATIVE_FLOAT16 #define HALF_TO_UINT16(x) ({ _Float16 _h = (x); uint16_t _r; memcpy(&_r, &_h, 2); _r; }) #else #define HALF_TO_UINT16(x) (x) #endif -|}, ["HAS_NATIVE_FLOAT16"]); - - ("UINT16_TO_HALF", {| +|}, + [ "HAS_NATIVE_FLOAT16" ] ); + ( "UINT16_TO_HALF", + {| #if HAS_NATIVE_FLOAT16 #define UINT16_TO_HALF(x) ({ uint16_t _u = (x); _Float16 _h; memcpy(&_h, &_u, 2); _h; }) #else #define UINT16_TO_HALF(x) (x) #endif -|}, ["HAS_NATIVE_FLOAT16"]); - - (* Float16 emulation functions *) - ("half_to_float_emulated", {| +|}, + [ "HAS_NATIVE_FLOAT16" ] ); + (* Float16 emulation functions *) + ( "half_to_float_emulated", + {| #if !HAS_NATIVE_FLOAT16 /* Convert IEEE 754 half precision (stored as uint16_t) to float */ static float half_to_float_emulated(uint16_t h) { @@ -108,9 +119,10 @@ static float half_to_float_emulated(uint16_t h) { } } #endif -|}, ["HAS_NATIVE_FLOAT16"]); - - ("float_to_half_emulated", {| +|}, + [ "HAS_NATIVE_FLOAT16" ] ); + ( "float_to_half_emulated", + {| #if !HAS_NATIVE_FLOAT16 /* Convert float to IEEE 754 half precision (stored as uint16_t) */ static uint16_t float_to_half_emulated(float f) { @@ -197,16 +209,16 @@ static uint16_t float_to_half_emulated(float f) { } } #endif -|}, ["HAS_NATIVE_FLOAT16"]); - - (* Threefry4x32 types and complete implementation *) - ("uint4x32_t", {| +|}, + [ "HAS_NATIVE_FLOAT16" ] ); + (* Threefry4x32 types and complete implementation *) + ("uint4x32_t", {| typedef struct { uint32_t v[4]; } uint4x32_t; |}, []); - - ("threefry_common", {| + ( "threefry_common", + {| /* Threefry4x32 constants */ const uint32_t THREEFRY_C240 = 0x1BD11BDA; @@ -241,9 +253,10 @@ void threefry_round(uint32_t x[4], unsigned int r0, unsigned int r1, unsigned in x[1] = x[3]; x[3] = tmp; } -|}, ["uint4x32_t"]); - - ("arrayjit_threefry4x32_crypto", {| +|}, + [ "uint4x32_t" ] ); + ( "arrayjit_threefry4x32_crypto", + {| /* Threefry4x32 implementation - 20 rounds (cryptographic version) */ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) { @@ -332,9 +345,10 @@ uint4x32_t arrayjit_threefry4x32_crypto(uint4x32_t key, uint4x32_t counter) { result.v[3] = x[3]; return result; } -|}, ["uint4x32_t"; "threefry_common"]); - - ("arrayjit_threefry4x32_light", {| +|}, + [ "uint4x32_t"; "threefry_common" ] ); + ( "arrayjit_threefry4x32_light", + {| /* Threefry4x32 implementation - 2 rounds (light version, as in JAX/XLA) */ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) { uint32_t x[4]; @@ -376,83 +390,81 @@ uint4x32_t arrayjit_threefry4x32_light(uint4x32_t key, uint4x32_t counter) { result.v[3] = x[3]; return result; } -|}, ["uint4x32_t"; "threefry_common"]); - - ("arrayjit_threefry4x32", {| +|}, + [ "uint4x32_t"; "threefry_common" ] ); + ( "arrayjit_threefry4x32", + {| /* Default threefry4x32 function - will be configured at runtime */ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) { /* Default to light version */ return arrayjit_threefry4x32_light(key, counter); } -|}, ["uint4x32_t"; "arrayjit_threefry4x32_light"]); - - (* Vector types with half precision *) - ("half8_t", {| +|}, + [ "uint4x32_t"; "arrayjit_threefry4x32_light" ] ); + (* Vector types with half precision *) + ("half8_t", {| typedef struct { HALF_T v[8]; } half8_t; -|}, ["HALF_T"]); - - ("float4_t", {| +|}, [ "HALF_T" ]); + ("float4_t", {| typedef struct { float v[4]; } float4_t; |}, []); - - ("double2_t", {| + ("double2_t", {| typedef struct { double v[2]; } double2_t; |}, []); - - ("int32x4_t", {| + ("int32x4_t", {| typedef struct { int32_t v[4]; } int32x4_t; |}, []); - - ("int64x2_t", {| + ("int64x2_t", {| typedef struct { int64_t v[2]; } int64x2_t; |}, []); - - ("int8x16_t", {| + ("int8x16_t", {| typedef struct { int8_t v[16]; } int8x16_t; |}, []); - - ("uint16x8_t", {| + ("uint16x8_t", {| typedef struct { uint16_t v[8]; } uint16x8_t; |}, []); - - ("uint8x16_t", {| + ("uint8x16_t", {| typedef struct { uint8_t v[16]; } uint8x16_t; |}, []); - - (* Basic conversion functions *) - ("uint32_to_single_uniform", {| + (* Basic conversion functions *) + ( "uint32_to_single_uniform", + {| /* Convert to float in [0, 1) */ float uint32_to_single_uniform(uint32_t x) { /* Use upper 24 bits for float mantissa (23 bits + implicit 1) */ return (x >> 8) * (1.0f / 16777216.0f); } -|}, []); - - ("uint32_to_double_uniform", {| +|}, + [] ); + ( "uint32_to_double_uniform", + {| /* Convert to double in [0, 1) */ double uint32_to_double_uniform(uint32_t x) { return x * (1.0 / 4294967296.0); } -|}, []); - - (* Conversion functions with dependencies *) - ("uint4x32_to_single_uniform", {| +|}, + [] ); + (* Conversion functions with dependencies *) + ( "uint4x32_to_single_uniform", + {| /* Uint4x32 to float32 uniform - uses first 32 bits */ float uint4x32_to_single_uniform(uint4x32_t x) { return uint32_to_single_uniform(x.v[0]); } -|}, ["uint4x32_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_half_uniform", {| +|}, + [ "uint4x32_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_half_uniform", + {| /* Uint4x32 to float16 uniform - uses first 16 bits */ uint16_t uint4x32_to_half_uniform(uint4x32_t x) { /* Convert through float for consistent behavior */ float f = (x.v[0] & 0xFFFF) * (1.0f / 65536.0f); return FLOAT_TO_HALF(f); } -|}, ["uint4x32_t"; "FLOAT_TO_HALF"]); - - ("uint4x32_to_half_uniform_vec", {| +|}, + [ "uint4x32_t"; "FLOAT_TO_HALF" ] ); + ( "uint4x32_to_half_uniform_vec", + {| /* Convert uint4x32 to 8 float16s uniform */ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) { half8_t result; @@ -467,10 +479,11 @@ half8_t uint4x32_to_half_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "half8_t"; "FLOAT_TO_HALF"]); - - (* Pure C conversion functions *) - ("bfloat16_to_single", {| +|}, + [ "uint4x32_t"; "half8_t"; "FLOAT_TO_HALF" ] ); + (* Pure C conversion functions *) + ( "bfloat16_to_single", + {| /* BFloat16 to Float conversion (C function) */ float bfloat16_to_single(uint16_t bf16) { @@ -479,9 +492,10 @@ float bfloat16_to_single(uint16_t bf16) uint32_t f32 = ((uint32_t)bf16) << 16; return *((float *)&f32); } -|}, []); - - ("single_to_bfloat16", {| +|}, + [] ); + ( "single_to_bfloat16", + {| /* Float to BFloat16 conversion (C function) */ uint16_t single_to_bfloat16(float f) { @@ -491,27 +505,30 @@ uint16_t single_to_bfloat16(float f) uint32_t rounded = f32 + 0x7FFF + ((f32 >> 16) & 1); return (uint16_t)(rounded >> 16); } -|}, []); - - ("half_to_single", {| +|}, + [] ); + ( "half_to_single", + {| /* Half (Float16) to Float conversion (C function) */ float half_to_single(uint16_t h) { HALF_T half_val = UINT16_TO_HALF(h); return HALF_TO_FLOAT(half_val); } -|}, ["HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT"]); - - ("single_to_half", {| +|}, + [ "HALF_T"; "UINT16_TO_HALF"; "HALF_TO_FLOAT" ] ); + ( "single_to_half", + {| /* Float to Half (Float16) conversion (C function) */ uint16_t single_to_half(float f) { HALF_T half_val = FLOAT_TO_HALF(f); return HALF_TO_UINT16(half_val); } -|}, ["HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16"]); - - ("fp8_to_single", {| +|}, + [ "HALF_T"; "FLOAT_TO_HALF"; "HALF_TO_UINT16" ] ); + ( "fp8_to_single", + {| /* FP8 E5M2 format to Float conversion (C function) Format: 1 sign bit, 5 exponent bits, 2 mantissa bits */ float fp8_to_single(uint8_t fp8) @@ -555,9 +572,10 @@ float fp8_to_single(uint8_t fp8) return result; } -|}, []); - - ("single_to_fp8", {| +|}, + [] ); + ( "single_to_fp8", + {| /* Float to FP8 E5M2 conversion (C function) */ uint8_t single_to_fp8(float f) { @@ -615,143 +633,162 @@ uint8_t single_to_fp8(float f) return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3)); } -|}, []); - - (* Conversion functions from various precisions to uint4x32_t *) - ("int32_to_uint4x32", {| +|}, + [] ); + (* Conversion functions from various precisions to uint4x32_t *) + ( "int32_to_uint4x32", + {| uint4x32_t int32_to_uint4x32(int32_t x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("int64_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "int64_to_uint4x32", + {| uint4x32_t int64_to_uint4x32(int64_t x) { uint64_t bits = (uint64_t)x; uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("uint32_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "uint32_to_uint4x32", + {| uint4x32_t uint32_to_uint4x32(uint32_t x) { uint4x32_t result = {{x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("uint64_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "uint64_to_uint4x32", + {| uint4x32_t uint64_to_uint4x32(uint64_t x) { uint4x32_t result = {{(uint32_t)(x & 0xFFFFFFFF), (uint32_t)(x >> 32), 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("single_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "single_to_uint4x32", + {| uint4x32_t single_to_uint4x32(float x) { uint32_t bits; memcpy(&bits, &x, sizeof(float)); uint4x32_t result = {{bits, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("double_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "double_to_uint4x32", + {| uint4x32_t double_to_uint4x32(double x) { uint64_t bits; memcpy(&bits, &x, sizeof(double)); uint4x32_t result = {{(uint32_t)(bits & 0xFFFFFFFF), (uint32_t)(bits >> 32), 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("byte_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "byte_to_uint4x32", + {| uint4x32_t byte_to_uint4x32(unsigned char x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("uint16_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "uint16_to_uint4x32", + {| uint4x32_t uint16_to_uint4x32(uint16_t x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("bfloat16_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "bfloat16_to_uint4x32", + {| uint4x32_t bfloat16_to_uint4x32(uint16_t x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("half_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "half_to_uint4x32", + {| uint4x32_t half_to_uint4x32(uint16_t x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - ("fp8_to_uint4x32", {| +|}, + [ "uint4x32_t" ] ); + ( "fp8_to_uint4x32", + {| uint4x32_t fp8_to_uint4x32(uint8_t x) { uint4x32_t result = {{(uint32_t)x, 0, 0, 0}}; return result; } -|}, ["uint4x32_t"]); - - (* More uint4x32 to various precision conversion functions *) - ("uint4x32_to_double_uniform", {| +|}, + [ "uint4x32_t" ] ); + (* More uint4x32 to various precision conversion functions *) + ( "uint4x32_to_double_uniform", + {| /* Uint4x32 to float64 uniform - uses first 64 bits */ double uint4x32_to_double_uniform(uint4x32_t x) { uint64_t combined = ((uint64_t)x.v[1] << 32) | x.v[0]; return combined * (1.0 / 18446744073709551616.0); } -|}, ["uint4x32_t"]); - - ("uint4x32_to_int32_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_int32_uniform", + {| /* Uint4x32 to int32 uniform - full range */ int32_t uint4x32_to_int32_uniform(uint4x32_t x) { return (int32_t)x.v[0]; } -|}, ["uint4x32_t"]); - - ("uint4x32_to_int64_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_int64_uniform", + {| /* Uint4x32 to int64 uniform - full range */ int64_t uint4x32_to_int64_uniform(uint4x32_t x) { return (int64_t)(((uint64_t)x.v[1] << 32) | x.v[0]); } -|}, ["uint4x32_t"]); - - ("uint4x32_to_uint32_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_uint32_uniform", + {| /* Uint4x32 to uint32 uniform - full range */ uint32_t uint4x32_to_uint32_uniform(uint4x32_t x) { return x.v[0]; } -|}, ["uint4x32_t"]); - - ("uint4x32_to_uint64_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_uint64_uniform", + {| /* Uint4x32 to uint64 uniform - full range */ uint64_t uint4x32_to_uint64_uniform(uint4x32_t x) { return ((uint64_t)x.v[1] << 32) | x.v[0]; } -|}, ["uint4x32_t"]); - - ("uint4x32_to_byte_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_byte_uniform", + {| /* Uint4x32 to int8 uniform - full range */ int8_t uint4x32_to_byte_uniform(uint4x32_t x) { return (int8_t)(x.v[0] & 0xFF); } -|}, ["uint4x32_t"]); - - ("uint4x32_to_uint16_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_uint16_uniform", + {| /* Uint4x32 to uint16 uniform - full range */ uint16_t uint4x32_to_uint16_uniform(uint4x32_t x) { return (uint16_t)(x.v[0] & 0xFFFF); } -|}, ["uint4x32_t"]); - - ("uint4x32_to_bfloat16_uniform", {| +|}, + [ "uint4x32_t" ] ); + ( "uint4x32_to_bfloat16_uniform", + {| /* Uint4x32 to bfloat16 uniform - uses first 16 bits */ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) { /* Convert to float first, then to bfloat16 */ @@ -763,17 +800,19 @@ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) { if ((bits & 0x8000) && ((bits & 0x7FFF) || (bf & 1))) bf++; return bf; } -|}, ["uint4x32_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_fp8_uniform", {| +|}, + [ "uint4x32_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_fp8_uniform", + {| /* Uint4x32 to fp8 uniform - uses first 8 bits */ uint8_t uint4x32_to_fp8_uniform(uint4x32_t x) { return (uint8_t)(x.v[0] & 0xFF); } -|}, ["uint4x32_t"]); - - (* Vectorized conversion functions *) - ("uint4x32_to_single_uniform_vec", {| +|}, + [ "uint4x32_t" ] ); + (* Vectorized conversion functions *) + ( "uint4x32_to_single_uniform_vec", + {| /* Convert uint4x32 to 4 floats in [0, 1) */ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) { float4_t result; @@ -782,9 +821,10 @@ float4_t uint4x32_to_single_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "float4_t"; "uint32_to_single_uniform"]); - - ("uint4x32_to_double_uniform_vec", {| +|}, + [ "uint4x32_t"; "float4_t"; "uint32_to_single_uniform" ] ); + ( "uint4x32_to_double_uniform_vec", + {| /* Convert uint4x32 to 2 doubles in [0, 1) */ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) { double2_t result; @@ -794,9 +834,10 @@ double2_t uint4x32_to_double_uniform_vec(uint4x32_t x) { result.v[1] = combined2 * (1.0 / 18446744073709551616.0); return result; } -|}, ["uint4x32_t"; "double2_t"]); - - ("uint4x32_to_int32_uniform_vec", {| +|}, + [ "uint4x32_t"; "double2_t" ] ); + ( "uint4x32_to_int32_uniform_vec", + {| /* Convert uint4x32 to 4 int32s - full range */ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) { int32x4_t result; @@ -805,9 +846,10 @@ int32x4_t uint4x32_to_int32_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "int32x4_t"]); - - ("uint4x32_to_int64_uniform_vec", {| +|}, + [ "uint4x32_t"; "int32x4_t" ] ); + ( "uint4x32_to_int64_uniform_vec", + {| /* Convert uint4x32 to 2 int64s - full range */ int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) { int64x2_t result; @@ -815,9 +857,10 @@ int64x2_t uint4x32_to_int64_uniform_vec(uint4x32_t x) { result.v[1] = (int64_t)(((uint64_t)x.v[3] << 32) | x.v[2]); return result; } -|}, ["uint4x32_t"; "int64x2_t"]); - - ("uint4x32_to_byte_uniform_vec", {| +|}, + [ "uint4x32_t"; "int64x2_t" ] ); + ( "uint4x32_to_byte_uniform_vec", + {| /* Convert uint4x32 to 16 int8s - full range */ int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) { int8x16_t result; @@ -829,9 +872,10 @@ int8x16_t uint4x32_to_byte_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "int8x16_t"]); - - ("uint4x32_to_uint16_uniform_vec", {| +|}, + [ "uint4x32_t"; "int8x16_t" ] ); + ( "uint4x32_to_uint16_uniform_vec", + {| /* Convert uint4x32 to 8 uint16s - full range */ uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) { uint16x8_t result; @@ -841,9 +885,10 @@ uint16x8_t uint4x32_to_uint16_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "uint16x8_t"]); - - ("uint4x32_to_bfloat16_uniform_vec", {| +|}, + [ "uint4x32_t"; "uint16x8_t" ] ); + ( "uint4x32_to_bfloat16_uniform_vec", + {| /* Convert uint4x32 to 8 bfloat16s uniform */ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) { uint16x8_t result; @@ -865,9 +910,10 @@ uint16x8_t uint4x32_to_bfloat16_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "uint16x8_t"]); - - ("uint4x32_to_fp8_uniform_vec", {| +|}, + [ "uint4x32_t"; "uint16x8_t" ] ); + ( "uint4x32_to_fp8_uniform_vec", + {| /* Convert uint4x32 to 16 fp8s uniform */ uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) { uint8x16_t result; @@ -879,7 +925,8 @@ uint8x16_t uint4x32_to_fp8_uniform_vec(uint4x32_t x) { } return result; } -|}, ["uint4x32_t"; "uint8x16_t"]); -] +|}, + [ "uint4x32_t"; "uint8x16_t" ] ); + ] -let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins) \ No newline at end of file +let source = includes ^ String.concat "" (List.map (fun (_, def, _) -> def) builtins) File "arrayjit/lib/backend_impl.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/backend_impl.ml b/_build/default/arrayjit/lib/.formatted/backend_impl.ml index 6d483ff..45e6bea 100644 --- a/_build/default/arrayjit/lib/backend_impl.ml +++ b/_build/default/arrayjit/lib/.formatted/backend_impl.ml @@ -50,13 +50,11 @@ module No_device_buffer_and_copying () : Stdlib.Gc.finalise finalize ptr; ptr - let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) : - buffer_ptr = + let%track7_sexp alloc_array (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr = let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in alloc_impl ~size_in_bytes - let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) : - buffer_ptr = + let%track7_sexp alloc_zeros (prec : Ops.prec) ~(dims : int array) (() : unit) : buffer_ptr = let size_in_bytes = Array.fold dims ~init:1 ~f:( * ) * Ops.prec_in_bytes prec in let ptr = alloc_impl ~size_in_bytes in (* Zero-initialize the allocated memory *) File "lib/ppx_shared.ml", line 1, characters 0-0: diff --git a/_build/default/lib/ppx_shared.ml b/_build/default/lib/.formatted/ppx_shared.ml index 340047f..7210e1d 100644 --- a/_build/default/lib/ppx_shared.ml +++ b/_build/default/lib/.formatted/ppx_shared.ml @@ -167,9 +167,11 @@ let binary_ops = ("@-", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Min])); ("min", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Min])); ("^^^^", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto])); - ("threefry4x32_crypto", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto])); + ( "threefry4x32_crypto", + fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_crypto]) ); ("^^", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light])); - ("threefry4x32_light", fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light])); + ( "threefry4x32_light", + fun loc -> ([%expr Shape.Pointwise_bin], [%expr Ir.Ops.Threefry4x32_light]) ); ] (** Unary primitive ops. *) File "arrayjit/lib/cc_backend.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/cc_backend.ml b/_build/default/arrayjit/lib/.formatted/cc_backend.ml index f5dfc7d..f3f3171 100644 --- a/_build/default/arrayjit/lib/cc_backend.ml +++ b/_build/default/arrayjit/lib/.formatted/cc_backend.ml @@ -220,13 +220,17 @@ struct | Ops.Uint4x32_prec _ -> let open PPrint in group (string "arrayjit_threefry4x32_crypto(" ^^ v1 ^^ string ", " ^^ v2 ^^ string ")") - | _ -> invalid_arg "CC_syntax_config.binop_syntax: Threefry4x32_crypto on non-uint4x32 precision") + | _ -> + invalid_arg + "CC_syntax_config.binop_syntax: Threefry4x32_crypto on non-uint4x32 precision") | Ops.Threefry4x32_light -> ( match prec with | Ops.Uint4x32_prec _ -> let open PPrint in group (string "arrayjit_threefry4x32_light(" ^^ v1 ^^ string ", " ^^ v2 ^^ string ")") - | _ -> invalid_arg "CC_syntax_config.binop_syntax: Threefry4x32_light on non-uint4x32 precision") + | _ -> + invalid_arg + "CC_syntax_config.binop_syntax: Threefry4x32_light on non-uint4x32 precision") | _ -> ( match prec with | Ops.Bfloat16_prec _ -> @@ -312,10 +316,10 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized) let idx_params = Indexing.bound_symbols bindings in let build_file = Utils.open_build_file ~base_name:name ~extension:".c" in let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in - let filtered_code = Syntax.filter_and_prepend_builtins - ~includes:Builtins_cc.includes - ~builtins:Builtins_cc.builtins - ~proc_doc in + let filtered_code = + Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins + ~proc_doc + in (* Use ribbon = 1.0 for usual code formatting, width 110 *) Out_channel.output_string build_file.oc filtered_code; build_file.finalize (); @@ -344,10 +348,10 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized opt in let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in let combined_proc_doc = PPrint.separate PPrint.hardline all_proc_docs in - let filtered_code = Syntax.filter_and_prepend_builtins - ~includes:Builtins_cc.includes - ~builtins:Builtins_cc.builtins - ~proc_doc:combined_proc_doc in + let filtered_code = + Syntax.filter_and_prepend_builtins ~includes:Builtins_cc.includes ~builtins:Builtins_cc.builtins + ~proc_doc:combined_proc_doc + in Out_channel.output_string build_file.oc filtered_code; build_file.finalize (); let result_library = c_compile_and_load ~f_path:build_file.f_path in File "arrayjit/lib/ops.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/ops.ml b/_build/default/arrayjit/lib/.formatted/ops.ml index 8b41458..2c24be3 100644 --- a/_build/default/arrayjit/lib/ops.ml +++ b/_build/default/arrayjit/lib/.formatted/ops.ml @@ -329,11 +329,11 @@ type binop = | Or | And | Threefry4x32_crypto - (** 4x32-bit Threefry PRNG, 20-round cryptographic version. Requires a 128-bit key and a + (** 4x32-bit Threefry PRNG, 20-round cryptographic version. Requires a 128-bit key and a 128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *) | Threefry4x32_light - (** 4x32-bit Threefry PRNG, 2-round light version (as in JAX/XLA). Requires a 128-bit key - and a 128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *) + (** 4x32-bit Threefry PRNG, 2-round light version (as in JAX/XLA). Requires a 128-bit key and + a 128-bit counter and outputs a 128-bit value (precision [Uint4x32]). *) [@@deriving sexp, compare, equal] type unop = @@ -390,7 +390,9 @@ let neutral_elem = function | Min -> Float.infinity | And -> 1. | Or -> 0. - | Arg2 | Arg1 | Mod | Cmplt | Cmpeq | Cmpne | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) -> 0. + | Arg2 | Arg1 | Mod | Cmplt | Cmpeq | Cmpne | Threefry4x32_crypto + | Threefry4x32_light (* | Shl | Shr *) -> + 0. let interpret_binop op v1 v2 = let open Float in @@ -415,7 +417,7 @@ let interpret_binop op v1 v2 = (* | Shr -> v1 / (int_pow 2. @@ to_int v2) *) | Or -> if v1 <> 0. || v2 <> 0. then 1. else 0. | And -> if v1 <> 0. && v2 <> 0. then 1. else 0. - | Threefry4x32_crypto | Threefry4x32_light -> + | Threefry4x32_crypto | Threefry4x32_light -> invalid_arg "Ops.interpret_binop: Threefry4x32 operations are outside the domain of float" let interpret_unop op v = @@ -440,7 +442,8 @@ let interpret_unop op v = | Tanh_approx -> tanh v | Not -> if v = 0. then 1. else 0. | Uint4x32_to_prec_uniform1 -> - invalid_arg "Ops.interpret_unop: Uint4x32_to_prec_uniform1 argument outside the domain of float" + invalid_arg + "Ops.interpret_unop: Uint4x32_to_prec_uniform1 argument outside the domain of float" let interpret_ternop op v1 v2 v3 = let open Float in @@ -541,11 +544,12 @@ let binop_c_syntax prec v = | Threefry4x32_crypto, _ -> (* This corresponds to the pure C implementation in builtins.c. *) ("arrayjit_threefry4x32_crypto(", ",", ")") - | Threefry4x32_light, _ -> - ("arrayjit_threefry4x32_light(", ",", ")") + | Threefry4x32_light, _ -> ("arrayjit_threefry4x32_light(", ",", ")") let is_assign_op = function - | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne -> false + | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne + -> + false | Add | Sub | Mul | Div | ToPowOf | Relu_gate | Satur01_gate | Arg2 | Max | Min | Or | And -> true let assign_op_cd_syntax ~initialize_neutral = function @@ -572,7 +576,8 @@ let assign_op_cd_syntax ~initialize_neutral = function | Min -> "=@-" | Or -> "=||" | And -> "=&&" - | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne -> + | Arg1 | Mod | Threefry4x32_crypto | Threefry4x32_light (* | Shl | Shr *) | Cmplt | Cmpeq | Cmpne + -> invalid_arg "Ops.assign_op_cd_syntax: not an assignment op" (** Note: currently we do not support unary prefix symbols. *) @@ -756,10 +761,12 @@ external copy_with_padding_c : axis_padding array -> unit = "arrayjit_copy_with_padding" -external threefry4x32_crypto : int array -> int array -> int array = "arrayjit_threefry4x32_crypto_ocaml" +external threefry4x32_crypto : int array -> int array -> int array + = "arrayjit_threefry4x32_crypto_ocaml" (** Threefry4x32 PRNG - 20 round cryptographic version *) -external threefry4x32_light : int array -> int array -> int array = "arrayjit_threefry4x32_light_ocaml" +external threefry4x32_light : int array -> int array -> int array + = "arrayjit_threefry4x32_light_ocaml" (** Threefry4x32 PRNG - 2 round light version *) external threefry4x32 : int array -> int array -> int array = "arrayjit_threefry4x32_ocaml" File "arrayjit/lib/backends.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/backends.ml b/_build/default/arrayjit/lib/.formatted/backends.ml index a6ab05e..166603a 100644 --- a/_build/default/arrayjit/lib/backends.ml +++ b/_build/default/arrayjit/lib/.formatted/backends.ml @@ -474,18 +474,17 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct [%log (key : Tnode.t)]; let default () = let dims = Lazy.force key.dims in - (* Use alloc_array when zero initialization is not needed: - - When copying from host immediately after allocation - - When the node has explicit Zero_out operations in the lowered code *) - let will_copy_from_host = - Utils.settings.automatic_host_transfers && Tn.known_constant key && - match key.array with | (lazy (Some _)) -> true | _ -> false + (* Use alloc_array when zero initialization is not needed: - When copying from host + immediately after allocation - When the node has explicit Zero_out operations in the + lowered code *) + let will_copy_from_host = + Utils.settings.automatic_host_transfers && Tn.known_constant key + && match key.array with (lazy (Some _)) -> true | _ -> false in - let dst_ptr = + let dst_ptr = if will_copy_from_host || node.Low_level.zero_initialized_by_code then alloc_array (Lazy.force key.prec) ~dims stream - else - alloc_zeros (Lazy.force key.prec) ~dims stream + else alloc_zeros (Lazy.force key.prec) ~dims stream in (if will_copy_from_host then match key.array with File "arrayjit/lib/metal_backend.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/metal_backend.ml b/_build/default/arrayjit/lib/.formatted/metal_backend.ml index 7752213..f4b4bed 100644 --- a/_build/default/arrayjit/lib/metal_backend.ml +++ b/_build/default/arrayjit/lib/.formatted/metal_backend.ml @@ -547,8 +547,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct raise @@ Utils.User_error (Printf.sprintf - "Metal backend: Threefry4x32_crypto requires target precision to be uint4x32, but \ - got %s" + "Metal backend: Threefry4x32_crypto requires target precision to be \ + uint4x32, but got %s" (Ops.prec_string prec))) | Threefry4x32_light, _ -> ( (* Threefry4x32_light must output to uint4x32 precision *) @@ -558,8 +558,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct raise @@ Utils.User_error (Printf.sprintf - "Metal backend: Threefry4x32_light requires target precision to be uint4x32, but \ - got %s" + "Metal backend: Threefry4x32_light requires target precision to be uint4x32, \ + but got %s" (Ops.prec_string prec))) | Arg1, _ | Arg2, _ -> invalid_arg "Metal C_syntax_config: Arg1/Arg2 not operators" @@ -675,8 +675,10 @@ end) : Ir.Backend_impl.Lowered_backend = struct let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in let metal_includes = {|#include <metal_stdlib> using namespace metal;|} in - let source = Syntax.filter_and_prepend_builtins - ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc in + let source = + Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins + ~proc_doc + in { metal_source = source; compiled_code = Array.create ~len:num_devs None; @@ -703,8 +705,10 @@ using namespace metal;|} in let final_doc = PPrint.(separate hardline all_proc_docs) in let metal_includes = {|#include <metal_stdlib> using namespace metal;|} in - let source = Syntax.filter_and_prepend_builtins - ~includes:metal_includes ~builtins:Builtins_metal.builtins ~proc_doc:final_doc in + let source = + Syntax.filter_and_prepend_builtins ~includes:metal_includes ~builtins:Builtins_metal.builtins + ~proc_doc:final_doc + in let traced_stores = Array.map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store)) in let funcs = Array.map funcs_and_docs ~f:(Option.map ~f:fst) in { Warning: Invalid documentation comment: File "lib/operation.ml", line 407, characters 63-63: End of text is not allowed in '[...]' (code). File "lib/operation.ml", line 1, characters 0-0: diff --git a/_build/default/lib/operation.ml b/_build/default/lib/.formatted/operation.ml index c99465c..cd34f05 100644 --- a/_build/default/lib/operation.ml +++ b/_build/default/lib/.formatted/operation.ml @@ -375,13 +375,10 @@ let threefry4x32_light = result let threefry4x32 = - (* Select based on configuration *) - fun t1 t2 -> - let variant = Utils.settings.default_prng_variant in - if String.equal variant "crypto" then - threefry4x32_crypto t1 t2 - else - threefry4x32_light t1 t2 + (* Select based on configuration *) + fun t1 t2 -> + let variant = Utils.settings.default_prng_variant in + if String.equal variant "crypto" then threefry4x32_crypto t1 t2 else threefry4x32_light t1 t2 let fma ?(label = []) ~grad_spec t1 t2 t3 = let module NTDSL = Initial_NTDSL in File "arrayjit/lib/c_syntax.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/c_syntax.ml b/_build/default/arrayjit/lib/.formatted/c_syntax.ml index ab0c03e..87b9a27 100644 --- a/_build/default/arrayjit/lib/c_syntax.ml +++ b/_build/default/arrayjit/lib/.formatted/c_syntax.ml @@ -240,13 +240,13 @@ module C_syntax (B : C_syntax_config) = struct let result_buffer = Buffer.create 4096 in Buffer.add_string result_buffer includes; Buffer.add_string result_buffer "\n"; - + (* Collect all needed keys, including dependencies *) let needed_keys = ref (Set.empty (module String)) in List.iter builtins ~f:(fun (key, _, _) -> - if String.is_substring doc_string ~substring:key then - needed_keys := Set.add !needed_keys key); - + if String.is_substring doc_string ~substring:key then + needed_keys := Set.add !needed_keys key); + (* Add dependencies recursively *) let processed_keys = ref (Set.empty (module String)) in let rec add_dependencies key = @@ -255,17 +255,15 @@ module C_syntax (B : C_syntax_config) = struct needed_keys := Set.add !needed_keys key; match List.find builtins ~f:(fun (k, _, _) -> String.equal k key) with | Some (_, _, deps) -> List.iter deps ~f:add_dependencies - | None -> () - ) + | None -> ()) in Set.iter !needed_keys ~f:add_dependencies; - + (* Add the builtins in order *) List.iter builtins ~f:(fun (key, definition, _) -> - if Set.mem !needed_keys key then ( - Buffer.add_string result_buffer definition; - Buffer.add_string result_buffer "\n"; - )); + if Set.mem !needed_keys key then ( + Buffer.add_string result_buffer definition; + Buffer.add_string result_buffer "\n")); Buffer.add_string result_buffer doc_string; Buffer.contents result_buffer @@ -582,13 +580,13 @@ module C_syntax (B : C_syntax_config) = struct let expr = group (B.binop_syntax prec op e1 e2) in (defs, expr) | Unop (op, v) -> - let arg_prec = + let arg_prec = match op with - | Ops.Uint4x32_to_prec_uniform1 -> - (* The argument to Uint4x32_to_prec_uniform1 must be evaluated with uint4x32 precision, - regardless of the target precision. This handles the case where the operation is - inlined as part of a scalar expression. *) - Ops.uint4x32 + | Ops.Uint4x32_to_prec_uniform1 -> + (* The argument to Uint4x32_to_prec_uniform1 must be evaluated with uint4x32 + precision, regardless of the target precision. This handles the case where the + operation is inlined as part of a scalar expression. *) + Ops.uint4x32 | _ -> prec in let defs, expr_v = pp_scalar arg_prec v in @@ -785,7 +783,9 @@ module C_syntax (B : C_syntax_config) = struct let ident_doc = string (get_ident tn) in let num_elems = Tn.num_elems tn in let size_doc = OCaml.int num_elems in - let init_doc = if node.Low_level.zero_initialized_by_code then string " = {0}" else empty in + let init_doc = + if node.Low_level.zero_initialized_by_code then string " = {0}" else empty + in typ_doc ^^ space ^^ ident_doc ^^ brackets size_doc ^^ init_doc ^^ semi ^^ hardline else empty) (Hashtbl.to_alist traced_store) File "arrayjit/lib/utils.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/utils.ml b/_build/default/arrayjit/lib/.formatted/utils.ml index 3783503..9adf0a6 100644 --- a/_build/default/arrayjit/lib/utils.ml +++ b/_build/default/arrayjit/lib/.formatted/utils.ml @@ -50,8 +50,8 @@ type settings = { routine's context if the host array was not yet transfered since its creation or most recent modification. *) mutable default_prng_variant : string; - (** The default variant of threefry4x32 PRNG to use. Options: "crypto" (20 rounds) or "light" (2 rounds). - Defaults to "light" for better performance. *) + (** The default variant of threefry4x32 PRNG to use. Options: "crypto" (20 rounds) or "light" + (2 rounds). Defaults to "light" for better performance. *) } [@@deriving sexp] @@ -465,8 +465,7 @@ let restore_settings () = @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0"; settings.automatic_host_transfers <- get_global_flag ~default:true ~arg_name:"automatic_host_transfers"; - settings.default_prng_variant <- - get_global_arg ~default:"light" ~arg_name:"default_prng_variant" + settings.default_prng_variant <- get_global_arg ~default:"light" ~arg_name:"default_prng_variant" let () = restore_settings () let with_runtime_debug () = settings.output_debug_files_in_build_directory && settings.log_level > 1 File "arrayjit/lib/cuda_backend.ml", line 1, characters 0-0: diff --git a/_build/default/arrayjit/lib/cuda_backend.ml b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml index 25b5253..58be075 100644 --- a/_build/default/arrayjit/lib/cuda_backend.ml +++ b/_build/default/arrayjit/lib/.formatted/cuda_backend.ml @@ -633,8 +633,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct raise @@ Utils.User_error (Printf.sprintf - "CUDA backend: Threefry4x32_crypto requires target precision to be uint4x32, but \ - got %s" + "CUDA backend: Threefry4x32_crypto requires target precision to be uint4x32, \ + but got %s" (Ops.prec_string prec))) | Threefry4x32_light, _ -> ( (* Threefry4x32_light must output to uint4x32 precision *) @@ -644,8 +644,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct raise @@ Utils.User_error (Printf.sprintf - "CUDA backend: Threefry4x32_light requires target precision to be uint4x32, but \ - got %s" + "CUDA backend: Threefry4x32_light requires target precision to be uint4x32, \ + but got %s" (Ops.prec_string prec))) let unop_syntax prec v = @@ -780,7 +780,6 @@ end) : Ir.Backend_impl.Lowered_backend = struct ^^ rparen ^^ semi end - let%diagn2_sexp compile ~name bindings ({ Low_level.traced_store; _ } as lowered) = (* TODO: The following link seems to claim it's better to expand into loops than use memset. https://stackoverflow.com/questions/23712558/how-do-i-best-initialize-a-local-memory-array-to-0 *) @@ -789,11 +788,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct end)) in let idx_params = Indexing.bound_symbols bindings in let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in - let cuda_includes = {|#include <cuda_fp16.h> -#include <cuda_bf16.h>|} ^ - (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in - let source = Syntax.filter_and_prepend_builtins - ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc in + let cuda_includes = + {|#include <cuda_fp16.h> +#include <cuda_bf16.h>|} + ^ + if Utils.debug_log_from_routines () then + "\n__device__ int printf (const char * format, ... );" + else "" + in + let source = + Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins + ~proc_doc + in let ptx = cuda_to_ptx ~name source in { traced_store; ptx; params; bindings; name } @@ -811,11 +817,18 @@ end) : Ir.Backend_impl.Lowered_backend = struct in let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in let final_doc = PPrint.(separate hardline all_proc_docs) in - let cuda_includes = {|#include <cuda_fp16.h> -#include <cuda_bf16.h>|} ^ - (if Utils.debug_log_from_routines () then "\n__device__ int printf (const char * format, ... );" else "") in - let source = Syntax.filter_and_prepend_builtins - ~includes:cuda_includes ~builtins:Builtins_cuda.builtins ~proc_doc:final_doc in + let cuda_includes = + {|#include <cuda_fp16.h> +#include <cuda_bf16.h>|} + ^ + if Utils.debug_log_from_routines () then + "\n__device__ int printf (const char * format, ... );" + else "" + in + let source = + Syntax.filter_and_prepend_builtins ~includes:cuda_includes ~builtins:Builtins_cuda.builtins + ~proc_doc:final_doc + in let name : string = String.( dune build @fmt failed "/usr/bin/env" "bash" "-c" "opam exec -- dune build @fmt --ignore-promoted-rules || (echo "dune build @fmt failed"; exit 2)" failed with exit status 2 2025-08-24 09:40.18: Job failed: Failed: Build failed