From 7435d2ece079af4dde799fe95d6910a0a6566005 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Fri, 12 Sep 2025 18:46:25 +0300
Subject: [PATCH 1/4] Small fixes suggested by @lehins

Co-authored-by: Alexey Kuleshevich <lehins@yandex.ru>
---
 vector/tests/Utilities.hs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vector/tests/Utilities.hs b/vector/tests/Utilities.hs
index 3c88926c..e4e6dc2d 100644
--- a/vector/tests/Utilities.hs
+++ b/vector/tests/Utilities.hs
@@ -266,8 +266,6 @@ xs // ps = go xs ps' 0
     go [] _ _      = []
 
 
--- withIndexFirst :: (Int -> a -> [a]) -> [a] -> [a]
-
 withIndexFirst :: (((Int, a) -> b) -> [(Int, a)] -> c)
                -> ((Int -> a -> b) -> [a]        -> c)
 withIndexFirst m f = m (uncurry f) . zip [0::Int ..]

From 11905f3fd76b8f051f5bce80890d7db1f9a15f7b Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Fri, 12 Sep 2025 19:45:12 +0300
Subject: [PATCH 2/4] Add test for allocations

This is good way to check that GHC generate tight non-allocating loops
for monadic/applicative functions.
---
 vector/tests-inspect/Inspect/Alloc.hs | 74 +++++++++++++++++++++++++++
 vector/tests-inspect/Test/Alloc.hs    | 59 +++++++++++++++++++++
 vector/tests-inspect/main.hs          |  2 +
 vector/vector.cabal                   |  3 ++
 4 files changed, 138 insertions(+)
 create mode 100644 vector/tests-inspect/Inspect/Alloc.hs
 create mode 100644 vector/tests-inspect/Test/Alloc.hs

diff --git a/vector/tests-inspect/Inspect/Alloc.hs b/vector/tests-inspect/Inspect/Alloc.hs
new file mode 100644
index 00000000..b4f55e4f
--- /dev/null
+++ b/vector/tests-inspect/Inspect/Alloc.hs
@@ -0,0 +1,74 @@
+{-# LANGUAGE BangPatterns        #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeApplications    #-}
+{- |
+Here we test that GHC is able to optimize well construction of vector
+using monadic\/applicative actions. Well is understood as able to
+generate code which does not allocate except for buffer and some
+constant overhead.
+-}
+module Inspect.Alloc where
+
+import Data.Int
+-- import Data.Monoid
+import Data.Functor.Identity
+import Test.Tasty
+import Test.Tasty.HUnit
+import System.Mem
+import Test.Alloc
+
+import qualified Data.Vector.Unboxed as VU
+
+
+tests :: TestTree
+tests = testGroup "allocations"
+  [ testGroup "traversable"
+    [ testCase "IO"
+      $ checkAllocations (linear 8)
+      $ whnfIO (VU.traverse (\_ -> getAllocationCounter) vector)
+    , testCase "Identity"
+      $ checkAllocations (linear 8)
+      $ VU.traverse (\n -> Identity (10*n)) `whnf` vector
+    -- NOTE: Naive traversal is lazy and allocated 2 words per element
+    --
+    -- , testCase "Const Sum"
+    --   $ checkAllocations constant
+    --   $ whnf (VU.traverse (Const @_ @() . Sum)) vector
+    ]
+  , testGroup "unstreamM"
+    [ testCase "IO"
+      $ checkAllocations (linear 8)
+      $ whnfIO (VU.replicateM size getAllocationCounter)
+    -- , testCase "Identity"
+    --   $ checkAllocations (linear 8)
+    --   $ (\sz -> VU.generateM sz (\n -> Identity (fromIntegral n :: Int64))) `whnf` size
+    ]
+  ]
+
+
+-- | Constant overhead. Measurement precision is 4k
+overhead :: Int64
+overhead = 4096*2
+
+-- | Vector size. It should be large so 1byte per element will be
+--   large than page.
+size :: Int
+size = 100000
+
+vector :: VU.Vector Int64
+{-# NOINLINE vector #-}
+vector = VU.generate size fromIntegral
+
+-- | N bytes per element + constant overhead. We also check that bound
+--   is tight.
+linear :: Int -> Range
+linear n = Range
+  { allocHi = fromIntegral (n * size) + overhead
+  , allocLo = fromIntegral (n * size)
+  }
+
+-- | Only constant overhead
+constant :: Range
+constant = Range { allocHi = overhead
+                 , allocLo = 0
+                 }
diff --git a/vector/tests-inspect/Test/Alloc.hs b/vector/tests-inspect/Test/Alloc.hs
new file mode 100644
index 00000000..6720cbc4
--- /dev/null
+++ b/vector/tests-inspect/Test/Alloc.hs
@@ -0,0 +1,59 @@
+{-# LANGUAGE RecordWildCards #-}
+-- |
+-- Test that function allocates is in range. This is good way to test
+-- that GHC produces tight non-allocating loops.
+module Test.Alloc where
+
+import Control.Exception
+import Data.Int
+import System.Mem
+import Test.Tasty.HUnit
+import Text.Printf
+
+----------------------------------------------------------------
+-- Benchmarking machinery copied from tasty-bench
+----------------------------------------------------------------
+
+newtype Benchmarkable = Benchmarkable (IO ())
+
+whnf :: (a -> b) -> a -> Benchmarkable
+{-# NOINLINE whnf #-}
+whnf f a = Benchmarkable $ do _ <- evaluate (f a)
+                              return ()
+
+whnfIO :: IO a -> Benchmarkable
+{-# NOINLINE whnfIO #-}
+whnfIO io = Benchmarkable $ do _ <- evaluate =<< io
+                               return ()
+
+
+-- | Measure allocations. Measurements use 'getAllocationCounter' so
+--   it's accurate up to 4k bytes.
+allocations :: Benchmarkable -> IO Int64
+allocations (Benchmarkable io) = do
+  -- We need to run `io` twice in order to ensure that all constant
+  -- parameters are evaluated.
+  io
+  n1 <- getAllocationCounter
+  io
+  n2 <- getAllocationCounter
+  return $! n1 - n2
+
+
+-- | Expected allocations range
+data Range = Range { allocLo :: !Int64
+                   , allocHi :: !Int64
+                   }
+             deriving Show
+
+-- | Check that computation's allocations lie in range
+checkAllocations :: Range -> Benchmarkable -> IO ()
+checkAllocations Range{..} bench = do
+  alloc <- allocations bench
+  let msg = unlines [ printf "allocated = %12d" alloc
+                    , printf "Low bound = %12d" allocLo
+                    , printf "Hi  bound = %12d" allocHi
+                    ]
+  assertBool msg $ alloc <= allocHi
+                && alloc >= allocLo
+
diff --git a/vector/tests-inspect/main.hs b/vector/tests-inspect/main.hs
index 52dd6db0..0a67eb9f 100644
--- a/vector/tests-inspect/main.hs
+++ b/vector/tests-inspect/main.hs
@@ -1,6 +1,7 @@
 module Main (main) where
 
 import qualified Inspect
+import qualified Inspect.Alloc
 import qualified Inspect.DerivingVia
 import Test.Tasty (defaultMain,testGroup)
 
@@ -8,4 +9,5 @@ main :: IO ()
 main = defaultMain $ testGroup "tests"
   [ Inspect.tests
   , Inspect.DerivingVia.tests
+  , Inspect.Alloc.tests
   ]
diff --git a/vector/vector.cabal b/vector/vector.cabal
index 62a4369b..479b35fe 100644
--- a/vector/vector.cabal
+++ b/vector/vector.cabal
@@ -252,13 +252,16 @@ test-suite vector-inspection
   main-is:          main.hs
   default-language: Haskell2010
   Other-modules:    Inspect
+                    Inspect.Alloc
                     Inspect.DerivingVia
                     Inspect.DerivingVia.OtherFoo
+                    Test.Alloc
   build-depends:
         base                     -any
       , primitive                >= 0.6.4.0 && < 0.10
       , vector                   -any
       , tasty
+      , tasty-hunit
       , tasty-inspection-testing >= 0.1
 
 library benchmarks-O2

From 1097859069c3c35be05935f71810d55289fb8b57 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Sun, 14 Sep 2025 17:40:47 +0300
Subject: [PATCH 3/4] Add tests for ST as well

---
 vector/tests-inspect/Inspect/Alloc.hs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vector/tests-inspect/Inspect/Alloc.hs b/vector/tests-inspect/Inspect/Alloc.hs
index b4f55e4f..f7fecf2f 100644
--- a/vector/tests-inspect/Inspect/Alloc.hs
+++ b/vector/tests-inspect/Inspect/Alloc.hs
@@ -9,6 +9,7 @@ constant overhead.
 -}
 module Inspect.Alloc where
 
+import Control.Monad.ST
 import Data.Int
 -- import Data.Monoid
 import Data.Functor.Identity
@@ -26,6 +27,9 @@ tests = testGroup "allocations"
     [ testCase "IO"
       $ checkAllocations (linear 8)
       $ whnfIO (VU.traverse (\_ -> getAllocationCounter) vector)
+    , testCase "ST"
+      $ checkAllocations (linear 8)
+      $ (\v -> runST $ VU.traverse (pureST . fromIntegral) v) `whnf` vector
     , testCase "Identity"
       $ checkAllocations (linear 8)
       $ VU.traverse (\n -> Identity (10*n)) `whnf` vector
@@ -39,6 +43,9 @@ tests = testGroup "allocations"
     [ testCase "IO"
       $ checkAllocations (linear 8)
       $ whnfIO (VU.replicateM size getAllocationCounter)
+    , testCase "ST"
+      $ checkAllocations (linear 8)
+      $ (\sz -> runST $ VU.generateM sz pureST) `whnf` size
     -- , testCase "Identity"
     --   $ checkAllocations (linear 8)
     --   $ (\sz -> VU.generateM sz (\n -> Identity (fromIntegral n :: Int64))) `whnf` size
@@ -46,6 +53,10 @@ tests = testGroup "allocations"
   ]
 
 
+pureST :: Int -> ST s Int64
+{-# NOINLINE pureST #-}
+pureST i = pure $! fromIntegral i
+
 -- | Constant overhead. Measurement precision is 4k
 overhead :: Int64
 overhead = 4096*2

From fc83758a6df166dc984f0be7368cfa2211a37d33 Mon Sep 17 00:00:00 2001
From: Alexey Khudyakov <alexey.skladnoy@gmail.com>
Date: Wed, 17 Sep 2025 19:59:02 +0300
Subject: [PATCH 4/4] Disable allocation test for older GHC

---
 vector/tests-inspect/Inspect/Alloc.hs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/vector/tests-inspect/Inspect/Alloc.hs b/vector/tests-inspect/Inspect/Alloc.hs
index f7fecf2f..be399e5f 100644
--- a/vector/tests-inspect/Inspect/Alloc.hs
+++ b/vector/tests-inspect/Inspect/Alloc.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE BangPatterns        #-}
+{-# LANGUAGE CPP                 #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE TypeApplications    #-}
 {- |
@@ -27,12 +28,23 @@ tests = testGroup "allocations"
     [ testCase "IO"
       $ checkAllocations (linear 8)
       $ whnfIO (VU.traverse (\_ -> getAllocationCounter) vector)
+
+#if MIN_VERSION_base(4,17,0)
+    -- GHC<9.4 doesn't optimize well.
     , testCase "ST"
       $ checkAllocations (linear 8)
       $ (\v -> runST $ VU.traverse (pureST . fromIntegral) v) `whnf` vector
+#endif
+
+#if MIN_VERSION_base(4,15,0)
+      -- GHC<9.0 doesn't optimize this well. And there's no appetite
+      -- for finding out why. Thus it's disabled for them. We'll still
+      -- catch regression going forward.
     , testCase "Identity"
       $ checkAllocations (linear 8)
       $ VU.traverse (\n -> Identity (10*n)) `whnf` vector
+#endif
+
     -- NOTE: Naive traversal is lazy and allocated 2 words per element
     --
     -- , testCase "Const Sum"
@@ -43,9 +55,14 @@ tests = testGroup "allocations"
     [ testCase "IO"
       $ checkAllocations (linear 8)
       $ whnfIO (VU.replicateM size getAllocationCounter)
+
+#if MIN_VERSION_base(4,17,0)
+    -- GHC<9.4 doesn't optimize well.
     , testCase "ST"
       $ checkAllocations (linear 8)
       $ (\sz -> runST $ VU.generateM sz pureST) `whnf` size
+#endif
+
     -- , testCase "Identity"
     --   $ checkAllocations (linear 8)
     --   $ (\sz -> VU.generateM sz (\n -> Identity (fromIntegral n :: Int64))) `whnf` size