diff options
Diffstat (limited to 'dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch')
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch b/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch new file mode 100644 index 000000000000..b46e57301782 --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch @@ -0,0 +1,65 @@ +commit bdfc5375f219d6def81effda4e57cb56d01fc917 +Author: Sergei Trofimovich <slyfox@gentoo.org> +Date: Tue Aug 30 12:10:54 2016 +0100 + + rts: enable parallel GC scan of large (32M+) allocation area + + Parallel GC does not scan large allocation area (-A) + effectively as it does not do work stealing from nursery + by default. + + That leads to large imbalance when only one of threads + overflows allocation area: most of GC threads finish + quickly (as there is not much to collect) and sit idle + waiting while single GC thread finishes scan of single + allocation area for that thread. + + The patch enables work stealing for (equivalent of -qb0) + allocation area of -A32M or higher. + + Tested on a highlighting-kate package from Trac #9221 + + On 8-core machine the difference is around 5% faster + of wall-clock time. On 24-core VM the speedup is 20%. + + Signed-off-by: Sergei Trofimovich <siarheit@google.com> + + Test Plan: measured wall time and GC parallelism on highlighting-kate build + + Reviewers: austin, bgamari, erikd, simonmar + + Reviewed By: bgamari, simonmar + + Subscribers: thomie + + Differential Revision: https://phabricator.haskell.org/D2483 + + GHC Trac Issues: #9221 + +diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c +index fda33f0..7a719b9 100644 +--- a/rts/RtsFlags.c ++++ b/rts/RtsFlags.c +@@ -237,1 +237,1 @@ void initRtsFlagsDefaults(void) +- RtsFlags.ParFlags.parGcLoadBalancingGen = 1; ++ RtsFlags.ParFlags.parGcLoadBalancingGen = ~0u; /* auto, based on -A */ +@@ -1398,2 +1390,19 @@ static void normaliseRtsOpts (void) + } + ++#ifdef THREADED_RTS ++ if (RtsFlags.ParFlags.parGcLoadBalancingGen == ~0u) { ++ StgWord alloc_area_bytes ++ = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE; ++ ++ // If allocation area is larger that CPU cache ++ // we can finish scanning quicker doing work-stealing ++ // scan. Trac #9221 ++ // 32M looks big enough not to fit into L2 cache ++ // of popular modern CPUs. ++ if (alloc_area_bytes >= 32 * 1024 * 1024) { ++ RtsFlags.ParFlags.parGcLoadBalancingGen = 0; ++ } else { ++ RtsFlags.ParFlags.parGcLoadBalancingGen = 1; ++ } ++ } ++#endif |