Executor compiles

2024-11-22 06:47:56 +01:00 · 2021-11-14 17:51:48 +01:00 · 2021-11-14 17:51:48 +01:00 · 55d6609e33
commit 55d6609e33
parent 24be65b3d9
22 changed files with 2505 additions and 0 deletions
--- a/runtime/executor/Cargo.toml
+++ b/runtime/executor/Cargo.toml
@ -0,0 +1,34 @@
+[package]
+name = "executor"
+version = "0.3.0"
+publish = false
+description = "Executor"
+authors = []
+keywords = []
+categories = []
+readme = "README.md"
+license = "Apache-2.0/MIT"
+edition = "2021"
+exclude = [
+	"scripts/*",
+]
+
+[dependencies]
+lightproc = { path = "../lightproc" }
+
+crossbeam-utils = "0.8"
+crossbeam-channel = "0.5"
+crossbeam-epoch = "0.9"
+crossbeam-deque = "0.8.1"
+lazy_static = "1.4"
+libc = "0.2"
+num_cpus = "1.13"
+pin-utils = "0.1.0"
+
+# Allocator
+arrayvec = { version = "0.7.0" }
+futures-timer = "3.0.2"
+once_cell = "1.4.0"
+lever = "0.1"
+tracing = "0.1.19"
+crossbeam-queue = "0.3.0"
--- a/runtime/executor/README.md
+++ b/runtime/executor/README.md
@ -0,0 +1,94 @@
+# Bastion Executor
+
+<table align=left style='float: left; margin: 4px 10px 0px 0px; border: 1px solid #000000;'>
+<tr>
+  <td>Latest Release</td>
+  <td>
+    <a href="https://crates.io/crates/bastion">
+    <img alt="Crates.io" src="https://img.shields.io/crates/v/bastion-executor.svg?style=popout-square">
+    </a>
+  </td>
+</tr>
+<tr>
+  <td></td>
+</tr>
+<tr>
+  <td>License</td>
+  <td>
+    <a href="https://github.com/bastion-rs/bastion/blob/master/LICENSE">
+    <img alt="Crates.io" src="https://img.shields.io/crates/l/bastion.svg?style=popout-square">
+    </a>
+</td>
+</tr>
+<tr>
+  <td>Build Status</td>
+  <td>
+    <a href="https://actions-badge.atrox.dev/bastion-rs/bastion/goto">
+    <img alt="Build Status" src="https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fbastion-rs%2Fbastion%2Fbadge&style=flat" />
+    </a>
+  </td>
+</tr>
+<tr>
+  <td>Downloads</td>
+  <td>
+    <a href="https://crates.io/crates/bastion-executor">
+    <img alt="Crates.io" src="https://img.shields.io/crates/d/bastion-executor.svg?style=popout-square">
+    </a>
+  </td>
+</tr>
+<tr>
+	<td>Discord</td>
+	<td>
+		<a href="https://discord.gg/DqRqtRT">
+		<img src="https://img.shields.io/discord/628383521450360842.svg?logo=discord" />
+		</a>
+	</td>
+</tr>
+</table>
+
+Bastion Executor is a NUMA-aware, SMP-based, fault-tolerant executor.
+
+Bastion Executor is a highly-available, fault-tolerant, async communication
+oriented executor. Bastion's main idea is supplying a fully async runtime
+with fault-tolerance to work on heavy loads.
+
+The main differences from other executors are:
+* It uses an SMP-based execution scheme to exploit cache affinity across multiple cores; execution is
+distributed equally over the system's resources, so the whole system is utilized.
+* It uses NUMA-aware allocation for the scheduler's queues and exploits locality on server workloads.
+* It is tailored for building middleware and for actor-model-like concurrency and distributed communication.
+
+**NOTE:** Bastion Executor is independent of its framework implementation.
+It uses [lightproc](https://docs.rs/lightproc) to encapsulate and provide fault-tolerance to your future-based workloads.
+You can use your futures with [lightproc](https://docs.rs/lightproc) to run your workloads on Bastion Executor without needing the framework.
+
+## Example Usage
+
+```rust
+use bastion_executor::prelude::*;
+use lightproc::proc_stack::ProcStack;
+
+fn main() {
+    let pid = 1;
+    let stack = ProcStack::default()
+        .with_pid(pid)
+        .with_after_panic(move || println!("after panic {}", pid.clone()));
+
+    let handle = spawn(
+        async {
+            panic!("test");
+        },
+        stack,
+    );
+
+    let pid = 2;
+    let stack = ProcStack::default().with_pid(pid);
+
+    run(
+        async {
+            handle.await;
+        },
+        stack.clone(),
+    );
+}
+```
--- a/runtime/executor/benches/blocking.rs
+++ b/runtime/executor/benches/blocking.rs
@ -0,0 +1,67 @@
+#![feature(test)]
+
+extern crate test;
+
+use bastion_executor::blocking;
+use lightproc::proc_stack::ProcStack;
+use std::thread;
+use std::time::Duration;
+use test::Bencher;
+
+#[cfg(feature = "tokio-runtime")]
+mod tokio_benchs {
+    use super::*;
+    #[bench]
+    fn blocking(b: &mut Bencher) {
+        tokio_test::block_on(async { _blocking(b) });
+    }
+    #[bench]
+    fn blocking_single(b: &mut Bencher) {
+        tokio_test::block_on(async {
+            _blocking_single(b);
+        });
+    }
+}
+
+#[cfg(not(feature = "tokio-runtime"))]
+mod no_tokio_benchs {
+    use super::*;
+    #[bench]
+    fn blocking(b: &mut Bencher) {
+        _blocking(b);
+    }
+    #[bench]
+    fn blocking_single(b: &mut Bencher) {
+        _blocking_single(b);
+    }
+}
+
+// Benchmark for a 10K burst task spawn
+fn _blocking(b: &mut Bencher) {
+    b.iter(|| {
+        (0..10_000)
+            .map(|_| {
+                blocking::spawn_blocking(
+                    async {
+                        let duration = Duration::from_millis(1);
+                        thread::sleep(duration);
+                    },
+                    ProcStack::default(),
+                )
+            })
+            .collect::<Vec<_>>()
+    });
+}
+
+// Benchmark for a single blocking task spawn
+fn _blocking_single(b: &mut Bencher) {
+    b.iter(|| {
+        blocking::spawn_blocking(
+            async {
+                let duration = Duration::from_millis(1);
+                thread::sleep(duration);
+            },
+            ProcStack::default(),
+        )
+    });
+}
--- a/runtime/executor/benches/perf.rs
+++ b/runtime/executor/benches/perf.rs
@ -0,0 +1,25 @@
+#![feature(test)]
+
+extern crate test;
+
+use bastion_executor::prelude::*;
+use lightproc::proc_stack::ProcStack;
+use test::{black_box, Bencher};
+
+#[bench]
+fn increment(b: &mut Bencher) {
+    let mut sum = 0;
+
+    b.iter(|| {
+        run(
+            async {
+                (0..10_000_000).for_each(|_| {
+                    sum += 1;
+                });
+            },
+            ProcStack::default(),
+        );
+    });
+
+    black_box(sum);
+}
--- a/runtime/executor/benches/run_blocking.rs
+++ b/runtime/executor/benches/run_blocking.rs
@ -0,0 +1,69 @@
+#![feature(test)]
+
+extern crate test;
+
+use bastion_executor::blocking;
+use bastion_executor::run::run;
+use futures::future::join_all;
+use lightproc::proc_stack::ProcStack;
+use std::thread;
+use std::time::Duration;
+use test::Bencher;
+
+#[cfg(feature = "tokio-runtime")]
+mod tokio_benchs {
+    use super::*;
+    #[bench]
+    fn blocking(b: &mut Bencher) {
+        tokio_test::block_on(async { _blocking(b) });
+    }
+    #[bench]
+    fn blocking_single(b: &mut Bencher) {
+        tokio_test::block_on(async {
+            _blocking_single(b);
+        });
+    }
+}
+
+#[cfg(not(feature = "tokio-runtime"))]
+mod no_tokio_benchs {
+    use super::*;
+    #[bench]
+    fn blocking(b: &mut Bencher) {
+        _blocking(b);
+    }
+    #[bench]
+    fn blocking_single(b: &mut Bencher) {
+        _blocking_single(b);
+    }
+}
+
+// Benchmark for a 10K burst task spawn
+fn _blocking(b: &mut Bencher) {
+    b.iter(|| {
+        (0..10_000)
+            .map(|_| {
+                blocking::spawn_blocking(
+                    async {
+                        let duration = Duration::from_millis(1);
+                        thread::sleep(duration);
+                    },
+                    ProcStack::default(),
+                )
+            })
+            .collect::<Vec<_>>()
+    });
+}
+
+// Benchmark for a single blocking task spawn
+fn _blocking_single(b: &mut Bencher) {
+    b.iter(|| {
+        blocking::spawn_blocking(
+            async {
+                let duration = Duration::from_millis(1);
+                thread::sleep(duration);
+            },
+            ProcStack::default(),
+        )
+    });
+}
--- a/runtime/executor/benches/spawn.rs
+++ b/runtime/executor/benches/spawn.rs
@ -0,0 +1,70 @@
+#![feature(test)]
+
+extern crate test;
+
+use bastion_executor::load_balancer;
+use bastion_executor::prelude::spawn;
+use futures_timer::Delay;
+use lightproc::proc_stack::ProcStack;
+use std::time::Duration;
+use test::Bencher;
+
+#[cfg(feature = "tokio-runtime")]
+mod tokio_benchs {
+    use super::*;
+    #[bench]
+    fn spawn_lot(b: &mut Bencher) {
+        tokio_test::block_on(async { _spawn_lot(b) });
+    }
+    #[bench]
+    fn spawn_single(b: &mut Bencher) {
+        tokio_test::block_on(async {
+            _spawn_single(b);
+        });
+    }
+}
+
+#[cfg(not(feature = "tokio-runtime"))]
+mod no_tokio_benchs {
+    use super::*;
+    #[bench]
+    fn spawn_lot(b: &mut Bencher) {
+        _spawn_lot(b);
+    }
+    #[bench]
+    fn spawn_single(b: &mut Bencher) {
+        _spawn_single(b);
+    }
+}
+
+// Benchmark for a 10K burst task spawn
+fn _spawn_lot(b: &mut Bencher) {
+    let proc_stack = ProcStack::default();
+    b.iter(|| {
+        let _ = (0..10_000)
+            .map(|_| {
+                spawn(
+                    async {
+                        let duration = Duration::from_millis(1);
+                        Delay::new(duration).await;
+                    },
+                    proc_stack.clone(),
+                )
+            })
+            .collect::<Vec<_>>();
+    });
+}
+
+// Benchmark for a single task spawn
+fn _spawn_single(b: &mut Bencher) {
+    let proc_stack = ProcStack::default();
+    b.iter(|| {
+        spawn(
+            async {
+                let duration = Duration::from_millis(1);
+                Delay::new(duration).await;
+            },
+            proc_stack.clone(),
+        );
+    });
+}
--- a/runtime/executor/benches/stats.rs
+++ b/runtime/executor/benches/stats.rs
@ -0,0 +1,71 @@
+#![feature(test)]
+
+extern crate test;
+use bastion_executor::load_balancer::{core_count, get_cores, stats, SmpStats};
+use bastion_executor::placement;
+use std::thread;
+use test::Bencher;
+
+/// Hammers `stats` from one thread per core returned by `get_cores()`.
+///
+/// Each thread pins itself to its core, stores a load of 10 for that core
+/// 100 times and reads the sorted load on every third iteration. All threads
+/// are joined before the function returns.
+fn stress_stats<S: SmpStats + Sync + Send>(stats: &'static S) {
+    let mut handles = Vec::with_capacity(*core_count());
+    for core in get_cores() {
+        let handle = thread::spawn(move || {
+            placement::set_for_current(*core);
+            for i in 0..100 {
+                stats.store_load(core.id, 10);
+                if i % 3 == 0 {
+                    let _sorted_load = stats.get_sorted_load();
+                }
+            }
+        });
+        handles.push(handle);
+    }
+
+    // A panic in any worker thread surfaces here through `unwrap`.
+    for handle in handles {
+        handle.join().unwrap();
+    }
+}
+
+// previous lock based stats benchmark 1,352,791 ns/iter (+/- 2,682,013)
+
+// 158,278 ns/iter (+/- 117,103)
+#[bench]
+fn lockless_stats_bench(b: &mut Bencher) {
+    b.iter(|| {
+        stress_stats(stats());
+    });
+}
+
+/// Benchmarks `get_sorted_load` on a load table filled with an unsorted pattern.
+#[bench]
+fn lockless_stats_bad_load(b: &mut Bencher) {
+    let stats = stats();
+    const MAX_CORE: usize = 256;
+    for i in 0..MAX_CORE {
+        // Generating the worst possible mergesort scenario
+        // [0,2,4,6,8,10,1,3,5,7,9]...
+        // NOTE(review): the second half below stores `i - 1 - MAX_CORE / 2`,
+        // i.e. 0, 1, 2, ..., not the odd numbers sketched above — confirm
+        // which pattern is intended.
+        if i <= MAX_CORE / 2 {
+            stats.store_load(i, i * 2);
+        } else {
+            stats.store_load(i, i - 1 - MAX_CORE / 2);
+        }
+    }
+
+    b.iter(|| {
+        let _sorted_load = stats.get_sorted_load();
+    });
+}
+
+#[bench]
+fn lockless_stats_good_load(b: &mut Bencher) {
+    let stats = stats();
+    const MAX_CORE: usize = 256;
+    for i in 0..MAX_CORE {
+        // Generating the best possible mergesort scenario
+        // [0,1,2,3,4,5,6,7,8,9]...
+        stats.store_load(i, i);
+    }
+
+    b.iter(|| {
+        let _sorted_load = stats.get_sorted_load();
+    });
+}
--- a/runtime/executor/examples/spawn_async.rs
+++ b/runtime/executor/examples/spawn_async.rs
@ -0,0 +1,42 @@
+use std::io::Write;
+use std::panic::resume_unwind;
+use std::time::Duration;
+use executor::pool;
+use executor::prelude::*;
+
+/// Example: spawn a panicking task on the executor, await it through `run`,
+/// then print the state of the pool and its manager.
+fn main() {
+    // Report which thread panicked before printing the panic info itself.
+    std::panic::set_hook(Box::new(|info| {
+        let tid = std::thread::current().id();
+        println!("Panicking ThreadId: {:?}", tid);
+        // Best effort: a failed flush must not raise a second panic from
+        // inside the panic hook, so the result is discarded explicitly.
+        let _ = std::io::stdout().flush();
+        println!("panic hook: {:?}", info);
+    }));
+    let tid = std::thread::current().id();
+    println!("Main ThreadId: {:?}", tid);
+
+    let handle = spawn(
+        async {
+            panic!("test");
+        },
+    );
+
+    run(
+        async {
+            handle.await;
+        },
+        ProcStack {},
+    );
+
+    let pool = pool::get();
+    let manager = pool::get_manager().unwrap();
+    println!("After panic: {:?}", pool);
+    println!("{:#?}", manager);
+
+    // A plain OS thread that panics, to exercise the hook outside the
+    // executor. The handle is intentionally never joined.
+    let _h = std::thread::spawn(|| {
+        panic!("This is a test");
+    });
+
+    std::thread::sleep(Duration::from_secs(30));
+
+    println!("After panic");
+}
--- a/runtime/executor/scripts/test_blocking_thread_pool.sh
+++ b/runtime/executor/scripts/test_blocking_thread_pool.sh
@ -0,0 +1,5 @@
+#!/bin/zsh
+
+cargo test longhauling_task_join -- --ignored --exact --nocapture
+cargo test slow_join_interrupted -- --ignored --exact --nocapture
+cargo test slow_join -- --ignored --exact --nocapture
--- a/runtime/executor/src/blocking.rs
+++ b/runtime/executor/src/blocking.rs
@ -0,0 +1,165 @@
+//!
+//! Pool of threads to run heavy processes
+//!
+//! We spawn futures onto the pool with [`spawn_blocking`] method of global run queue or
+//! with corresponding [`Worker`]'s spawn method.
+//!
+//! [`Worker`]: crate::run_queue::Worker
+
+use crate::thread_manager::{DynamicPoolManager, DynamicRunner};
+use crossbeam_channel::{unbounded, Receiver, Sender};
+use lazy_static::lazy_static;
+use lightproc::lightproc::LightProc;
+use lightproc::recoverable_handle::RecoverableHandle;
+use once_cell::sync::{Lazy, OnceCell};
+use std::future::Future;
+use std::iter::Iterator;
+use std::time::Duration;
+use std::{env, thread};
+use tracing::trace;
+
+/// If low watermark isn't configured this is the default scaler value.
+/// This value is used for the heuristics of the scaler
+const DEFAULT_LOW_WATERMARK: u64 = 2;
+
+const THREAD_RECV_TIMEOUT: Duration = Duration::from_millis(100);
+
+/// Spawns a blocking task.
+///
+/// The task will be spawned onto a thread pool specifically dedicated to blocking tasks.
+///
+/// The future is wrapped with `LightProc::recoverable`, using this module's
+/// `schedule` function as its scheduler, and is scheduled right away. The
+/// handle to its result is returned to the caller.
+pub fn spawn_blocking<F, R>(future: F) -> RecoverableHandle<R>
+where
+    F: Future<Output = R> + Send + 'static,
+    R: Send + 'static,
+{
+    let (proc, recoverable) = LightProc::recoverable(future, schedule);
+    proc.schedule();
+    recoverable
+}
+
+/// Runner used by the blocking pool's threads to pull tasks from `POOL` and run them.
+///
+/// It has no fields unless the `tokio-runtime` feature is enabled.
+#[derive(Debug)]
+struct BlockingRunner {
+    // We keep a handle to the tokio runtime here to make sure
+    // it will never be dropped while the DynamicPoolManager is alive,
+    // In case we need to spin up some threads.
+    #[cfg(feature = "tokio-runtime")]
+    runtime_handle: tokio::runtime::Handle,
+}
+
+impl DynamicRunner for BlockingRunner {
+    /// Loop of a static pool thread; never returns.
+    ///
+    /// Runs tasks for as long as one arrives within `THREAD_RECV_TIMEOUT`,
+    /// then parks the thread for at most `park_timeout` and starts over.
+    fn run_static(&self, park_timeout: Duration) -> ! {
+        loop {
+            while let Ok(task) = POOL.receiver.recv_timeout(THREAD_RECV_TIMEOUT) {
+                trace!("static thread: running task");
+                self.run(task);
+            }
+
+            trace!("static: empty queue, parking with timeout");
+            thread::park_timeout(park_timeout);
+        }
+    }
+    /// Loop of a dynamic pool thread; never returns.
+    ///
+    /// Same receive loop as `run_static`, but once no task arrives within
+    /// `THREAD_RECV_TIMEOUT` it calls the supplied `parker` instead of
+    /// parking with a timeout.
+    fn run_dynamic(&self, parker: impl Fn()) -> ! {
+        loop {
+            while let Ok(task) = POOL.receiver.recv_timeout(THREAD_RECV_TIMEOUT) {
+                trace!("dynamic thread: running task");
+                self.run(task);
+            }
+            trace!(
+                "dynamic thread: parking - {:?}",
+                std::thread::current().id()
+            );
+            parker();
+        }
+    }
+    /// Runs tasks until none arrives within `THREAD_RECV_TIMEOUT`, then returns.
+    fn run_standalone(&self) {
+        while let Ok(task) = POOL.receiver.recv_timeout(THREAD_RECV_TIMEOUT) {
+            self.run(task);
+        }
+        trace!("standalone thread: quitting.");
+    }
+}
+
+impl BlockingRunner {
+    /// Executes a single task.
+    ///
+    /// With the `tokio-runtime` feature the task is handed to the runtime's
+    /// `spawn_blocking`; the returned join handle is dropped, so this call
+    /// does not wait for the task. Without the feature the task runs inline
+    /// on the calling thread.
+    fn run(&self, task: LightProc) {
+        #[cfg(feature = "tokio-runtime")]
+        {
+            self.runtime_handle.spawn_blocking(|| task.run());
+        }
+        #[cfg(not(feature = "tokio-runtime"))]
+        {
+            task.run();
+        }
+    }
+}
+
+/// Pool interface between the scheduler and thread pool
+struct Pool {
+    sender: Sender<LightProc>,
+    receiver: Receiver<LightProc>,
+}
+
+static DYNAMIC_POOL_MANAGER: OnceCell<DynamicPoolManager<BlockingRunner>> = OnceCell::new();
+
+/// Lazily-built global channel pair feeding the blocking thread pool.
+///
+/// The first access also installs the pool manager into
+/// `DYNAMIC_POOL_MANAGER` and initializes it, before the channel is created.
+static POOL: Lazy<Pool> = Lazy::new(|| {
+    // Only the construction of the runner depends on the `tokio-runtime` feature.
+    #[cfg(feature = "tokio-runtime")]
+    let runner = BlockingRunner {
+        // We use current() here instead of try_current()
+        // because we want bastion to crash as soon as possible
+        // if there is no available runtime.
+        runtime_handle: tokio::runtime::Handle::current(),
+    };
+    #[cfg(not(feature = "tokio-runtime"))]
+    let runner = BlockingRunner {};
+
+    let manager = DynamicPoolManager::new(*low_watermark() as usize, runner);
+    DYNAMIC_POOL_MANAGER
+        .set(manager)
+        .expect("couldn't create dynamic pool manager");
+
+    // NOTE(review): the manager is initialized before the channel exists; any
+    // thread it starts that dereferences `POOL` presumably waits until this
+    // initializer returns — confirm against `DynamicPoolManager::initialize`.
+    DYNAMIC_POOL_MANAGER
+        .get()
+        .expect("couldn't get static pool manager")
+        .initialize();
+
+    let (sender, receiver) = unbounded();
+    Pool { sender, receiver }
+});
+
+/// Scheduler callback for blocking tasks.
+///
+/// Pushes the task onto the pool's channel: first with a non-blocking
+/// `try_send`, and if that is refused, with a blocking `send` of the task
+/// handed back by the failed attempt. Afterwards it bumps the pool manager's
+/// frequency counter (presumably the input to its thread-scaling heuristic;
+/// see `DynamicPoolManager`).
+///
+/// # Panics
+///
+/// Panics if the fallback `send` fails.
+fn schedule(t: LightProc) {
+    let refused = POOL.sender.try_send(t).err();
+    if let Some(failure) = refused {
+        // We were not able to send to the channel without
+        // blocking.
+        POOL.sender.send(failure.into_inner()).unwrap();
+    }
+
+    // Add up for every incoming scheduled task.
+    // Dereferencing `POOL` above has already installed the manager.
+    DYNAMIC_POOL_MANAGER.get().unwrap().increment_frequency();
+}
+
+///
+/// Low watermark value, defines the bare minimum of the pool.
+/// Spawns initial thread set.
+/// Can be configurable with env var `BASTION_BLOCKING_THREADS` at runtime.
+///
+/// The variable is read once, on first call. If it is unset, is not valid
+/// Unicode, or does not parse as a `u64`, `DEFAULT_LOW_WATERMARK` is used.
+#[inline]
+fn low_watermark() -> &'static u64 {
+    lazy_static! {
+        static ref LOW_WATERMARK: u64 = {
+            // A malformed value must not panic inside the lazy initializer:
+            // that would take down the first blocking spawn. Fall back instead.
+            env::var("BASTION_BLOCKING_THREADS")
+                .ok()
+                .and_then(|raw| raw.parse::<u64>().ok())
+                .unwrap_or(DEFAULT_LOW_WATERMARK)
+        };
+    }
+
+    &*LOW_WATERMARK
+}
--- a/runtime/executor/src/lib.rs
+++ b/runtime/executor/src/lib.rs
@ -0,0 +1,50 @@
+//!
+//!
+//!
+//! Bastion Executor is a NUMA-aware, SMP-based, fault-tolerant executor.
+//!
+//! Bastion Executor is a highly-available, fault-tolerant, async communication
+//! oriented executor. Bastion's main idea is supplying a fully async runtime
+//! with fault-tolerance to work on heavy loads.
+//!
+//! The main differences from other executors are:
+//! * It uses an SMP-based execution scheme to exploit cache affinity across multiple cores; execution is
+//! distributed equally over the system's resources, so the whole system is utilized.
+//! * It uses NUMA-aware allocation for the scheduler's queues and exploits locality on server workloads.
+//! * It is tailored for building middleware and for actor-model-like concurrency and distributed communication.
+//!
+//! **NOTE:** Bastion Executor is independent of its framework implementation.
+//! It uses [lightproc] to encapsulate and provide fault-tolerance to your future-based workloads.
+//! You can use your futures with [lightproc] to run your workloads on Bastion Executor without needing the framework.
+//!
+//! [lightproc]: https://docs.rs/lightproc
+//!
+
+#![doc(
+    html_logo_url = "https://raw.githubusercontent.com/bastion-rs/bastion/master/img/bastion-logo.png"
+)]
+// Force missing implementations
+#![warn(missing_docs)]
+#![warn(missing_debug_implementations)]
+#![warn(unused_imports)]
+#![forbid(unused_must_use)]
+#![forbid(unused_import_braces)]
+
+pub mod blocking;
+pub mod load_balancer;
+pub mod placement;
+pub mod pool;
+pub mod run;
+pub mod sleepers;
+mod thread_manager;
+pub mod worker;
+mod proc_stack;
+
+///
+/// Prelude of Bastion Executor
+pub mod prelude {
+    pub use crate::blocking::*;
+    pub use crate::pool::*;
+    pub use crate::run::*;
+    pub use crate::proc_stack::*;
+}
--- a/runtime/executor/src/load_balancer.rs
+++ b/runtime/executor/src/load_balancer.rs
@ -0,0 +1,234 @@
+//!
+//! Module for gathering statistics about the run queues of the runtime
+//!
+//! The load balancer computes the mean of the sampled per-core queue loads
+//! and makes it available to the whole runtime.
+//!
+use crate::load_balancer;
+use crate::placement;
+use arrayvec::ArrayVec;
+use fmt::{Debug, Formatter};
+use lazy_static::*;
+use once_cell::sync::Lazy;
+use placement::CoreId;
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::RwLock;
+use std::time::{Duration, Instant};
+use std::{fmt, usize};
+use tracing::{debug, error};
+
+const MEAN_UPDATE_TRESHOLD: Duration = Duration::from_millis(200);
+
+/// Stats of all the smp queues.
+pub trait SmpStats {
+    /// Stores the load of the given queue.
+    fn store_load(&self, affinity: usize, load: usize);
+    /// Returns `(queue id, load)` tuples ordered from highest load to lowest.
+    fn get_sorted_load(&self) -> ArrayVec<(usize, usize), MAX_CORE>;
+    /// Returns the mean load across all smp queues.
+    fn mean(&self) -> usize;
+    /// Recomputes the mean load across the smp queues.
+    fn update_mean(&self);
+}
+
+/// Process-wide load balancer, built on first access from the core ids
+/// reported by `placement::get_core_ids()`.
+///
+/// The first access panics if `placement::get_core_ids()` returns `None`.
+static LOAD_BALANCER: Lazy<LoadBalancer> = Lazy::new(|| {
+    let lb = LoadBalancer::new(placement::get_core_ids().unwrap());
+    debug!("Instantiated load_balancer: {:?}", lb);
+    lb
+});
+
+/// Load-balancer struct which allows us to update the mean load
+pub struct LoadBalancer {
+    /// The number of cores
+    /// available for this program
+    pub num_cores: usize,
+    /// The core Ids available for this program
+    /// This doesn't take affinity into account
+    pub cores: Vec<CoreId>,
+    mean_last_updated_at: RwLock<Instant>,
+}
+
+impl LoadBalancer {
+    /// Creates a new LoadBalancer.
+    /// if you're looking for `num_cores` and `cores`
+    /// Have a look at `load_balancer::core_count()`
+    /// and `load_balancer::get_cores()` respectively.
+    pub fn new(cores: Vec<CoreId>) -> Self {
+        Self {
+            num_cores: cores.len(),
+            cores,
+            mean_last_updated_at: RwLock::new(Instant::now()),
+        }
+    }
+}
+
+impl Debug for LoadBalancer {
+    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
+        fmt.debug_struct("LoadBalancer")
+            .field("num_cores", &self.num_cores)
+            .field("cores", &self.cores)
+            .field("mean_last_updated_at", &self.mean_last_updated_at)
+            .finish()
+    }
+}
+
+impl LoadBalancer {
+    /// Iterates the statistics to get the mean load across the cores
+    ///
+    /// Does nothing unless `should_update` reports that more than
+    /// `MEAN_UPDATE_TRESHOLD` has elapsed since the last update. Otherwise the
+    /// timestamp is refreshed first and the global stats then recompute their mean.
+    pub fn update_load_mean(&self) {
+        // Check if update should occur
+        if !self.should_update() {
+            return;
+        }
+        // A failure to take the write lock is only logged; the mean is still
+        // recomputed below.
+        self.mean_last_updated_at
+            .write()
+            .map(|mut last_updated_at| {
+                *last_updated_at = Instant::now();
+            })
+            .unwrap_or_else(|e| error!("couldn't update mean timestamp - {}", e));
+
+        load_balancer::stats().update_mean();
+    }
+
+    /// Returns `true` when the last update is older than `MEAN_UPDATE_TRESHOLD`.
+    ///
+    /// Returns `false` when the read lock cannot be taken without blocking.
+    fn should_update(&self) -> bool {
+        // If we couldn't acquire a lock on the mean last_updated_at,
+        // There is probably someone else updating already
+        self.mean_last_updated_at
+            .try_read()
+            .map(|last_updated_at| last_updated_at.elapsed() > MEAN_UPDATE_TRESHOLD)
+            .unwrap_or(false)
+    }
+}
+
+/// Update the mean load on the singleton
+pub fn update() {
+    LOAD_BALANCER.update_load_mean()
+}
+
+/// Maximum number of cores tracked by [`Stats`]; sizes the fixed per-core load array.
+const MAX_CORE: usize = 256;
+
+///
+/// Holding all statistics related to the run queue
+///
+/// Contains:
+/// * Mean level of processes in the run queues
+/// * SMP queue distributions
+pub struct Stats {
+    // Per-core load, indexed by core/queue id. `Stats::new` fills the slots
+    // beyond the core count with `usize::MAX` as an "unused" marker.
+    smp_load: [AtomicUsize; MAX_CORE],
+    // Mean load as of the last `update_mean` call.
+    mean_level: AtomicUsize,
+    // Flag raised while an `update_mean` call is in progress.
+    updating_mean: AtomicBool,
+}
+
+impl fmt::Debug for Stats {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct("Stats")
+            .field("smp_load", &&self.smp_load[..])
+            .field("mean_level", &self.mean_level)
+            .field("updating_mean", &self.updating_mean)
+            .finish()
+    }
+}
+
+impl Stats {
+    /// Creates a new `Stats` table.
+    ///
+    /// The first `num_cores` slots start with a load of 0; every remaining
+    /// slot holds `usize::MAX`, which `get_sorted_load` treats as the end marker.
+    pub fn new(num_cores: usize) -> Stats {
+        let smp_load: [AtomicUsize; MAX_CORE] = {
+            // SAFETY: the value produced is an array of `MaybeUninit`, whose
+            // elements do not require initialization themselves.
+            let mut data: [MaybeUninit<AtomicUsize>; MAX_CORE] =
+                unsafe { MaybeUninit::uninit().assume_init() };
+
+            for core_data in data.iter_mut().take(num_cores) {
+                // SAFETY: `as_mut_ptr` points at storage owned by `data` that
+                // is valid for a write of one `AtomicUsize`.
+                unsafe {
+                    std::ptr::write(core_data.as_mut_ptr(), AtomicUsize::new(0));
+                }
+            }
+            for core_data in data.iter_mut().take(MAX_CORE).skip(num_cores) {
+                // SAFETY: same as above; these are the slots past `num_cores`.
+                unsafe {
+                    std::ptr::write(core_data.as_mut_ptr(), AtomicUsize::new(usize::MAX));
+                }
+            }
+
+            // SAFETY: the two loops above wrote every one of the `MAX_CORE`
+            // slots, and `MaybeUninit<T>` has the same layout as `T`.
+            unsafe { std::mem::transmute::<_, [AtomicUsize; MAX_CORE]>(data) }
+        };
+        Stats {
+            smp_load,
+            mean_level: AtomicUsize::new(0),
+            updating_mean: AtomicBool::new(false),
+        }
+    }
+}
+
+// NOTE(review): every field of `Stats` is an atomic (or an array of atomics),
+// which are already `Send + Sync`, so these manual impls look redundant —
+// confirm before removing them.
+unsafe impl Sync for Stats {}
+unsafe impl Send for Stats {}
+
+impl SmpStats for Stats {
+    /// Records `load` for the queue with index `affinity`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `affinity >= MAX_CORE`.
+    fn store_load(&self, affinity: usize, load: usize) {
+        self.smp_load[affinity].store(load, Ordering::SeqCst);
+    }
+
+    /// Returns `(queue index, load)` pairs sorted from highest to lowest load.
+    ///
+    /// Scanning stops at the first slot holding `usize::MAX`, the marker
+    /// `Stats::new` writes into the slots beyond the core count.
+    fn get_sorted_load(&self) -> ArrayVec<(usize, usize), MAX_CORE> {
+        let mut sorted_load = ArrayVec::new();
+
+        for (core, load) in self.smp_load.iter().enumerate() {
+            let load = load.load(Ordering::SeqCst);
+            // load till maximum core.
+            if load == usize::MAX {
+                break;
+            }
+            // SAFETY: at most `self.smp_load.len() == MAX_CORE` items are
+            // pushed, which is exactly the capacity of `sorted_load`.
+            unsafe { sorted_load.push_unchecked((core, load)) };
+        }
+        sorted_load.sort_by(|x, y| y.1.cmp(&x.1));
+        sorted_load
+    }
+
+    /// Returns the mean stored by the last `update_mean` call.
+    fn mean(&self) -> usize {
+        self.mean_level.load(Ordering::Acquire)
+    }
+
+    /// Recomputes the mean load over the first `num_cores` slots.
+    ///
+    /// Returns immediately if another thread is already updating the mean.
+    fn update_mean(&self) {
+        let num_cores = LOAD_BALANCER.num_cores;
+
+        // Claim the update atomically. A separate load followed by a store
+        // would let two threads both observe `false` and update concurrently.
+        if self
+            .updating_mean
+            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
+            .is_err()
+        {
+            return;
+        }
+
+        let mut sum: usize = 0;
+        for item in self.smp_load.iter().take(num_cores) {
+            // A load that would overflow the sum is skipped, not wrapped.
+            if let Some(tmp) = sum.checked_add(item.load(Ordering::Acquire)) {
+                sum = tmp;
+            }
+        }
+
+        // `checked_div` keeps a zero core count from panicking while the
+        // `updating_mean` flag is still raised.
+        self.mean_level
+            .store(sum.checked_div(num_cores).unwrap_or(0), Ordering::Release);
+
+        self.updating_mean.store(false, Ordering::Release);
+    }
+}
+
+///
+/// Static access to runtime statistics
+///
+/// The `Stats` instance is created on first call, sized with `core_count()`.
+#[inline]
+pub fn stats() -> &'static Stats {
+    lazy_static! {
+        static ref LOCKLESS_STATS: Stats = Stats::new(*core_count());
+    }
+    &*LOCKLESS_STATS
+}
+
+///
+/// Retrieve core count for the runtime scheduling purposes
+///
+/// Reads `num_cores` from the global load balancer, creating it on first use.
+#[inline]
+pub fn core_count() -> &'static usize {
+    &LOAD_BALANCER.num_cores
+}
+
+///
+/// Retrieve cores for the runtime scheduling purposes
+///
+/// Borrows the core id list held by the global load balancer, creating it on first use.
+#[inline]
+pub fn get_cores() -> &'static [CoreId] {
+    &*LOAD_BALANCER.cores
+}
--- a/runtime/executor/src/placement.rs
+++ b/runtime/executor/src/placement.rs
@ -0,0 +1,414 @@
+//! Core placement configuration and management
+//!
+//! Placement module enables thread placement onto the cores.
+//! CPU level affinity assignment is done here.
+
+/// This function tries to retrieve information
+/// on all the "cores" active on this system.
+///
+/// Delegates to the platform-specific `get_core_ids_helper` selected at compile time.
+/// `None` means the platform helper could not provide the list; the stub used on
+/// targets other than Linux, Windows and macOS always returns `None`.
+pub fn get_core_ids() -> Option<Vec<CoreId>> {
+    get_core_ids_helper()
+}
+
+/// This function tries to retrieve
+/// the number of active "cores" on the system.
+///
+/// Yields `None` exactly when [`get_core_ids`] does.
+pub fn get_num_cores() -> Option<usize> {
+    let ids = get_core_ids()?;
+    Some(ids.len())
+}
+///
+/// Sets the current thread's affinity
+///
+/// Emits a trace event naming the target core, then delegates to the platform-specific
+/// `set_for_current_helper` selected at compile time.
+pub fn set_for_current(core_id: CoreId) {
+    tracing::trace!("Executor: placement: set affinity on core {}", core_id.id);
+    set_for_current_helper(core_id);
+}
+
+///
+/// CoreID implementation to identify system cores.
+///
+/// A small `Copy` wrapper around the numeric core index used by the platform helpers.
+#[derive(Copy, Clone, Debug)]
+pub struct CoreId {
+    /// Used core ID
+    pub id: usize,
+}
+
+// Linux Section
+
+/// Linux implementation of the core listing: forwards to `linux::get_core_ids`.
+#[cfg(target_os = "linux")]
+#[inline]
+fn get_core_ids_helper() -> Option<Vec<CoreId>> {
+    linux::get_core_ids()
+}
+
+/// Linux implementation of thread pinning: forwards to `linux::set_for_current`.
+#[cfg(target_os = "linux")]
+#[inline]
+fn set_for_current_helper(core_id: CoreId) {
+    linux::set_for_current(core_id);
+}
+
+/// Linux backend built on `sched_getaffinity` / `sched_setaffinity`.
+#[cfg(target_os = "linux")]
+mod linux {
+    use std::mem;
+
+    use libc::{cpu_set_t, sched_getaffinity, sched_setaffinity, CPU_ISSET, CPU_SET, CPU_SETSIZE};
+
+    use super::CoreId;
+
+    /// Lists every core present in the calling thread's affinity mask.
+    ///
+    /// Returns `None` when the mask cannot be read.
+    pub fn get_core_ids() -> Option<Vec<CoreId>> {
+        let full_set = get_affinity_mask()?;
+
+        let core_ids = (0..CPU_SETSIZE as usize)
+            // Every probed index is below `CPU_SETSIZE`, the capacity of `cpu_set_t`.
+            .filter(|&i| unsafe { CPU_ISSET(i, &full_set) })
+            .map(|id| CoreId { id })
+            .collect();
+
+        Some(core_ids)
+    }
+
+    /// Pins the calling thread to `core_id`.
+    ///
+    /// Pinning is best effort: a failure is reported through a `tracing` warning rather
+    /// than being silently discarded, and the thread keeps its previous affinity.
+    pub fn set_for_current(core_id: CoreId) {
+        // Turn `core_id` into a `libc::cpu_set_t` with only
+        // one core active.
+        let mut set = new_cpu_set();
+
+        unsafe { CPU_SET(core_id.id, &mut set) };
+
+        // Set the current thread's core affinity.
+        let result = unsafe {
+            sched_setaffinity(
+                0, // Defaults to current thread
+                mem::size_of::<cpu_set_t>(),
+                &set,
+            )
+        };
+
+        if result != 0 {
+            tracing::warn!(
+                "Executor: placement: failed to set affinity on core {}: {}",
+                core_id.id,
+                std::io::Error::last_os_error()
+            );
+        }
+    }
+
+    /// Reads the calling thread's affinity mask, or `None` if the syscall fails.
+    fn get_affinity_mask() -> Option<cpu_set_t> {
+        let mut set = new_cpu_set();
+
+        // Try to get current core affinity mask.
+        let result = unsafe {
+            sched_getaffinity(
+                0, // Defaults to current thread
+                mem::size_of::<cpu_set_t>(),
+                &mut set,
+            )
+        };
+
+        if result == 0 {
+            Some(set)
+        } else {
+            None
+        }
+    }
+
+    /// Builds an empty (all-zero) CPU set.
+    fn new_cpu_set() -> cpu_set_t {
+        unsafe { mem::zeroed::<cpu_set_t>() }
+    }
+
+    #[cfg(test)]
+    mod tests {
+
+        use super::*;
+
+        #[test]
+        fn test_linux_get_affinity_mask() {
+            match get_affinity_mask() {
+                Some(_) => {}
+                None => {
+                    panic!();
+                }
+            }
+        }
+
+        #[test]
+        fn test_linux_get_core_ids() {
+            match get_core_ids() {
+                Some(set) => {
+                    assert_eq!(set.len(), num_cpus::get());
+                }
+                None => {
+                    panic!();
+                }
+            }
+        }
+
+        #[test]
+        fn test_linux_set_for_current() {
+            let ids = get_core_ids().unwrap();
+
+            assert!(!ids.is_empty());
+
+            set_for_current(ids[0]);
+
+            // Ensure that the system pinned the current thread
+            // to the specified core.
+            let mut core_mask = new_cpu_set();
+            unsafe { CPU_SET(ids[0].id, &mut core_mask) };
+
+            let new_mask = get_affinity_mask().unwrap();
+
+            let mut is_equal = true;
+
+            for i in 0..CPU_SETSIZE as usize {
+                let is_set1 = unsafe { CPU_ISSET(i, &core_mask) };
+                let is_set2 = unsafe { CPU_ISSET(i, &new_mask) };
+
+                if is_set1 != is_set2 {
+                    is_equal = false;
+                }
+            }
+
+            assert!(is_equal);
+        }
+    }
+}
+
+// Windows Section
+
+/// Windows implementation of the core listing: forwards to `windows::get_core_ids`.
+#[cfg(target_os = "windows")]
+#[inline]
+fn get_core_ids_helper() -> Option<Vec<CoreId>> {
+    windows::get_core_ids()
+}
+
+/// Windows implementation of thread pinning: forwards to `windows::set_for_current`.
+#[cfg(target_os = "windows")]
+#[inline]
+fn set_for_current_helper(core_id: CoreId) {
+    windows::set_for_current(core_id);
+}
+
+#[cfg(target_os = "windows")]
+extern crate winapi;
+
+/// Windows backend built on `GetProcessAffinityMask` / `SetThreadAffinityMask`.
+#[cfg(target_os = "windows")]
+mod windows {
+    #[allow(unused_imports)]
+    use winapi::shared::basetsd::{DWORD_PTR, PDWORD_PTR};
+    use winapi::um::processthreadsapi::{GetCurrentProcess, GetCurrentThread};
+    use winapi::um::winbase::{GetProcessAffinityMask, SetThreadAffinityMask};
+
+    use super::CoreId;
+
+    /// Lists every core whose bit is set in the process affinity mask.
+    ///
+    /// Returns `None` when the mask cannot be read.
+    pub fn get_core_ids() -> Option<Vec<CoreId>> {
+        let mask = get_affinity_mask()?;
+
+        // Probe each bit of the pointer-sized mask.
+        let core_ids = (0..usize::BITS as usize)
+            .filter(|&i| (mask & (1_usize << i)) != 0)
+            .map(|id| CoreId { id })
+            .collect();
+
+        Some(core_ids)
+    }
+
+    /// Pins the calling thread to `core_id`.
+    ///
+    /// Pinning is best effort: a failure is reported through a `tracing` warning rather
+    /// than being silently discarded.
+    pub fn set_for_current(core_id: CoreId) {
+        // Convert `CoreId` back into mask.
+        let mask: DWORD_PTR = 1 << core_id.id;
+
+        // Set core affinity for current thread. The call returns the previous mask,
+        // or zero on failure.
+        let previous = unsafe { SetThreadAffinityMask(GetCurrentThread(), mask) };
+
+        if previous == 0 {
+            tracing::warn!(
+                "Executor: placement: failed to set affinity on core {}: {}",
+                core_id.id,
+                std::io::Error::last_os_error()
+            );
+        }
+    }
+
+    /// Reads the process affinity mask, or `None` if the call fails.
+    fn get_affinity_mask() -> Option<usize> {
+        let mut process_mask: usize = 0;
+        let mut system_mask: usize = 0;
+
+        let res = unsafe {
+            GetProcessAffinityMask(
+                GetCurrentProcess(),
+                &mut process_mask as PDWORD_PTR,
+                &mut system_mask as PDWORD_PTR,
+            )
+        };
+
+        // Successfully retrieved affinity mask
+        if res != 0 {
+            Some(process_mask)
+        }
+        // Failed to retrieve affinity mask
+        else {
+            None
+        }
+    }
+
+    #[cfg(test)]
+    mod tests {
+
+        use super::*;
+
+        #[test]
+        fn test_windows_get_core_ids() {
+            match get_core_ids() {
+                Some(set) => {
+                    assert_eq!(set.len(), num_cpus::get());
+                }
+                None => {
+                    panic!();
+                }
+            }
+        }
+
+        #[test]
+        fn test_windows_set_for_current() {
+            let ids = get_core_ids().unwrap();
+
+            assert!(!ids.is_empty());
+
+            set_for_current(ids[0]);
+        }
+    }
+}
+
+// MacOS Section
+
+/// macOS implementation of the core listing: forwards to `macos::get_core_ids`.
+#[cfg(target_os = "macos")]
+#[inline]
+fn get_core_ids_helper() -> Option<Vec<CoreId>> {
+    macos::get_core_ids()
+}
+
+/// macOS implementation of thread pinning: forwards to `macos::set_for_current`.
+#[cfg(target_os = "macos")]
+#[inline]
+fn set_for_current_helper(core_id: CoreId) {
+    macos::set_for_current(core_id);
+}
+
+/// macOS backend built on `thread_policy_set` with the affinity-tag policy.
+#[cfg(target_os = "macos")]
+mod macos {
+    use std::mem;
+
+    use libc::{c_int, c_uint, pthread_self};
+
+    use super::CoreId;
+
+    type KernReturnT = c_int;
+    type IntegerT = c_int;
+    type NaturalT = c_uint;
+    type ThreadT = c_uint;
+    type ThreadPolicyFlavorT = NaturalT;
+    type MachMsgTypeNumberT = NaturalT;
+
+    /// Payload passed to `thread_policy_set` for the affinity policy.
+    #[repr(C)]
+    struct ThreadAffinityPolicyDataT {
+        affinity_tag: IntegerT,
+    }
+
+    type ThreadPolicyT = *mut ThreadAffinityPolicyDataT;
+
+    const THREAD_AFFINITY_POLICY: ThreadPolicyFlavorT = 4;
+
+    #[link(name = "System", kind = "framework")]
+    extern "C" {
+        fn thread_policy_set(
+            thread: ThreadT,
+            flavor: ThreadPolicyFlavorT,
+            policy_info: ThreadPolicyT,
+            count: MachMsgTypeNumberT,
+        ) -> KernReturnT;
+    }
+
+    /// Lists one `CoreId` per CPU reported by `num_cpus::get()`; never returns `None`.
+    pub fn get_core_ids() -> Option<Vec<CoreId>> {
+        Some(
+            (0..num_cpus::get())
+                .map(|id| CoreId { id })
+                .collect::<Vec<_>>(),
+        )
+    }
+
+    /// Requests that the calling thread be scheduled with `core_id` as its affinity tag.
+    ///
+    /// The request is best effort: a non-zero kernel return code is reported through a
+    /// `tracing` warning rather than being silently discarded.
+    pub fn set_for_current(core_id: CoreId) {
+        // The policy size is expressed as a count of `IntegerT` words.
+        let thread_affinity_policy_count: MachMsgTypeNumberT =
+            mem::size_of::<ThreadAffinityPolicyDataT>() as MachMsgTypeNumberT
+                / mem::size_of::<IntegerT>() as MachMsgTypeNumberT;
+
+        let mut info = ThreadAffinityPolicyDataT {
+            affinity_tag: core_id.id as IntegerT,
+        };
+
+        // NOTE(review): `pthread_self()` is cast straight to the Mach thread type here;
+        // confirm whether a conversion such as `pthread_mach_thread_np` is required.
+        let status = unsafe {
+            thread_policy_set(
+                pthread_self() as ThreadT,
+                THREAD_AFFINITY_POLICY,
+                &mut info as ThreadPolicyT,
+                thread_affinity_policy_count,
+            )
+        };
+
+        if status != 0 {
+            tracing::warn!(
+                "Executor: placement: failed to set affinity on core {}: kern_return {}",
+                core_id.id,
+                status
+            );
+        }
+    }
+
+    #[cfg(test)]
+    mod tests {
+
+        use super::*;
+
+        #[test]
+        fn test_macos_get_core_ids() {
+            match get_core_ids() {
+                Some(set) => {
+                    assert_eq!(set.len(), num_cpus::get());
+                }
+                None => {
+                    panic!();
+                }
+            }
+        }
+
+        #[test]
+        fn test_macos_set_for_current() {
+            let ids = get_core_ids().unwrap();
+
+            assert!(!ids.is_empty());
+
+            set_for_current(ids[0]);
+        }
+    }
+}
+
+// Stub Section
+
+/// Fallback for targets other than Linux, Windows and macOS: no core information is
+/// available, so this always returns `None`.
+#[cfg(not(any(target_os = "linux", target_os = "windows", target_os = "macos")))]
+#[inline]
+fn get_core_ids_helper() -> Option<Vec<CoreId>> {
+    None
+}
+
+/// Fallback for targets other than Linux, Windows and macOS: pinning is not supported,
+/// so the request is ignored.
+#[cfg(not(any(target_os = "linux", target_os = "windows", target_os = "macos")))]
+#[inline]
+fn set_for_current_helper(_core_id: CoreId) {}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    /// The reported core list must match the CPU count seen by `num_cpus`.
+    #[test]
+    fn test_get_core_ids() {
+        if let Some(set) = get_core_ids() {
+            assert_eq!(set.len(), num_cpus::get());
+        } else {
+            panic!();
+        }
+    }
+
+    /// Pinning the current thread to the first reported core must not panic.
+    #[test]
+    fn test_set_for_current() {
+        let ids = get_core_ids().unwrap();
+
+        assert!(!ids.is_empty());
+
+        let first = ids[0];
+        set_for_current(first);
+    }
+}
--- a/runtime/executor/src/pool.rs
+++ b/runtime/executor/src/pool.rs
@ -0,0 +1,223 @@
+//!
+//! Pool of threads to run lightweight processes
+//!
+//! We spawn futures onto the pool with [`spawn`] method of global run queue or
+//! with corresponding [`Worker`]'s spawn method.
+//!
+//! [`spawn`]: crate::pool::spawn
+//! [`Worker`]: crate::run_queue::Worker
+
+use crate::thread_manager::{DynamicPoolManager, DynamicRunner};
+use crate::worker;
+use crossbeam_channel::{unbounded, Receiver, Sender};
+use lazy_static::lazy_static;
+use lightproc::lightproc::LightProc;
+use lightproc::recoverable_handle::RecoverableHandle;
+use once_cell::sync::{Lazy, OnceCell};
+use std::future::Future;
+use std::iter::Iterator;
+use std::time::Duration;
+use std::{env, thread};
+use tracing::trace;
+
+///
+/// Spawn a process (which contains future + process stack) onto the executor from the global level.
+///
+/// The future is wrapped into a recoverable [`LightProc`] whose scheduling callback is
+/// `worker::schedule`. The process is scheduled right away and its handle is returned.
+///
+/// # Example
+/// ```rust
+/// use executor::prelude::*;
+///
+/// # #[cfg(feature = "tokio-runtime")]
+/// # #[tokio::main]
+/// # async fn main() {
+/// #    start();
+/// # }
+/// #
+/// # #[cfg(not(feature = "tokio-runtime"))]
+/// # fn main() {
+/// #    start();
+/// # }
+/// #
+/// # fn start() {
+///
+/// let handle = spawn(
+///     async {
+///         panic!("test");
+///     },
+/// );
+///
+/// run(
+///     async {
+///         handle.await;
+///     },
+///     ProcStack { },
+/// );
+/// # }
+/// ```
+pub fn spawn<F, R>(future: F) -> RecoverableHandle<R>
+where
+    F: Future<Output = R> + Send + 'static,
+    R: Send + 'static,
+{
+    let (proc, recoverable) = LightProc::recoverable(future, worker::schedule);
+    proc.schedule();
+    recoverable
+}
+
+/// Spawns a blocking task.
+///
+/// The task will be spawned onto a thread pool specifically dedicated to blocking tasks.
+///
+/// The future is wrapped into a recoverable [`LightProc`] whose scheduling callback is
+/// this module's `schedule`, which pushes it onto the pool channel.
+pub fn spawn_blocking<F, R>(future: F) -> RecoverableHandle<R>
+where
+    F: Future<Output = R> + Send + 'static,
+    R: Send + 'static,
+{
+    let (proc, recoverable) = LightProc::recoverable(future, schedule);
+    proc.schedule();
+    recoverable
+}
+
+///
+/// Acquire the static Pool reference
+///
+/// The first access forces initialisation of the lazily built `POOL`.
+#[inline]
+pub fn get() -> &'static Pool {
+    Lazy::force(&POOL)
+}
+
+/// Returns the global dynamic pool manager, or `None` if `DYNAMIC_POOL_MANAGER` has not
+/// been set yet (it is set while `POOL` is being initialised).
+pub fn get_manager() -> Option<&'static DynamicPoolManager<AsyncRunner>> {
+    DYNAMIC_POOL_MANAGER.get()
+}
+
+impl Pool {
+    ///
+    /// Spawn a process (which contains future + process stack) onto the executor via [Pool] interface.
+    ///
+    /// Behaves exactly like the module-level [`spawn`] function, to which it delegates.
+    pub fn spawn<F, R>(&self, future: F) -> RecoverableHandle<R>
+    where
+        F: Future<Output = R> + Send + 'static,
+        R: Send + 'static,
+    {
+        spawn(future)
+    }
+}
+
+/// Enqueues work on the pool channel.
+///
+/// A non-blocking send is attempted first. If it is rejected, the task is sent again with
+/// a blocking send, which panics if the channel is disconnected. Every scheduled task
+/// also bumps the pool manager's frequency counter, which feeds its thread scaling.
+pub(crate) fn schedule(t: LightProc) {
+    match POOL.sender.try_send(t) {
+        Ok(()) => {}
+        // We were not able to send to the channel without
+        // blocking.
+        Err(rejected) => POOL.sender.send(rejected.into_inner()).unwrap(),
+    }
+
+    // Add up for every incoming scheduled task
+    let manager = DYNAMIC_POOL_MANAGER.get().unwrap();
+    manager.increment_frequency();
+}
+
+///
+/// Low watermark value, defines the bare minimum of the pool.
+/// Spawns initial thread set.
+/// Can be configured with the env var `BASTION_BLOCKING_THREADS`; the variable is read
+/// once, on first use, and the result is cached for the lifetime of the process.
+///
+/// An unset variable yields [`DEFAULT_LOW_WATERMARK`]. A value that is not valid Unicode
+/// or does not parse as a `u64` is reported with a `tracing` warning and also falls back
+/// to the default instead of panicking.
+#[inline]
+fn low_watermark() -> &'static u64 {
+    lazy_static! {
+        static ref LOW_WATERMARK: u64 = {
+            match env::var("BASTION_BLOCKING_THREADS") {
+                Ok(raw) => raw.trim().parse::<u64>().unwrap_or_else(|err| {
+                    tracing::warn!(
+                        "BASTION_BLOCKING_THREADS={:?} is not a valid thread count ({}), using {}",
+                        raw,
+                        err,
+                        DEFAULT_LOW_WATERMARK
+                    );
+                    DEFAULT_LOW_WATERMARK
+                }),
+                Err(env::VarError::NotPresent) => DEFAULT_LOW_WATERMARK,
+                Err(env::VarError::NotUnicode(raw)) => {
+                    tracing::warn!(
+                        "BASTION_BLOCKING_THREADS={:?} is not valid unicode, using {}",
+                        raw,
+                        DEFAULT_LOW_WATERMARK
+                    );
+                    DEFAULT_LOW_WATERMARK
+                }
+            }
+        };
+    }
+
+    &*LOW_WATERMARK
+}
+
+/// If low watermark isn't configured this is the default scaler value.
+/// This value is used for the heuristics of the scaler
+///
+/// `low_watermark` falls back to it when `BASTION_BLOCKING_THREADS` is not set.
+const DEFAULT_LOW_WATERMARK: u64 = 2;
+
+/// Pool interface between the scheduler and thread pool
+///
+/// Holds both halves of the channel that carries scheduled tasks.
+#[derive(Debug)]
+pub struct Pool {
+    /// Sending half of the task channel; `schedule` pushes tasks through it.
+    sender: Sender<LightProc>,
+    /// Receiving half of the task channel; `AsyncRunner` pulls tasks from it.
+    receiver: Receiver<LightProc>,
+}
+
+/// Runner that executes the tasks queued on the pool channel; it implements
+/// `DynamicRunner` and currently carries no state.
+#[derive(Debug)]
+pub struct AsyncRunner {
+
+}
+
+impl DynamicRunner for AsyncRunner {
+    /// Never returns: runs every task received from the pool channel, parking with
+    /// `park_timeout` whenever the receive loop ends.
+    fn run_static(&self, park_timeout: Duration) -> ! {
+        loop {
+            // `iter()` is the blocking iterator that `&Receiver` also yields in a `for`.
+            for task in POOL.receiver.iter() {
+                trace!("static: running task");
+                self.run(task);
+            }
+
+            trace!("static: empty queue, parking with timeout");
+            thread::park_timeout(park_timeout);
+        }
+    }
+
+    /// Never returns: drains the tasks that are ready without blocking, then calls
+    /// `parker` before draining again.
+    fn run_dynamic(&self, parker: impl Fn()) -> ! {
+        loop {
+            for task in POOL.receiver.try_iter() {
+                trace!("dynamic thread: running task");
+                self.run(task);
+            }
+
+            let current = std::thread::current().id();
+            trace!("dynamic thread: parking - {:?}", current);
+            parker();
+        }
+    }
+
+    /// Drains the tasks that are ready without blocking, then returns.
+    fn run_standalone(&self) {
+        for task in POOL.receiver.try_iter() {
+            self.run(task);
+        }
+        trace!("standalone thread: quitting.");
+    }
+}
+
+impl AsyncRunner {
+    /// Runs a single task on the calling thread by invoking `LightProc::run`.
+    fn run(&self, task: LightProc) {
+        task.run();
+    }
+}
+
+/// Global manager of the pool's threads. It starts out empty and is set exactly once,
+/// while `POOL` is being initialised.
+static DYNAMIC_POOL_MANAGER: OnceCell<DynamicPoolManager<AsyncRunner>> = OnceCell::new();
+
+/// Lazily initialised global pool.
+///
+/// First access builds the `DynamicPoolManager` (sized by `low_watermark`), stores it in
+/// `DYNAMIC_POOL_MANAGER`, calls `initialize()` on it, and only then creates the
+/// unbounded task channel held by the returned `Pool`.
+static POOL: Lazy<Pool> = Lazy::new(|| {
+    // NOTE(review): this branch sets a `runtime_handle` field, but `AsyncRunner` as
+    // declared in this file has no fields — confirm before enabling `tokio-runtime`.
+    #[cfg(feature = "tokio-runtime")]
+    {
+        let runner = AsyncRunner {
+            // We use current() here instead of try_current()
+            // because we want bastion to crash as soon as possible
+            // if there is no available runtime.
+            runtime_handle: tokio::runtime::Handle::current(),
+        };
+
+        DYNAMIC_POOL_MANAGER
+            .set(DynamicPoolManager::new(*low_watermark() as usize, runner))
+            .expect("couldn't create dynamic pool manager");
+    }
+    #[cfg(not(feature = "tokio-runtime"))]
+    {
+        let runner = AsyncRunner {};
+
+        // `set` fails if the cell was already populated; that is treated as fatal.
+        DYNAMIC_POOL_MANAGER
+            .set(DynamicPoolManager::new(*low_watermark() as usize, runner))
+            .expect("couldn't create dynamic pool manager");
+    }
+
+    DYNAMIC_POOL_MANAGER
+        .get()
+        .expect("couldn't get static pool manager")
+        .initialize();
+
+    // The channel is created after the manager has been initialised.
+    let (sender, receiver) = unbounded();
+    Pool { sender, receiver }
+});
--- a/runtime/executor/src/proc_stack.rs
+++ b/runtime/executor/src/proc_stack.rs
@ -0,0 +1,5 @@
+
+/// Per-process stack passed as the `stack` argument of `run`.
+///
+/// It currently carries no data. `Default` is derived so that `ProcStack::default()`,
+/// as used in the `run` documentation example, compiles.
+#[derive(Debug, Default)]
+pub struct ProcStack {}
--- a/runtime/executor/src/run.rs
+++ b/runtime/executor/src/run.rs
@ -0,0 +1,154 @@
+//!
+//! Blocking run of the async processes
+//!
+//!
+use crate::worker;
+use crossbeam_utils::sync::{Parker, Unparker};
+use std::cell::Cell;
+use std::future::Future;
+use std::mem;
+use std::mem::{ManuallyDrop, MaybeUninit};
+use std::pin::Pin;
+use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
+use crate::proc_stack::ProcStack;
+
+///
+/// This method blocks the current thread until passed future is resolved with an output.
+///
+/// It is called `block_on` or `blocking` in some executors.
+///
+/// `stack` is handed to `worker::set_stack` for the duration of the blocking call.
+///
+/// # Example
+/// ```rust
+/// use executor::prelude::*;
+/// use lightproc::prelude::*;
+/// let mut sum = 0;
+///
+/// run(
+///     async {
+///         (0..10_000_000).for_each(|_| {
+///             sum += 1;
+///         });
+///     },
+///     ProcStack::default(),
+/// );
+/// ```
+pub fn run<F, T>(future: F, stack: ProcStack) -> T
+where
+    F: Future<Output = T>,
+{
+    unsafe {
+        // An explicitly uninitialized `T`. Until `assume_init` is called this will not call any
+        // drop code for T
+        let mut out = MaybeUninit::<T>::uninit();
+
+        // Wrap the future into one that stores the result into `out`.
+        let future = {
+            let out = out.as_mut_ptr();
+
+            async move {
+                // `write` stores the value without reading or dropping the previous
+                // contents. A plain `*out = ...` assignment would first drop the old,
+                // still uninitialized `T`, which is undefined behaviour for types with
+                // drop glue.
+                out.write(future.await);
+            }
+        };
+
+        // Pin the future onto the stack.
+        pin_utils::pin_mut!(future);
+
+        // Extend the lifetime of the future to 'static. The future does not outlive this
+        // function: `block` below drives it to completion before we return.
+        let future = mem::transmute::<
+            Pin<&'_ mut dyn Future<Output = ()>>,
+            Pin<&'static mut dyn Future<Output = ()>>,
+        >(future);
+
+        // Block on the future and wait for it to complete.
+        worker::set_stack(&stack, || block(future));
+
+        // Assume that if the future completed and didn't panic it fully initialized its output
+        out.assume_init()
+    }
+}
+
+/// Drives `f` to completion on the calling thread, parking between polls until the
+/// future's waker unparks the thread.
+fn block<F, T>(f: F) -> T
+where
+    F: Future<Output = T>,
+{
+    thread_local! {
+        // Holds a parker left behind by a previous call so it can be reused.
+        //
+        // Every invocation of `block` must own a distinct parker, including invocations
+        // nested inside one another, which is why the slot is emptied while in use.
+        static CACHE: Cell<Option<Parker>> = Cell::new(None);
+    }
+
+    pin_utils::pin_mut!(f);
+
+    CACHE.with(|slot| {
+        // Take the cached parker if there is one, otherwise make a fresh one.
+        let parker: Parker = match slot.take() {
+            Some(cached) => cached,
+            None => Parker::new(),
+        };
+
+        let raw = Unparker::into_raw(parker.unparker().clone());
+        let table = vtable();
+
+        // The waker owns the cloned `Unparker` behind `raw`; it is released through the
+        // vtable when the waker is dropped. The `Parker` keeps its own `Unparker`
+        // reference alive for as long as the parker exists.
+        let waker = unsafe { Waker::from_raw(RawWaker::new(raw, table)) };
+        let mut cx = Context::from_waker(&waker);
+
+        loop {
+            match f.as_mut().poll(&mut cx) {
+                Poll::Ready(output) => {
+                    // Hand the parker back for the next invocation of `block`.
+                    slot.set(Some(parker));
+                    return output;
+                }
+                Poll::Pending => parker.park(),
+            }
+        }
+    })
+}
+
+/// Returns the `RawWakerVTable` used by `block`: its four entries treat the waker's data
+/// pointer as a raw `Unparker` obtained from `Unparker::into_raw`.
+fn vtable() -> &'static RawWakerVTable {
+    /// This function will be called when the RawWaker gets cloned, e.g. when the Waker in which
+    /// the RawWaker is stored gets cloned.
+    ///
+    /// The implementation of this function must retain all resources that are required for this
+    /// additional instance of a RawWaker and associated task. Calling wake on the resulting
+    /// RawWaker should result in a wakeup of the same task that would have been awoken by the
+    /// original RawWaker.
+    unsafe fn clone_raw(ptr: *const ()) -> RawWaker {
+        // [`Unparker`] implements `Clone` and upholds the contract stated above. The current
+        // Implementation is simply an Arc over the actual inner values.
+        // NOTE(review): the temporary `Unparker` rebuilt from `ptr` is dropped at the end
+        // of this statement — confirm this does not release the caller's reference.
+        let unparker = Unparker::from_raw(ptr).clone();
+        RawWaker::new(Unparker::into_raw(unparker), vtable())
+    }
+
+    /// This function will be called when wake is called on the Waker. It must wake up the task
+    /// associated with this RawWaker.
+    ///
+    /// The implementation of this function must make sure to release any resources that are
+    /// associated with this instance of a RawWaker and associated task.
+    unsafe fn wake_raw(ptr: *const ()) {
+        // We reconstruct the Unparker from the pointer here thus ensuring it is dropped at the
+        // end of this function call.
+        Unparker::from_raw(ptr).unpark();
+    }
+
+    /// This function will be called when wake_by_ref is called on the Waker. It must wake up the
+    /// task associated with this RawWaker.
+    ///
+    /// This function is similar to wake, but must not consume the provided data pointer.
+    unsafe fn wake_by_ref_raw(ptr: *const ()) {
+        // We **must not** drop the resulting Unparker so we wrap it in `ManuallyDrop`.
+        let unparker = ManuallyDrop::new(Unparker::from_raw(ptr));
+        unparker.unpark();
+    }
+
+    /// This function gets called when a RawWaker gets dropped.
+    ///
+    /// The implementation of this function must make sure to release any resources that are
+    /// associated with this instance of a RawWaker and associated task.
+    unsafe fn drop_raw(ptr: *const ()) {
+        drop(Unparker::from_raw(ptr))
+    }
+
+    &RawWakerVTable::new(clone_raw, wake_raw, wake_by_ref_raw, drop_raw)
+}
--- a/runtime/executor/src/sleepers.rs
+++ b/runtime/executor/src/sleepers.rs
@ -0,0 +1,67 @@
+//!
+//! Where workers go to park while there is no workload in their worker queue.
+//!
+//! When a workload arrives, the pool wakes them up.
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Condvar, Mutex};
+
+/// The place where worker threads go to sleep.
+///
+/// Similar to how thread parking works, if a notification comes up while no threads are sleeping,
+/// the next thread that attempts to go to sleep will pick up the notification immediately.
+#[derive(Debug)]
+#[allow(clippy::mutex_atomic)]
+pub struct Sleepers {
+    /// How many threads are currently asleep.
+    sleep: Mutex<usize>,
+
+    /// A condvar for notifying sleeping threads.
+    wake: Condvar,
+
+    /// Set to `true` if a notification came up while nobody was sleeping.
+    notified: AtomicBool,
+}
+
+#[allow(clippy::mutex_atomic)]
+impl Default for Sleepers {
+    /// Builds a `Sleepers` with no sleeping threads and no pending notification.
+    fn default() -> Self {
+        let sleep = Mutex::new(0);
+        let wake = Condvar::new();
+        let notified = AtomicBool::new(false);
+
+        Self {
+            sleep,
+            wake,
+            notified,
+        }
+    }
+}
+
+#[allow(clippy::mutex_atomic)]
+impl Sleepers {
+    /// Creates a new `Sleepers`.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Puts the current thread to sleep.
+    ///
+    /// If a notification is already pending it is consumed and the call returns at once;
+    /// otherwise the sleeper count is incremented and the thread waits on the condvar.
+    pub fn wait(&self) {
+        let mut sleep = self.sleep.lock().unwrap();
+
+        // `swap` reads and clears the pending notification in one step.
+        if !self.notified.swap(false, Ordering::SeqCst) {
+            *sleep += 1;
+            // NOTE(review): the wait is not wrapped in a loop, so a spurious wakeup
+            // returns without `notify_one` having decremented the count — confirm
+            // callers tolerate that.
+            std::mem::drop(self.wake.wait(sleep).unwrap());
+        }
+    }
+
+    /// Notifies one thread.
+    ///
+    /// Does nothing if a notification is already pending. Otherwise it wakes one sleeper
+    /// if there is any, or records the notification for the next thread calling `wait`.
+    pub fn notify_one(&self) {
+        if !self.notified.load(Ordering::SeqCst) {
+            let mut sleep = self.sleep.lock().unwrap();
+
+            if *sleep > 0 {
+                // The notifier, not the woken thread, accounts for the wakeup.
+                *sleep -= 1;
+                self.wake.notify_one();
+            } else {
+                self.notified.store(true, Ordering::SeqCst);
+            }
+        }
+    }
+}
--- a/runtime/executor/src/thread_manager.rs
+++ b/runtime/executor/src/thread_manager.rs
@ -0,0 +1,404 @@
+//! A thread manager to predict how many threads should be spawned to handle the upcoming load.
+//!
+//! The thread manager consists of three elements:
+//! * Frequency Detector
+//! * Trend Estimator
+//! * Predictive Upscaler
+//!
+//! ## Frequency Detector
+//! Detects how many tasks are submitted from scheduler to thread pool in a given time frame.
+//! Pool manager thread does this sampling every 90 milliseconds.
+//! This value is going to be used for trend estimation phase.
+//!
+//! ## Trend Estimator
+//! Hold up to the given number of frequencies to create an estimation.
+//! Trend estimator holds 10 frequencies at a time.
+//! This value is stored as constant in [FREQUENCY_QUEUE_SIZE](constant.FREQUENCY_QUEUE_SIZE.html).
+//! Estimation algorithm and prediction uses Exponentially Weighted Moving Average algorithm.
+//!
+//! This algorithm is adapted from [A Novel Predictive and Self–Adaptive Dynamic Thread Pool Management](https://doi.org/10.1109/ISPA.2011.61)
+//! and altered to:
+//! * instead of a heavy trend calculation, utilize thread redundancy, which is the sum of the differences between the predicted and observed values.
+//! * instead of linear trend estimation, use exponential trend estimation, where the formula is:
+//! ```text
+//! DEFAULT_LOW_WATERMARK * (predicted - observed) + DEFAULT_LOW_WATERMARK
+//! ```
+//! *NOTE:* To tweak this algorithm, increase [DEFAULT_LOW_WATERMARK](constant.DEFAULT_LOW_WATERMARK.html); the number of additionally spawned dynamic threads adapts automatically.
+//! * operate without watermarking by timestamps (in paper which is used to measure algorithms own performance during the execution)
+//! * operate without extensive subsampling. Extensive subsampling congests the pool manager thread.
+//! * operate without keeping track of idle time of threads or job out queue like TEMA and FOPS implementations.
+//!
+//! ## Predictive Upscaler
+//! Upscaler has three cases (also can be seen in paper):
+//! * The rate slightly increases and there are many idle threads.
+//! * The number of worker threads tends to be reduced since the workload of the system is descending.
+//! * The system has no request or stalled. (Our case here is when the current tasks block further tasks from being processed – throughput hogs)
+//!
+//! For the first two EMA calculation and exponential trend estimation gives good performance.
+//! For the last case, upscaler selects upscaling amount by amount of tasks mapped when throughput hogs happen.
+//!
+//! **example scenario:** Let's say we have 10_000 tasks where every one of them blocks for 1 second. The scheduler will map plenty of tasks, but they will get rejected.
+//! This makes the estimation calculation nearly 0 for both the entering and exiting parts. When this happens and we still see tasks mapped from the scheduler,
+//! we start to slowly increase the number of threads linearly by the frequency amount. A high increase of this value either makes us hit the thread threshold on
+//! some OSes or causes congestion for the other threads of the program because of context switches.
+//!
+//! Throughput hogs are determined by a combination of job in / job out frequency and current scheduler task assignment frequency.
+//! The threshold for the EMA difference is machine epsilon, to absorb floating point arithmetic errors.
+
+use crate::{load_balancer, placement};
+use core::fmt;
+use crossbeam_queue::ArrayQueue;
+use fmt::{Debug, Formatter};
+use lazy_static::lazy_static;
+use lever::prelude::TTas;
+use placement::CoreId;
+use std::collections::{HashMap, VecDeque};
+use std::time::Duration;
+use std::{
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Mutex,
+    },
+    thread::{self, Thread},
+};
+use std::any::Any;
+use std::panic::resume_unwind;
+use std::thread::{JoinHandle, ThreadId};
+use crossbeam_deque::Worker;
+use crossbeam_utils::sync::{Parker, Unparker};
+use tracing::{debug, trace};
+use lightproc::lightproc::LightProc;
+
+/// The default thread park timeout before checking for new tasks.
+/// Passed to `DynamicRunner::run_static` for every static thread.
+const THREAD_PARK_TIMEOUT: Duration = Duration::from_millis(1);
+
+/// Frequency histogram's sliding window size.
+/// Defines how many frequencies will be considered for adaptation.
+const FREQUENCY_QUEUE_SIZE: usize = 10;
+
+/// If low watermark isn't configured this is the default scaler value.
+/// This value is used for the heuristics of the scaler:
+/// it is both the multiplier of the EMA difference and the minimum number of threads provisioned.
+const DEFAULT_LOW_WATERMARK: u64 = 2;
+
+/// Pool scaler interval time (milliseconds).
+/// This is the actual interval which makes adaptation calculation.
+const SCALER_POLL_INTERVAL: u64 = 90;
+
+/// Exponential moving average smoothing coefficient for limited window.
+/// Smoothing factor is estimated with: 2 / (N + 1) where N is sample size.
+const EMA_COEFFICIENT: f64 = 2_f64 / (FREQUENCY_QUEUE_SIZE as f64 + 1_f64);
+
+// Core the next worker thread gets pinned to; `affinity_pinner` advances it round-robin
+// over the core count.
+lazy_static! {
+    static ref ROUND_ROBIN_PIN: Mutex<CoreId> = Mutex::new(CoreId { id: 0 });
+}
+
+/// The `DynamicRunner` is piloted by `DynamicPoolManager`.
+/// Upon request it needs to be able to provide runner routines for:
+/// * Static threads.
+/// * Dynamic threads.
+/// * Standalone threads.
+///
+/// Your implementation of `DynamicRunner`
+/// will allow you to define what tasks must be accomplished.
+///
+/// Run static threads:
+///
+/// run_static should never return, and park for park_timeout instead.
+///
+/// Run dynamic threads:
+/// run_dynamic should never return, and call `parker()` when it has no more tasks to process.
+/// It will be unparked automatically by the `DynamicPoolManager` if needs be.
+///
+/// Run standalone threads:
+/// run_standalone should return once it has no more tasks to process.
+/// The `DynamicPoolManager` will spawn other standalone threads if needs be.
+pub trait DynamicRunner {
+    /// Runs a static worker; never returns.
+    ///
+    /// The default implementation delegates to [`run_dynamic`](Self::run_dynamic) with a
+    /// parker closure that blocks for at most `park_timeout`.
+    fn run_static(&self, park_timeout: Duration) -> ! {
+        let parker = Parker::new();
+        self.run_dynamic(|| parker.park_timeout(park_timeout));
+    }
+    /// Runs a dynamic worker; never returns.
+    ///
+    /// `parker` is the closure to call when the worker has no more tasks to process.
+    fn run_dynamic(&self, parker: impl Fn()) -> !;
+    /// Runs a standalone worker; returns once it has no more tasks to process.
+    fn run_standalone(&self);
+}
+
+/// The `DynamicPoolManager` is responsible for
+/// growing and shrinking a pool according to EMA rules.
+///
+/// It needs to be passed a structure that implements `DynamicRunner`,
+/// that will be responsible for actually running the threads' work.
+///
+/// The `DynamicPoolManager` keeps track of the
+/// number of threads required to process the load correctly,
+/// and depending on the current state it will:
+/// - Spawn a lot of threads (we're predicting a load spike, and we need to prepare for it)
+/// - Spawn few threads (there's a constant load, and throughput is low because the current resources are busy)
+/// - Do nothing (the load is shrinking, threads will automatically stop once they're done).
+///
+/// Kinds of threads:
+///
+/// ## Static threads:
+/// Defined in the constructor, they will always be available. They park for `THREAD_PARK_TIMEOUT` on idle.
+///
+/// ## Dynamic threads:
+/// Created during `DynamicPoolManager` initialization, they will park on idle.
+/// The `DynamicPoolManager` grows the number of Dynamic threads
+/// so the total number of Static threads + Dynamic threads
+/// is the number of available cores on the machine. (`num_cpus::get()`)
+///
+/// ## Standalone threads:
+/// They are created when there aren't enough static and dynamic threads to process the expected load.
+/// They will be destroyed on idle.
+///
+/// ## Spawn order:
+/// In order to handle a growing load, the pool manager will ask to:
+/// - Use Static threads
+/// - Unpark Dynamic threads
+/// - Spawn Standalone threads
+///
+/// The pool manager is not responsible for the tasks to be performed by the threads, it's handled by the `DynamicRunner`
+///
+/// If you use tracing, you can have a look at the trace! logs generated by the structure.
+///
+pub struct DynamicPoolManager<Runner> {
+    /// Number of static threads spawned by `initialize`.
+    static_threads: usize,
+    /// Number of dynamic threads spawned by `initialize`; also the capacity of `parked_threads`.
+    dynamic_threads: usize,
+    /// Unparkers of the threads that registered themselves through `park_thread`.
+    parked_threads: ArrayQueue<Unparker>,
+    /// Provides the routines the spawned threads run.
+    runner: Runner,
+    /// Counter bumped by `increment_frequency` and reset to 0 by every `scale_pool` run.
+    last_frequency: AtomicU64,
+    /// Sliding window of sampled frequencies, oldest at the front.
+    frequencies: TTas<VecDeque<u64>>,
+}
+
+impl<Runner: Debug> Debug for DynamicPoolManager<Runner> {
+    /// Formats the manager for diagnostics.
+    ///
+    /// The parked-thread queue is reported by its length only, and the frequency window is
+    /// shown through `try_lock`, so formatting never blocks on that lock.
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("DynamicPoolManager");
+        out.field("static_threads", &self.static_threads);
+        out.field("dynamic_threads", &self.dynamic_threads);
+        out.field("parked_threads", &self.parked_threads.len());
+        out.field("runner", &self.runner);
+        out.field("last_frequency", &self.last_frequency);
+        out.field("frequencies", &self.frequencies.try_lock());
+        out.finish()
+    }
+}
+
+impl<Runner: DynamicRunner + Sync + Send> DynamicPoolManager<Runner> {
+    /// Creates a pool manager with `static_threads` static threads driven by `runner`.
+    ///
+    /// The number of dynamic threads is the number of CPUs left over after the static
+    /// threads, but never less than one. No thread is spawned here; see `initialize`.
+    pub fn new(static_threads: usize, runner: Runner) -> Self {
+        let spare_cores = num_cpus::get().saturating_sub(static_threads);
+        let dynamic_threads = spare_cores.max(1);
+
+        // One extra slot: `scale_pool` pushes the new sample before dropping the oldest one.
+        let window_capacity = FREQUENCY_QUEUE_SIZE.saturating_add(1);
+
+        Self {
+            static_threads,
+            dynamic_threads,
+            parked_threads: ArrayQueue::new(dynamic_threads),
+            runner,
+            last_frequency: AtomicU64::new(0),
+            frequencies: TTas::new(VecDeque::with_capacity(window_capacity)),
+        }
+    }
+
+    /// Counts one more task for the current sampling window.
+    ///
+    /// The counter is read and reset to 0 by `scale_pool`.
+    pub fn increment_frequency(&self) {
+        self.last_frequency.fetch_add(1, Ordering::Acquire);
+    }
+
+    /// Spawns every thread of the pool: the static workers, the dynamic workers and the
+    /// pool manager thread that rescales the pool every `SCALER_POLL_INTERVAL` milliseconds.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any of the threads cannot be spawned.
+    pub fn initialize(&'static self) {
+        // Static workers: always available, they park with a timeout when idle.
+        trace!("spooling up {} static worker threads", self.static_threads);
+        for index in 0..self.static_threads {
+            let runner = &self.runner;
+            let builder = thread::Builder::new().name(format!("static #{}", index));
+            builder
+                .spawn(move || {
+                    Self::affinity_pinner();
+                    runner.run_static(THREAD_PARK_TIMEOUT);
+                })
+                .expect("failed to spawn static worker thread");
+        }
+
+        // Dynamic workers: they register their unparker when idle so the manager can wake them.
+        trace!("spooling up {} dynamic worker threads", self.dynamic_threads);
+        for index in 0..self.dynamic_threads {
+            let runner = &self.runner;
+            let builder = thread::Builder::new().name(format!("dynamic #{}", index));
+            builder
+                .spawn(move || {
+                    Self::affinity_pinner();
+                    let parker = Parker::new();
+                    let unparker = parker.unparker();
+                    runner.run_dynamic(|| self.park_thread(&parker, unparker));
+                })
+                .expect("failed to spawn dynamic worker thread");
+        }
+
+        // Pool manager: samples the task frequency and scales the pool accordingly.
+        let manager = thread::Builder::new().name("pool manager".to_string());
+        manager
+            .spawn(move || {
+                let poll_interval = Duration::from_millis(SCALER_POLL_INTERVAL);
+                trace!("setting up the pool manager");
+                loop {
+                    self.scale_pool();
+                    thread::park_timeout(poll_interval);
+                }
+            })
+            .expect("failed to spawn pool manager thread");
+    }
+
+    /// Makes `n` threads available.
+    ///
+    /// Parked dynamic threads are unparked first; as soon as none is left, the remaining
+    /// amount is covered by spawning standalone threads.
+    pub fn provision_threads(&'static self, n: usize) {
+        let mut unparked = 0;
+        while unparked < n {
+            if self.unpark_thread() {
+                unparked += 1;
+                continue;
+            }
+
+            let new_threads = n - unparked;
+            trace!(
+                "no more threads to unpark, spawning {} new threads",
+                new_threads
+            );
+            self.spawn_threads(new_threads);
+            return;
+        }
+    }
+
+    /// Spawns `n` standalone worker threads.
+    ///
+    /// Each thread pins itself through `affinity_pinner` and then runs
+    /// `DynamicRunner::run_standalone`; the join handles are dropped, so the threads are detached.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a thread cannot be spawned.
+    fn spawn_threads(&'static self, n: usize) {
+        for _ in 0..n {
+            let runner = &self.runner;
+            thread::Builder::new()
+                .name("standalone worker".to_string())
+                .spawn(move || {
+                    Self::affinity_pinner();
+                    runner.run_standalone();
+                })
+                // Same failure reporting style as the static / dynamic / manager spawns.
+                .expect("failed to spawn standalone worker thread");
+        }
+    }
+
+    /// Registers `unparker` in the parked-thread queue and then blocks on `parker`
+    /// until `unpark_thread` pops that unparker and unparks it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the parked-thread queue is already full.
+    pub fn park_thread(&self, parker: &Parker, unparker: &Unparker) {
+        // Unparker is an Arc internally so this is (comparatively) cheap to do.
+        let handle = unparker.clone();
+        if let Err(rejected) = self.parked_threads.push(handle) {
+            panic!("Failed to park with {:?}", rejected);
+        }
+
+        trace!("parking thread {:?}", std::thread::current().id());
+        parker.park();
+    }
+
+    /// Takes one unparker out of the parked_threads queue and unparks its thread.
+    ///
+    /// Returns `true` when a thread was unparked, `false` when the queue was empty.
+    fn unpark_thread(&self) -> bool {
+        trace!("parked_threads: len is {}", self.parked_threads.len());
+        match self.parked_threads.pop() {
+            Some(unparker) => {
+                debug!("Unparking thread with {:?}", &unparker);
+                unparker.unpark();
+                true
+            }
+            None => false,
+        }
+    }
+
+    /// Pins the calling thread to the next core in round-robin order.
+    ///
+    /// Nothing is pinned when the load balancer reports a single core.
+    #[inline]
+    fn affinity_pinner() {
+        if *load_balancer::core_count() == 1 {
+            return;
+        }
+
+        let mut next_core = ROUND_ROBIN_PIN.lock().unwrap();
+        placement::set_for_current(*next_core);
+        // Advance the shared cursor, wrapping around at the core count.
+        next_core.id = (next_core.id + 1) % *load_balancer::core_count();
+    }
+
+    /// Exponentially Weighted Moving Average calculation
+    ///
+    /// This allows us to find the EMA value.
+    /// This value represents the trend of tasks mapped onto the thread pool.
+    /// Calculation is following:
+    /// ```text
+    /// +--------+-----------------+----------------------------------+
+    /// | Symbol |   Identifier    |           Explanation            |
+    /// +--------+-----------------+----------------------------------+
+    /// | α      | EMA_COEFFICIENT | smoothing factor between 0 and 1 |
+    /// | Yt     | freq            | frequency sample at time t       |
+    /// | St     | acc             | running weighted sum up to t     |
+    /// +--------+-----------------+----------------------------------+
+    /// ```
+    /// Under these definitions formula is following:
+    /// ```text
+    /// EMA = α * [ Yt + (1 - α)*Yt-1 + ((1 - α)^2)*Yt-2 + ((1 - α)^3)*Yt-3 ... ]
+    /// ```
+    /// where `Yt` is the most recent sample, i.e. the back of the queue.
+    /// An empty queue yields `0.0`.
+    ///
+    /// # Arguments
+    ///
+    /// * `freq_queue` - Sliding window of frequency samples, oldest at the front
+    #[inline]
+    fn calculate_ema(freq_queue: &VecDeque<u64>) -> f64 {
+        let decay = 1_f64 - EMA_COEFFICIENT;
+
+        // New samples are pushed at the back, so walk the window newest-first: the latest
+        // sample has age 0 (weight 1) and every older one is damped by another `decay` factor.
+        let weighted_sum = freq_queue
+            .iter()
+            .rev()
+            .enumerate()
+            .fold(0_f64, |acc, (age, freq)| {
+                acc + (*freq as f64) * decay.powf(age as f64)
+            });
+
+        weighted_sum * EMA_COEFFICIENT
+    }
+
+    /// Adaptive pool scaling step, run periodically by the pool manager thread.
+    ///
+    /// Samples the task counter, updates the sliding window of frequencies and compares the
+    /// EMA of the window before and after adding the new sample:
+    /// * EMA grew: provision threads proportionally to the growth (capped at the CPU count).
+    /// * EMA unchanged while tasks were still submitted: provision `DEFAULT_LOW_WATERMARK` threads.
+    /// * otherwise: do nothing.
+    fn scale_pool(&'static self) {
+        // Read and reset the task counter in one step; the order of operations matters here.
+        let current_frequency = self.last_frequency.swap(0, Ordering::SeqCst);
+        let mut window = self.frequencies.lock();
+
+        // Seed the window so the very first comparison has a baseline sample.
+        if window.is_empty() {
+            window.push_back(0);
+        }
+
+        // Message rate over the polling interval.
+        let frequency = (current_frequency as f64 / SCALER_POLL_INTERVAL as f64) as u64;
+
+        // EMA of the window as it was before this sample.
+        let prev_ema_frequency = Self::calculate_ema(&window);
+
+        // Record the sample and keep the window at its fixed size.
+        window.push_back(frequency);
+        if window.len() == FREQUENCY_QUEUE_SIZE.saturating_add(1) {
+            window.pop_front();
+        }
+
+        // EMA of the window including the sample just added.
+        let curr_ema_frequency = Self::calculate_ema(&window);
+        let ema_delta = curr_ema_frequency - prev_ema_frequency;
+
+        if curr_ema_frequency > prev_ema_frequency {
+            // The delta can be seen as "how much load is coming";
+            // the scale is "how many threads we should make available".
+            let wanted = (DEFAULT_LOW_WATERMARK as f64 * ema_delta) + DEFAULT_LOW_WATERMARK as f64;
+            let scale = num_cpus::get().min(wanted as usize);
+            trace!("unparking {} threads", scale);
+
+            self.provision_threads(scale);
+            return;
+        }
+
+        // Flat trend while tasks keep arriving: the pool is congested by longhauling tasks.
+        // Add a few threads to unblock the flow, but not enough to stagger the program.
+        let congested = ema_delta.abs() < f64::EPSILON && current_frequency != 0;
+        if congested {
+            trace!("unparking {} threads", DEFAULT_LOW_WATERMARK);
+            self.provision_threads(DEFAULT_LOW_WATERMARK as usize);
+        }
+    }
+}
--- a/runtime/executor/src/worker.rs
+++ b/runtime/executor/src/worker.rs
@ -0,0 +1,93 @@
+//!
+//! SMP parallelism based cache affine worker implementation
+//!
+//! This worker implementation relies on worker run queue statistics which are held in the pinned global memory
+//! where workload distribution is calculated and amended to their own local queues.
+
+use crate::pool;
+
+use lightproc::prelude::*;
+use std::cell::Cell;
+use std::ptr;
+use std::time::Duration;
+use crossbeam_deque::{Stealer, Worker};
+use crate::proc_stack::ProcStack;
+
+/// The timeout we'll use when parking before another steal attempt
+pub const THREAD_PARK_TIMEOUT: Duration = Duration::from_millis(1);
+
+// Per-thread pointer to the `ProcStack` installed by `set_stack`; null when none is installed.
+thread_local! {
+    static STACK: Cell<*const ProcStack> = Cell::new(ptr::null_mut());
+}
+
+///
+/// Set the current proc's stack for the duration of `f`.
+///
+/// The thread-local `STACK` pointer is set to `stack` while `f` runs and is put back to the
+/// value it had before the call once `f` returns or panics. Restoring the previous value
+/// (instead of unconditionally resetting to null) keeps nested calls correct: the outer
+/// stack is visible again after the inner call finishes. For a non-nested call the previous
+/// value is null, so the pointer ends up null as before.
+///
+/// Returns whatever `f` returns.
+pub(crate) fn set_stack<F, R>(stack: *const ProcStack, f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    /// Puts the saved pointer back into the thread-local cell on drop.
+    struct RestoreStack<'a> {
+        cell: &'a Cell<*const ProcStack>,
+        previous: *const ProcStack,
+    }
+
+    impl Drop for RestoreStack<'_> {
+        fn drop(&mut self) {
+            self.cell.set(self.previous);
+        }
+    }
+
+    STACK.with(|st| {
+        let previous = st.replace(stack);
+        // create a guard to restore STACK even if the future panics. This is important since we
+        // must not drop the pointed-to ProcStack here in any case.
+        let _guard = RestoreStack { cell: st, previous };
+
+        f()
+    })
+}
+
+/*
+pub(crate) fn get_proc_stack<F, R>(f: F) -> Option<R>
+where
+    F: FnOnce(&ProcStack) -> R,
+{
+    let res = STACK.try_with(|st| unsafe { st.get().as_ref().map(f) });
+
+    match res {
+        Ok(Some(val)) => Some(val),
+        Ok(None) | Err(_) => None,
+    }
+}
+
+///
+/// Get the stack currently in use for this thread
+pub fn current() -> ProcStack {
+    get_proc_stack(|proc| proc.clone())
+        .expect("`proc::current()` called outside the context of the proc")
+}
+ */
+
+/// Hands `proc` over to the pool by forwarding it to `pool::schedule`.
+pub(crate) fn schedule(proc: LightProc) {
+    pool::schedule(proc)
+}
+
+/// A worker thread running futures locally and stealing work from other workers if it runs empty.
+pub struct WorkerThread {
+    /// Local FIFO run queue of this worker.
+    queue: Worker<LightProc>,
+}
+
+impl WorkerThread {
+    /// Creates a worker with an empty FIFO run queue.
+    pub fn new() -> Self {
+        Self {
+            queue: Worker::new_fifo(),
+        }
+    }
+
+    /// Returns a stealer handle for this worker's run queue.
+    pub fn stealer(&self) -> Stealer<LightProc> {
+        self.queue.stealer()
+    }
+
+    /// Pops one proc from the local queue and runs it; does nothing if the queue is empty.
+    pub fn tick(&self) {
+        if let Some(lightproc) = self.queue.pop() {
+            lightproc.run();
+        }
+    }
+}
+
+impl Default for WorkerThread {
+    /// Equivalent to [`WorkerThread::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
--- a/runtime/executor/tests/lib.rs
+++ b/runtime/executor/tests/lib.rs
@ -0,0 +1,26 @@
+// Smoke tests: core id discovery and pool access.
+#[cfg(test)]
+mod tests {
+    // NOTE(review): the Cargo.toml added in this same commit names the package `executor`,
+    // while this imports `bastion_executor` — confirm the crate name this test should use.
+    use bastion_executor::{placement, pool};
+
+    // Only checks that the core ids can be fetched; the values are printed, not asserted.
+    #[test]
+    fn affinity_replacement() {
+        let core_ids = placement::get_core_ids().unwrap();
+        dbg!(core_ids);
+    }
+
+    // Variant run inside a tokio runtime when the `tokio-runtime` feature is enabled.
+    #[cfg(feature = "tokio-runtime")]
+    mod tokio_tests {
+        #[tokio::test]
+        async fn pool_check() {
+            super::pool::get();
+        }
+    }
+
+    // Variant run as a plain test when the `tokio-runtime` feature is disabled.
+    #[cfg(not(feature = "tokio-runtime"))]
+    mod no_tokio_tests {
+        #[test]
+        fn pool_check() {
+            super::pool::get();
+        }
+    }
+}
--- a/runtime/executor/tests/run_blocking.rs
+++ b/runtime/executor/tests/run_blocking.rs
@ -0,0 +1,38 @@
+use bastion_executor::blocking;
+use bastion_executor::run::run;
+use lightproc::proc_stack::ProcStack;
+use std::thread;
+use std::time::Duration;
+
+// Runs the shared `run_test` body inside a tokio runtime when the `tokio-runtime` feature is enabled.
+#[cfg(feature = "tokio-runtime")]
+mod tokio_tests {
+    #[tokio::test]
+    async fn test_run_blocking() {
+        super::run_test()
+    }
+}
+
+// Runs the shared `run_test` body as a plain test when the `tokio-runtime` feature is disabled.
+#[cfg(not(feature = "tokio-runtime"))]
+mod no_tokio_tests {
+    #[test]
+    fn test_run_blocking() {
+        super::run_test()
+    }
+}
+
+// Spawns one short blocking task, drives it to completion with `run` and checks its output.
+fn run_test() {
+    let blocking_task = blocking::spawn_blocking(
+        async {
+            thread::sleep(Duration::from_millis(1));
+            42
+        },
+        ProcStack::default(),
+    );
+
+    let output = run(blocking_task, ProcStack::default()).unwrap();
+
+    assert_eq!(42, output);
+}
--- a/runtime/executor/tests/thread_pool.rs
+++ b/runtime/executor/tests/thread_pool.rs
@ -0,0 +1,155 @@
+use bastion_executor::blocking;
+use bastion_executor::run::run;
+use futures::future::join_all;
+use lightproc::proc_stack::ProcStack;
+use lightproc::recoverable_handle::RecoverableHandle;
+use std::thread;
+use std::time::Duration;
+use std::time::Instant;
+
+// Test for slow joins without task bursts during joins.
+//
+// Ignored by default: it spawns a million blocking tasks and sleeps for several seconds.
+#[test]
+#[ignore]
+fn slow_join() {
+    let thread_join_time_max = 11_000;
+    let start = Instant::now();
+
+    // Send an initial burst of a million tasks.
+    let handles = (0..1_000_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(1);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    run(join_all(handles), ProcStack::default());
+
+    // Let them join to see how it behaves under different workloads.
+    let duration = Duration::from_millis(thread_join_time_max);
+    thread::sleep(duration);
+
+    // Spawn yet another batch of work on top of it
+    let handles = (0..10_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(100);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    run(join_all(handles), ProcStack::default());
+
+    // Slow joins shouldn't cause internal slow down.
+    // The deliberate sleep is subtracted; the value is in milliseconds (`as_millis`).
+    let elapsed = start.elapsed().as_millis() - thread_join_time_max as u128;
+    println!("Slow task join. Monotonic exec time: {:?} ms", elapsed);
+
+    // Previous implementation is around this threshold.
+}
+
+// Test for slow joins with task burst.
+//
+// Ignored by default: it spawns a million blocking tasks and sleeps for several seconds.
+#[test]
+#[ignore]
+fn slow_join_interrupted() {
+    let thread_join_time_max = 2_000;
+    let start = Instant::now();
+
+    // Send an initial burst of a million tasks.
+    let handles = (0..1_000_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(1);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    run(join_all(handles), ProcStack::default());
+
+    // Let them join to see how it behaves under different workloads.
+    // This time join under the time window.
+    let duration = Duration::from_millis(thread_join_time_max);
+    thread::sleep(duration);
+
+    // Spawn yet another batch of work on top of it
+    let handles = (0..10_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(100);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    run(join_all(handles), ProcStack::default());
+
+    // Slow joins shouldn't cause internal slow down.
+    // The deliberate sleep is subtracted; the value is in milliseconds (`as_millis`).
+    let elapsed = start.elapsed().as_millis() - thread_join_time_max as u128;
+    println!("Slow task join. Monotonic exec time: {:?} ms", elapsed);
+
+    // Previous implementation is around this threshold.
+}
+
+// This test is expensive but it proves that longhauling tasks are working in adaptive thread pool.
+// Thread pool which spawns on-demand will panic with this test.
+#[test]
+#[ignore]
+fn longhauling_task_join() {
+    let thread_join_time_max = 11_000;
+    let start = Instant::now();
+
+    // First batch of longhauling tasks; their handles are not joined.
+    let _ = (0..100_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(1000);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    // Let them join to see how it behaves under different workloads.
+    let duration = Duration::from_millis(thread_join_time_max);
+    thread::sleep(duration);
+
+    // Send yet another medium sized batch to see how it scales.
+    let handles = (0..10_000)
+        .map(|_| {
+            blocking::spawn_blocking(
+                async {
+                    let duration = Duration::from_millis(100);
+                    thread::sleep(duration);
+                },
+                ProcStack::default(),
+            )
+        })
+        .collect::<Vec<RecoverableHandle<()>>>();
+
+    run(join_all(handles), ProcStack::default());
+
+    // Slow joins shouldn't cause internal slow down.
+    // The deliberate sleep is subtracted; the value is in milliseconds (`as_millis`).
+    let elapsed = start.elapsed().as_millis() - thread_join_time_max as u128;
+    println!(
+        "Long-hauling task join. Monotonic exec time: {:?} ms",
+        elapsed
+    );
+
+    // Previous implementation will panic when this test is running.
+}