1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
//! Thread Local Storage on x86
//!
//! # Usage
//!
//! You declare a thread-local using the [#\[thread_local\] attribute] :
//!
//! ```
//! #![feature(thread_local)]
//! #[thread_local]
//! static MY_THREAD_LOCAL: core::cell::Cell<u8> = core::cell::Cell::new(42);
//! ```
//!
//! and access it as if it was a regular static, only that each thread will have its own view of
//! the static.
//!
//! The compiler is responsible for generating code that will access the right address, provided
//! we configured TLS correctly.
//!
//! ##### Early startup
//!
//! Note that you can't access a thread-local static before [`init_main_thread`] is called, because
//! the thread-local area for the main thread isn't initialized yet, and this will likely result in
//! a page fault or UB.
//!
//! # Inner workings
//!
//! We implement the TLS according to conventions laid out by [Ulrich Drepper's paper on TLS] which
//! is followed by LLVM and most compilers.
//!
//! Since we're running on i386, we're following variant II.
//!
//! Each thread's `gs` segment points to a thread local memory area where thread-local statics live.
//! Thread-local statics are simply accessed through an offset from `gs`.
//!
//! The linker is in charge of creating an ELF segment of type `PT_TLS` where an initialization image
//! for thread-local regions can be found, and is meant to be copied for every thread we create.
//!
//! ##### on SunriseOS
//!
//! On Sunrise, the area where `gs` points to is per-thread and user-controlled; we set it at the
//! startup of every thread with the [`set_thread_area`] syscall.
//!
//! The TLS initialisation image is supposed to be retrieved from our own program headers, which is
//! a really weird design.
//! Since we don't have access to our program headers, we instead use the linker to expose the following
//! symbols:
//!
//! * [`__tls_init_image_addr__`], `p_vaddr`: the address of our TLS initialisation image.
//! * [`__tls_file_size__`], `p_filesz`: the size of our TLS initialisation image.
//! * [`__tls_mem_size__`], `p_memsz`: the total size of our TLS segment.
//! * [`__tls_align__`], `p_align`: the alignment of our TLS segment.
//!
//! Those symbols are the addresses of the initialization in our `.tdata`, so it can directly be copied.
//!
//! ##### dtv and `__tls_get_addr`
//!
//! Since we don't do dynamic loading (yet ?), we know our TLS model will be static (either
//! Initial Exec or Local Exec).
//! Those models always access thread-locals directly via `gs`, and always short-circuit the dtv.
//!
//! So we don't even bother allocating a dtv array at all. Neither do we define a `__tls_get_addr`
//! function.
//!
//! This might change in the future when we will want to support dynamic loading.
//!
//! [`init_main_thread`]: crate::threads::init_main_thread
//! [`ARE_CPU_LOCALS_INITIALIZED_YET`]: crate::cpu_locals::ARE_CPU_LOCALS_INITIALIZED_YET
//! [Ulrich Drepper's paper on TLS]: https://web.archive.org/web/20190710135250/https://akkadia.org/drepper/tls.pdf
//! [`set_thread_area`]: crate::syscalls::set_thread_area
//! [#\[thread_local\] attribute]: https://github.com/rust-lang/rust/issues/10310
//! [`__tls_init_image_addr__`]: __tls_init_image_addr__
//! [`__tls_file_size__`]: __tls_file_size__
//! [`__tls_mem_size__`]: __tls_mem_size__
//! [`__tls_align__`]: __tls_align__

use crate::syscalls;
use sunrise_libutils::div_ceil;
use alloc::alloc::{alloc_zeroed, dealloc, Layout};
use core::mem::{align_of, size_of};
use core::fmt::Debug;

extern "C" {
    /// The address of the start of the TLS initialisation image in our `.tdata`.
    ///
    /// Because we don't want to read our own `P_TLS` program header,
    /// the linker provides a symbol for the start of the init image.
    ///
    /// This is an **absolute symbol**, which means its "address" is actually its value,
    /// i.e. to get a pointer do:
    ///
    /// ```ignore
    /// let tls_init_image_addr: *const u8 = unsafe { &__tls_init_image_addr__ as *const u8 };
    /// ```
    static __tls_init_image_addr__: u8;
    /// The size of the TLS initialisation image in our `.tdata`.
    ///
    /// Because we don't want to read our own `P_TLS` program header,
    /// the linker provides a symbol for the size of the init image.
    ///
    /// This is an **absolute symbol**, which means its "address" is actually its value,
    /// i.e. to get its value do:
    ///
    /// ```ignore
    /// let tls_init_image_size: usize = unsafe { &__tls_file_size__ as *const _ as usize };
    /// ```
    static __tls_file_size__: usize;
    /// The total memsize of the TLS segment: .tdata + .tbss
    ///
    /// Because we don't want to read our own `P_TLS` program header,
    /// the linker provides a symbol for the memsize of the TLS segment.
    ///
    /// This is an **absolute symbol**, which means its "address" is actually its value,
    /// i.e. to get its value do:
    ///
    /// ```ignore
    /// let tls_block_size = unsafe { &__tls_mem_size__ as *const _ as usize };
    /// ```
    static __tls_mem_size__: usize;
    /// The alignment of the TLS segment.
    ///
    /// Because we don't want to read our own `P_TLS` program header,
    /// the linker provides a symbol for the alignment it used.
    ///
    /// This is an **absolute symbol**, which means its "address" is actually its value,
    /// i.e. to get its value do:
    ///
    /// ```ignore
    /// let tls_align = unsafe { &__tls_align__ as *const _ as usize };
    /// ```
    static __tls_align__: usize;
}

/// The Thread Local Storage manager for a thread
///
/// We allocate one for every thread we create, and store it in the thread's context.
/// When it is dropped, all allocated memory is freed.
#[derive(Debug)]
pub struct TlsElf {
    /// The array of static module blocks + TCB
    static_region: ThreadLocalStaticRegion,
    // no dtv, no dynamics regions for now
}

impl TlsElf {
    /// Allocates and initializes the static region, including TCB.
    ///
    /// Finds out the location of the initialization image from linker defined symbols.
    pub fn allocate() -> Self {
        // copy tls static area
        let init_image_addr = unsafe {
            // safe: set by linker
            &__tls_init_image_addr__ as *const u8
        };
        let file_size = unsafe {
            // safe: set by the linker
            &__tls_file_size__ as *const _ as usize
        };
        let init_image = unsafe {
            // safe: - the initialization image will never be accessed mutably,
            //       - it lives in our .data so its lifetime is &'static,
            //       - u8 is POD and always aligned,
            //       => creating a const slice is ok.
            core::slice::from_raw_parts(init_image_addr, file_size)
        };
        let mem_size = unsafe {
            // safe: set by the linker
            &__tls_mem_size__ as *const _ as usize
        };
        let align = unsafe {
            // safe: set by the linker
            &__tls_align__ as *const _ as usize
        };

        let tls_static_region = ThreadLocalStaticRegion::allocate(
            init_image,
            mem_size,
            align);

        TlsElf {
            static_region: tls_static_region
        }
    }

    /// Calls [`syscalls::set_thread_area`] with the address of this TlsElf's [`ThreadControlBlock`].
    ///
    /// # Safety
    ///
    /// The TlsElf should not be enabled_for_current_thread by any other thread.
    /// Having a TLS shared by multiple threads is UB.
    ///
    /// # Panics
    ///
    /// Panics if the syscall returned an error, as this is unrecoverable.
    pub unsafe fn enable_for_current_thread(&self) {
        unsafe {
            // safe: TlsElf is RAII so self is a valid well-formed TLS region.
            //       However, we cannot guarantee that it's not used by anybody else,
            //       so propagate this constraint.
            syscalls::set_thread_area(self.static_region.tcb() as *const _ as usize)
                .expect("Cannot set thread TLS pointer");
        }
    }
}

/// The `round` function, as defined in section 3.0:
///
/// ```text
///     round(x,y) = y * ⌈x/y⌉
/// ```
///
/// Just a poorly-named `align_up`.
fn tls_align_up(x: usize, y: usize) -> usize {
    y * div_ceil(x, y)
}

/// Elf TLS TCB
///
/// The variant II leaves the specification of the ThreadControlBlock (TCB) to the implementor,
/// with the only requirement that the first word in the TCB, pointed by `tp`, contains its own
/// address, i.e. is a pointer to itself (GNU variant).
///
/// We don't need to store anything else in the TCB, it's just the self pointer.
#[repr(C)]
#[derive(Debug)]
struct ThreadControlBlock {
    /// Pointer containing its own address.
    tp_self_ptr: *const ThreadControlBlock,
}

/// Represents an allocated thread local static region.
///
/// Because TLS regions have a really specific layout, we don't use Box and instead interact with
/// the allocator directly. This type is the equivalent of a Box, it stores the pointer to the
/// allocated memory, and deallocates it on Drop.
struct ThreadLocalStaticRegion {
    /// Pointer to the allocated memory
    ptr: usize,
    /// Layout of the allocated memory. Used when deallocating.
    layout: Layout,
    /// Offset of the TCB in this allocation.
    tcb_offset: usize,
}

impl ThreadLocalStaticRegion {
    /// Returns a pointer to the [ThreadControlBlock] in the allocated region.
    /// All TLS arithmetic are done relative to this pointer.
    ///
    /// For TLS to work, the value stored at this address should be the address itself, i.e.
    /// having a pointer pointing to itself.
    fn tcb(&self) -> &ThreadControlBlock {
        unsafe {
            // safe: - guaranteed to be aligned, and still in the allocation,
            //       - no one should ever have a mut reference to the ThreadControlBlock after its
            //         initialisation.
            &*((self.ptr + self.tcb_offset) as *const ThreadControlBlock)
        }
    }

    /// Allocates a ThreadLocalStaticRegion.
    ///
    /// The region's content is copied from the TLS initialisation image described by `block_src`,
    /// padded with 0s for `block_size`, to which is appended a [`ThreadControlBlock`].
    ///
    /// The ThreadLocalStaticRegion uses `PT_TLS`'s `p_align` field passed in `block_align`
    /// to compute its layout and total size.
    ///
    /// ### Alignment
    ///
    /// ```text
    ///
    ///         V----------------------V  tls_align_up(tls_size_1, align_1)
    ///
    ///                                +-- gs:0
    ///                                |
    ///         +----------------------|-- tlsoffset_1 = gs:0 - tls_align_up(tls_size_1, align_1)
    ///         |                      |
    ///         V                      V
    ///
    ///         j----------------~-----j---------j
    ///    ...  |    tls_size_1  | pad |   TCB   |
    ///         j----------------~-----j---------j
    ///
    ///    ^    ^                      ^
    ///    |    |                      |
    ///    |    |                      +-- TCB_align: Determines alignment of everything.
    ///    |    |                          = max(align_of::<TCB>(), align_1). e.g. : 16.
    ///    |    |
    ///    |    +------------------------- TCB_align - n * align_1
    ///    |                               => still aligned to align_1 because TCB is aligned to align_1.
    ///    |
    ///    +------------------------------ alloc_align == TCB_align
    ///                                    => &TCB = &alloc + tls_align_up(gs:0 - tls_offset_1, TCB_align)
    ///
    ///    ^---^                           alloc_pad
    ///
    /// ```
    #[allow(clippy::cast_ptr_alignment)]
    fn allocate(block_src: &[u8], block_size: usize, block_align: usize) -> Self {
        let tls_offset1 = tls_align_up(block_size, block_align);
        let tcb_align = usize::max(align_of::<ThreadControlBlock>(), block_align);
        let tcb_offset = tls_align_up(tls_offset1, tcb_align);
        let alloc_pad_size = tcb_offset - tls_offset1;
        let layout = Layout::from_size_align(
            tcb_offset + size_of::<ThreadControlBlock>(),
            tcb_align
        ).unwrap();
        let alloc = unsafe {
            // safe: layout.size >= sizeof::<TCB> -> layout.size != 0
            alloc_zeroed(layout)
        };
        assert!(!alloc.is_null(), "thread_locals: failed static area allocation");

        unsafe {
            // safe: everything is done within our allocation, u8 is always aligned.
            // copy data
            core::ptr::copy_nonoverlapping(
                block_src as *const [u8] as *const u8,
                alloc.add(alloc_pad_size),
                block_src.len()
            );
            // .tbss + pad are already set to 0 by alloc_zeroed.
            // write tcb
            core::ptr::write(
                alloc.add(tcb_offset) as *mut ThreadControlBlock,
                ThreadControlBlock {
                    tp_self_ptr: alloc.add(tcb_offset) as *const ThreadControlBlock
                }
            );
        };
        Self {
            ptr: alloc as usize,
            layout,
            tcb_offset
        }
    }
}

impl Drop for ThreadLocalStaticRegion {
    /// Dropping a ThreadLocalStaticRegion deallocates it.
    fn drop(&mut self) {
        unsafe {
            // safe: - self.ptr is obviously allocated.
            //       - self.layout is the same argument that was used for alloc.
            dealloc(self.ptr as *mut u8, self.layout)
        };
    }
}

impl Debug for ThreadLocalStaticRegion {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> {
        f.debug_struct("ThreadLocalStaticRegion")
            .field("start_address", &self.ptr)
            .field("tcb_address", &self.tcb())
            .field("total_size", &self.layout.size())
            .finish()
    }
}