// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- // Copyright (c) 2008, Google Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // --- // Author: Sanjay Ghemawat #ifndef TCMALLOC_THREAD_CACHE_H_ #define TCMALLOC_THREAD_CACHE_H_ #include #ifdef HAVE_PTHREAD #include // for pthread_t, pthread_key_t #endif #include // for size_t, NULL #ifdef HAVE_STDINT_H #include // for uint32_t, uint64_t #endif #include // for ssize_t #include "base/commandlineflags.h" #include "common.h" #include "linked_list.h" #include "maybe_threads.h" #include "page_heap_allocator.h" #include "sampler.h" #include "static_vars.h" #include "common.h" // for SizeMap, kMaxSize, etc #include "internal_logging.h" // for ASSERT, etc #include "linked_list.h" // for SLL_Pop, SLL_PopRange, etc #include "page_heap_allocator.h" // for PageHeapAllocator #include "sampler.h" // for Sampler #include "static_vars.h" // for Static DECLARE_int64(tcmalloc_sample_parameter); namespace tcmalloc { //------------------------------------------------------------------- // Data kept per thread //------------------------------------------------------------------- class ThreadCache { public: #ifdef HAVE_TLS enum { have_tls = true }; #else enum { have_tls = false }; #endif // All ThreadCache objects are kept in a linked list (for stats collection) ThreadCache* next_; ThreadCache* prev_; void Init(pthread_t tid); void Cleanup(); // Accessors (mostly just for printing stats) int freelist_length(size_t cl) const { return list_[cl].length(); } // Total byte size in cache size_t Size() const { return size_; } // Allocate an object of the given size and class. The size given // must be the same as the size of the class in the size map. void* Allocate(size_t size, size_t cl); void Deallocate(void* ptr, size_t size_class); void Scavenge(); int GetSamplePeriod(); // Record allocation of "k" bytes. Return true iff allocation // should be sampled bool SampleAllocation(size_t k); static void InitModule(); static void InitTSD(); static ThreadCache* GetThreadHeap(); static ThreadCache* GetCache(); static ThreadCache* GetCacheIfPresent(); static ThreadCache* GetCacheWhichMustBePresent(); static ThreadCache* CreateCacheIfNecessary(); static void BecomeIdle(); static void BecomeTemporarilyIdle(); static size_t MinSizeForSlowPath(); static void SetMinSizeForSlowPath(size_t size); static void SetUseEmergencyMalloc(); static void ResetUseEmergencyMalloc(); static bool IsUseEmergencyMalloc(); static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; } // Return the number of thread heaps in use. static inline int HeapsInUse(); // Adds to *total_bytes the total number of bytes used by all thread heaps. // Also, if class_count is not NULL, it must be an array of size kNumClasses, // and this function will increment each element of class_count by the number // of items in all thread-local freelists of the corresponding size class. // REQUIRES: Static::pageheap_lock is held. static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count); // Sets the total thread cache size to new_size, recomputing the // individual thread cache sizes as necessary. // REQUIRES: Static::pageheap lock is held. static void set_overall_thread_cache_size(size_t new_size); static size_t overall_thread_cache_size() { return overall_thread_cache_size_; } private: class FreeList { private: void* list_; // Linked list of nodes #ifdef _LP64 // On 64-bit hardware, manipulating 16-bit values may be slightly slow. uint32_t length_; // Current length. uint32_t lowater_; // Low water mark for list length. uint32_t max_length_; // Dynamic max list length based on usage. // Tracks the number of times a deallocation has caused // length_ > max_length_. After the kMaxOverages'th time, max_length_ // shrinks and length_overages_ is reset to zero. uint32_t length_overages_; #else // If we aren't using 64-bit pointers then pack these into less space. uint16_t length_; uint16_t lowater_; uint16_t max_length_; uint16_t length_overages_; #endif public: void Init() { list_ = NULL; length_ = 0; lowater_ = 0; max_length_ = 1; length_overages_ = 0; } // Return current length of list size_t length() const { return length_; } // Return the maximum length of the list. size_t max_length() const { return max_length_; } // Set the maximum length of the list. If 'new_max' > length(), the // client is responsible for removing objects from the list. void set_max_length(size_t new_max) { max_length_ = new_max; } // Return the number of times that length() has gone over max_length(). size_t length_overages() const { return length_overages_; } void set_length_overages(size_t new_count) { length_overages_ = new_count; } // Is list empty? bool empty() const { return list_ == NULL; } // Low-water mark management int lowwatermark() const { return lowater_; } void clear_lowwatermark() { lowater_ = length_; } void Push(void* ptr) { SLL_Push(&list_, ptr); length_++; } void* Pop() { ASSERT(list_ != NULL); length_--; if (length_ < lowater_) lowater_ = length_; return SLL_Pop(&list_); } void* Next() { return SLL_Next(&list_); } void PushRange(int N, void *start, void *end) { SLL_PushRange(&list_, start, end); length_ += N; } void PopRange(int N, void **start, void **end) { SLL_PopRange(&list_, N, start, end); ASSERT(length_ >= N); length_ -= N; if (length_ < lowater_) lowater_ = length_; } }; // Gets and returns an object from the central cache, and, if possible, // also adds some objects of that size class to this thread cache. void* FetchFromCentralCache(size_t cl, size_t byte_size); // Releases some number of items from src. Adjusts the list's max_length // to eventually converge on num_objects_to_move(cl). void ListTooLong(FreeList* src, size_t cl); // Releases N items from this thread cache. void ReleaseToCentralCache(FreeList* src, size_t cl, int N); // Increase max_size_ by reducing unclaimed_cache_space_ or by // reducing the max_size_ of some other thread. In both cases, // the delta is kStealAmount. void IncreaseCacheLimit(); // Same as above but requires Static::pageheap_lock() is held. void IncreaseCacheLimitLocked(); // If TLS is available, we also store a copy of the per-thread object // in a __thread variable since __thread variables are faster to read // than pthread_getspecific(). We still need pthread_setspecific() // because __thread variables provide no way to run cleanup code when // a thread is destroyed. // We also give a hint to the compiler to use the "initial exec" TLS // model. This is faster than the default TLS model, at the cost that // you cannot dlopen this library. (To see the difference, look at // the CPU use of __tls_get_addr with and without this attribute.) // Since we don't really use dlopen in google code -- and using dlopen // on a malloc replacement is asking for trouble in any case -- that's // a good tradeoff for us. #ifdef HAVE___ATTRIBUTE__ #define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec"))) #else #define ATTR_INITIAL_EXEC #endif #ifdef HAVE_TLS struct ThreadLocalData { ThreadCache* heap; // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise. // The latter is the common case and allows allocation to be faster // than it would be otherwise: typically a single branch will // determine that the requested allocation is no more than kMaxSize // and we can then proceed, knowing that global and thread-local tcmalloc // state is initialized. size_t min_size_for_slow_path; bool use_emergency_malloc; size_t old_min_size_for_slow_path; }; static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC; #endif // Thread-specific key. Initialization here is somewhat tricky // because some Linux startup code invokes malloc() before it // is in a good enough state to handle pthread_keycreate(). // Therefore, we use TSD keys only after tsd_inited is set to true. // Until then, we use a slow path to get the heap object. static bool tsd_inited_; static pthread_key_t heap_key_; // Linked list of heap objects. Protected by Static::pageheap_lock. static ThreadCache* thread_heaps_; static int thread_heap_count_; // A pointer to one of the objects in thread_heaps_. Represents // the next ThreadCache from which a thread over its max_size_ should // steal memory limit. Round-robin through all of the objects in // thread_heaps_. Protected by Static::pageheap_lock. static ThreadCache* next_memory_steal_; // Overall thread cache size. Protected by Static::pageheap_lock. static size_t overall_thread_cache_size_; // Global per-thread cache size. Writes are protected by // Static::pageheap_lock. Reads are done without any locking, which should be // fine as long as size_t can be written atomically and we don't place // invariants between this variable and other pieces of state. static volatile size_t per_thread_cache_size_; // Represents overall_thread_cache_size_ minus the sum of max_size_ // across all ThreadCaches. Protected by Static::pageheap_lock. static ssize_t unclaimed_cache_space_; // This class is laid out with the most frequently used fields // first so that hot elements are placed on the same cache line. size_t size_; // Combined size of data size_t max_size_; // size_ > max_size_ --> Scavenge() // We sample allocations, biased by the size of the allocation Sampler sampler_; // A sampler FreeList list_[kNumClasses]; // Array indexed by size-class pthread_t tid_; // Which thread owns it bool in_setspecific_; // In call to pthread_setspecific? // Allocate a new heap. REQUIRES: Static::pageheap_lock is held. static ThreadCache* NewHeap(pthread_t tid); // Use only as pthread thread-specific destructor function. static void DestroyThreadCache(void* ptr); static void DeleteCache(ThreadCache* heap); static void RecomputePerThreadCacheSize(); // Ensure that this class is cacheline-aligned. This is critical for // performance, as false sharing would negate many of the benefits // of a per-thread cache. } CACHELINE_ALIGNED; // Allocator for thread heaps // This is logically part of the ThreadCache class, but MSVC, at // least, does not like using ThreadCache as a template argument // before the class is fully defined. So we put it outside the class. extern PageHeapAllocator threadcache_allocator; inline int ThreadCache::HeapsInUse() { return threadcache_allocator.inuse(); } inline bool ThreadCache::SampleAllocation(size_t k) { #ifndef NO_TCMALLOC_SAMPLES return UNLIKELY(FLAGS_tcmalloc_sample_parameter > 0) && sampler_.SampleAllocation(k); #else return false; #endif } inline void* ThreadCache::Allocate(size_t size, size_t cl) { ASSERT(size <= kMaxSize); ASSERT(size == Static::sizemap()->ByteSizeForClass(cl)); FreeList* list = &list_[cl]; if (UNLIKELY(list->empty())) { return FetchFromCentralCache(cl, size); } size_ -= size; return list->Pop(); } inline void ThreadCache::Deallocate(void* ptr, size_t cl) { FreeList* list = &list_[cl]; size_ += Static::sizemap()->ByteSizeForClass(cl); ssize_t size_headroom = max_size_ - size_ - 1; // This catches back-to-back frees of allocs in the same size // class. A more comprehensive (and expensive) test would be to walk // the entire freelist. But this might be enough to find some bugs. ASSERT(ptr != list->Next()); list->Push(ptr); ssize_t list_headroom = static_cast(list->max_length()) - list->length(); // There are two relatively uncommon things that require further work. // In the common case we're done, and in that case we need a single branch // because of the bitwise-or trick that follows. if (UNLIKELY((list_headroom | size_headroom) < 0)) { if (list_headroom < 0) { ListTooLong(list, cl); } if (size_ >= max_size_) Scavenge(); } } inline ThreadCache* ThreadCache::GetThreadHeap() { #ifdef HAVE_TLS return threadlocal_data_.heap; #else return reinterpret_cast( perftools_pthread_getspecific(heap_key_)); #endif } inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() { #ifdef HAVE_TLS ASSERT(threadlocal_data_.heap); return threadlocal_data_.heap; #else ASSERT(perftools_pthread_getspecific(heap_key_)); return reinterpret_cast( perftools_pthread_getspecific(heap_key_)); #endif } inline ThreadCache* ThreadCache::GetCache() { ThreadCache* ptr = NULL; if (!tsd_inited_) { InitModule(); } else { ptr = GetThreadHeap(); } if (ptr == NULL) ptr = CreateCacheIfNecessary(); return ptr; } // In deletion paths, we do not try to create a thread-cache. This is // because we may be in the thread destruction code and may have // already cleaned up the cache for this thread. inline ThreadCache* ThreadCache::GetCacheIfPresent() { #ifndef HAVE_TLS if (!tsd_inited_) return NULL; #endif return GetThreadHeap(); } inline size_t ThreadCache::MinSizeForSlowPath() { #ifdef HAVE_TLS return threadlocal_data_.min_size_for_slow_path; #else return 0; #endif } inline void ThreadCache::SetMinSizeForSlowPath(size_t size) { #ifdef HAVE_TLS threadlocal_data_.min_size_for_slow_path = size; #endif } inline void ThreadCache::SetUseEmergencyMalloc() { #ifdef HAVE_TLS threadlocal_data_.old_min_size_for_slow_path = threadlocal_data_.min_size_for_slow_path; threadlocal_data_.min_size_for_slow_path = 0; threadlocal_data_.use_emergency_malloc = true; #endif } inline void ThreadCache::ResetUseEmergencyMalloc() { #ifdef HAVE_TLS threadlocal_data_.min_size_for_slow_path = threadlocal_data_.old_min_size_for_slow_path; threadlocal_data_.use_emergency_malloc = false; #endif } inline bool ThreadCache::IsUseEmergencyMalloc() { #if defined(HAVE_TLS) && defined(ENABLE_EMERGENCY_MALLOC) return UNLIKELY(threadlocal_data_.use_emergency_malloc); #else return false; #endif } } // namespace tcmalloc #endif // TCMALLOC_THREAD_CACHE_H_