// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>                    // for pthread_t, pthread_key_t
#endif
#include <stddef.h>                     // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>                     // for uint32_t, uint64_t
#endif
#include <sys/types.h>                  // for ssize_t
  43. #include "base/commandlineflags.h"
  44. #include "common.h"
  45. #include "linked_list.h"
  46. #include "maybe_threads.h"
  47. #include "page_heap_allocator.h"
  48. #include "sampler.h"
  49. #include "static_vars.h"
  50. #include "common.h" // for SizeMap, kMaxSize, etc
  51. #include "internal_logging.h" // for ASSERT, etc
  52. #include "linked_list.h" // for SLL_Pop, SLL_PopRange, etc
  53. #include "page_heap_allocator.h" // for PageHeapAllocator
  54. #include "sampler.h" // for Sampler
  55. #include "static_vars.h" // for Static

DECLARE_int64(tcmalloc_sample_parameter);

namespace tcmalloc {

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------
class ThreadCache {
 public:
#ifdef HAVE_TLS
  enum { have_tls = true };
#else
  enum { have_tls = false };
#endif

  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class. The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes. Return true iff allocation
  // should be sampled.
  bool SampleAllocation(size_t k);

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* GetCacheWhichMustBePresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();
  static void BecomeTemporarilyIdle();
  static size_t MinSizeForSlowPath();
  static void SetMinSizeForSlowPath(size_t size);
  static void SetUseEmergencyMalloc();
  static void ResetUseEmergencyMalloc();
  static bool IsUseEmergencyMalloc();
  static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Adds to *total_bytes the total number of bytes used by all thread heaps.
  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
  // and this function will increment each element of class_count by the number
  // of items in all thread-local freelists of the corresponding size class.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
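
  // Illustrative use of GetThreadStats (a sketch, not code from this
  // library; it assumes the caller already holds Static::pageheap_lock):
  //
  //   uint64_t total_bytes = 0;
  //   uint64_t class_count[kNumClasses] = { 0 };
  //   ThreadCache::GetThreadStats(&total_bytes, class_count);
  //   // total_bytes now includes every thread heap; class_count[cl] holds
  //   // the number of free objects cached for size class cl.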

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void* list_;  // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

    void* Next() {
      return SLL_Next(&list_);
    }

    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      SLL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };
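
  // Illustrative FreeList behavior (a sketch; it assumes each pushed pointer
  // refers to a free object large enough to hold the embedded next-pointer
  // that the SLL_* helpers require):
  //
  //   FreeList fl;
  //   fl.Init();
  //   fl.Push(a);
  //   fl.Push(b);            // length() == 2; the list is LIFO, so b is the head
  //   void* p = fl.Pop();    // returns b; lowwatermark() tracks the minimum
  //                          // length seen, a measure of how much of the
  //                          // list went unused between scavenges.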

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock() is held.
  void IncreaseCacheLimitLocked();
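
  // Illustrative accounting for the cache-limit steal described above (a
  // sketch; kStealAmount is a constant from common.h).  Stealing moves
  // budget between threads, not cached memory itself:
  //
  //   unclaimed_cache_space_ -= kStealAmount;  // or: victim->max_size_ -= kStealAmount;
  //   max_size_ += kStealAmount;               // this thread may now cache more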

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  //
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  //
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE___ATTRIBUTE__
#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
#else
#define ATTR_INITIAL_EXEC
#endif

#ifdef HAVE_TLS
  struct ThreadLocalData {
    ThreadCache* heap;
    // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise.
    // The latter is the common case and allows allocation to be faster
    // than it would be otherwise: typically a single branch will
    // determine that the requested allocation is no more than kMaxSize
    // and we can then proceed, knowing that global and thread-local tcmalloc
    // state is initialized.
    size_t min_size_for_slow_path;

    bool use_emergency_malloc;
    size_t old_min_size_for_slow_path;
  };
  static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
#endif
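
  // Illustrative fast-path test enabled by min_size_for_slow_path (a
  // sketch, not the actual tcmalloc entry point).  Because the field is 0
  // until the heap exists, one comparison routes both "cache not yet
  // created" and "size > kMaxSize" requests to the slow path:
  //
  //   if (UNLIKELY(size >= ThreadCache::MinSizeForSlowPath())) {
  //     return do_malloc_slow(size);  // hypothetical slow-path helper
  //   }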

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  Represents
  // the next ThreadCache from which a thread over its max_size_ should
  // steal memory limit.  Round-robin through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t    size_;                  // Combined size of data
  size_t    max_size_;              // size_ > max_size_ --> Scavenge()

  // We sample allocations, biased by the size of the allocation
  Sampler   sampler_;               // A sampler

  FreeList  list_[kNumClasses];     // Array indexed by size-class

  pthread_t tid_;                   // Which thread owns it
  bool      in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap.  REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned.  This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps.
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
#ifndef NO_TCMALLOC_SAMPLES
  return UNLIKELY(FLAGS_tcmalloc_sample_parameter > 0) &&
      sampler_.SampleAllocation(k);
#else
  return false;
#endif
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (UNLIKELY(list->empty())) {
    return FetchFromCentralCache(cl, size);
  }
  size_ -= size;
  return list->Pop();
}

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class. A more comprehensive (and expensive) test would be to walk
  // the entire freelist. But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and in that case we need a single branch
  // because of the bitwise-or trick that follows.
  if (UNLIKELY((list_headroom | size_headroom) < 0)) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}
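
// A note on the bitwise-or trick above: for two ssize_t values a and b,
// (a | b) < 0 exactly when the sign bit of either operand is set, i.e.
// when a < 0 || b < 0, so both headroom checks collapse into one branch.
// An illustrative (hypothetical) check:
//
//   ssize_t a = -1, b = 5;
//   ASSERT(((a | b) < 0) == (a < 0 || b < 0));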

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  return threadlocal_data_.heap;
#else
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
#ifdef HAVE_TLS
  ASSERT(threadlocal_data_.heap);
  return threadlocal_data_.heap;
#else
  ASSERT(perftools_pthread_getspecific(heap_key_));
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}
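
// Illustrative call site for GetCache (a sketch, not the actual tcmalloc
// fast path; 'size' and 'cl' are assumed to come from Static::sizemap()):
//
//   ThreadCache* heap = ThreadCache::GetCache();  // never returns NULL;
//                                                 // creates the cache if needed
//   void* ptr = heap->Allocate(size, cl);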

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
#ifndef HAVE_TLS
  if (!tsd_inited_) return NULL;
#endif
  return GetThreadHeap();
}

inline size_t ThreadCache::MinSizeForSlowPath() {
#ifdef HAVE_TLS
  return threadlocal_data_.min_size_for_slow_path;
#else
  return 0;
#endif
}

inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
#ifdef HAVE_TLS
  threadlocal_data_.min_size_for_slow_path = size;
#endif
}

inline void ThreadCache::SetUseEmergencyMalloc() {
#ifdef HAVE_TLS
  threadlocal_data_.old_min_size_for_slow_path =
      threadlocal_data_.min_size_for_slow_path;
  threadlocal_data_.min_size_for_slow_path = 0;
  threadlocal_data_.use_emergency_malloc = true;
#endif
}

inline void ThreadCache::ResetUseEmergencyMalloc() {
#ifdef HAVE_TLS
  threadlocal_data_.min_size_for_slow_path =
      threadlocal_data_.old_min_size_for_slow_path;
  threadlocal_data_.use_emergency_malloc = false;
#endif
}

inline bool ThreadCache::IsUseEmergencyMalloc() {
#if defined(HAVE_TLS) && defined(ENABLE_EMERGENCY_MALLOC)
  return UNLIKELY(threadlocal_data_.use_emergency_malloc);
#else
  return false;
#endif
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_