Games Task Scheduler (GTS)
A multi-processor scheduling framework for game engines
1_submissionOnSingleThread.h
1 /*******************************************************************************
2  * Copyright 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files(the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions :
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20  * THE SOFTWARE.
21  ******************************************************************************/
#include <chrono>
#include <cmath>
#include <iostream>

#include "gts/platform/Atomic.h"

#include "gts/micro_scheduler/WorkerPool.h"
#include "gts/micro_scheduler/MicroScheduler.h"
#include "gts/micro_scheduler/patterns/ParallelFor.h"
#include "gts/micro_scheduler/patterns/Range1d.h"
31 
32 using namespace gts;
33 
34 namespace gts_examples {
35 
36 // MORAL: Don't submit singular tasks from the same thread. It causes an O(N)
37 // critical path that limits parallelism, and it causes worst case communication
38 // overhead because all threads work off a single task deque.
39 //
40 // Instead, use divide-and-conquer to recursively subdivide workloads, which
41 // normally (probability-wise) distributes tasks across threads and yields
 * O(lg N) critical paths in expectation.
43 
44 
45 //------------------------------------------------------------------------------
46 // Demonstrates a pattern of task submission that scales poorly.
48 {
49 public:
50 
51  //--------------------------------------------------------------------------
52  template<typename TFunc>
53  class ForTask : public Task
54  {
55  public:
56 
57  //----------------------------------------------------------------------
58  ForTask(TFunc& func, uint32_t start, uint32_t end)
59  : m_func(func)
60  , m_start(start)
61  , m_end(end)
62  {}
63 
64  //----------------------------------------------------------------------
65  virtual GTS_INLINE Task* execute(TaskContext const& ctx) override
66  {
67  m_func(m_start, m_end, ctx);
68  return nullptr;
69  }
70 
71  private:
72 
73  TFunc& m_func;
74  uint32_t m_start;
75  uint32_t m_end;
76  };
77 
78  //--------------------------------------------------------------------------
79  inline BadParallelFor(MicroScheduler& scheduler)
80  : m_scheduler(scheduler)
81  {}
82 
83  //--------------------------------------------------------------------------
84  template<typename TFunc>
85  inline void operator()(uint32_t start, uint32_t end, TFunc func)
86  {
87  // Create one task per item.
88  uint32_t taskCount = end - start;
89 
90  Task* pRootTask = m_scheduler.allocateTask<EmptyTask>();
91  pRootTask->addRef(taskCount, gts::memory_order::relaxed);
92 
93  // Here we are spawning each task on this thread. BAD!
94  for (uint32_t ii = 1; ii < end; ++ii)
95  {
96  Task* pChildTask = m_scheduler.allocateTask<ForTask<TFunc>>(func, ii, ii + 1);
97  pRootTask->addChildTaskWithoutRef(pChildTask);
98  m_scheduler.spawnTask(pChildTask);
99  }
100 
101  func(0, 1, TaskContext{nullptr, OwnedId(0, 0)});
102 
103  pRootTask->waitForAll();
104  m_scheduler.destoryTask(pRootTask);
105  }
106 
107 private:
108 
109  MicroScheduler& m_scheduler;
110 };
111 
112 //------------------------------------------------------------------------------
113 GTS_NO_INLINE void dummyWork(float& f)
114 {
115  // do some dummy work
116  for (int jj = 0; jj < 1000; ++jj)
117  {
118  f = sin(f);
119  }
120 }
121 
122 struct GTS_ALIGN(GTS_NO_SHARING_CACHE_LINE_SIZE) AlignedFloat
123 {
124  explicit AlignedFloat(float f) : f(f) {}
125  float f = 0;
126 };
127 
128 //------------------------------------------------------------------------------
129 void loopOverRange(size_t start, size_t end, gts::Vector<AlignedFloat, AlignedAllocator<GTS_NO_SHARING_CACHE_LINE_SIZE>>& dummyData, uint32_t workerId)
130 {
131  float& f = dummyData[workerId].f;
132  for (size_t ii = start; ii < end; ++ii)
133  {
134  dummyWork(f);
135  }
136 }
137 
138 //------------------------------------------------------------------------------
139 void submissionOnSingleThread()
140 {
141  printf ("================\n");
142  printf ("submissionOnSingleThread\n");
143  printf ("================\n");
144 
145  const uint32_t threadCount = gts::Thread::getHardwareThreadCount();
146  const uint32_t elementCount = threadCount * 100;
147 
148  WorkerPool workerPool;
149  workerPool.initialize(threadCount);
150 
151  MicroScheduler microScheduler;
152  microScheduler.initialize(&workerPool);
153 
154  const size_t sampleCount = 1000;
155 
156  //
157  // BAD:
158  // Uses a parallel for algorithm that only submits tasks from the main thread.
159 
160 
161  uint64_t samples = 0;
162 
163  gts::Vector<AlignedFloat, AlignedAllocator<GTS_NO_SHARING_CACHE_LINE_SIZE>> dummyData(microScheduler.workerCount(), AlignedFloat(0.f));
164 
165  for(size_t ii = 0; ii < sampleCount; ++ii)
166  {
167  auto start = std::chrono::high_resolution_clock::now();
168 
169  BadParallelFor badParallelFor(microScheduler);
170  badParallelFor(0, elementCount,
171  [&dummyData](uint32_t start, uint32_t end, TaskContext const& ctx)
172  {
173  loopOverRange(start, end, dummyData, ctx.workerId.localId());
174  });
175 
176  auto end = std::chrono::high_resolution_clock::now();
177  samples += (end - start).count();
178  }
179  std::cout << "BadParallelFor time: " << samples / sampleCount << std::endl;
180 
181  //
182  // GOOD:
183  // Uses GTS's parallel for algorithm that uses divide-and-conquer to
184  // recursively subdivide workloads.
185 
186  samples = 0;
187 
188  for (size_t ii = 0; ii < sampleCount; ++ii)
189  {
190  auto start = std::chrono::high_resolution_clock::now();
191 
192  ParallelFor goodParallelFor(microScheduler);
193  goodParallelFor(
194  Range1d<size_t>(0, elementCount, 1),
195  [&dummyData](Range1d<size_t>& range, void* pData, TaskContext const& ctx)
196  {
197  loopOverRange(range.begin(), range.end(), dummyData, ctx.workerId.localId());
198  },
200  nullptr);
201 
202  auto end = std::chrono::high_resolution_clock::now();
203  samples += (end - start).count();
204  }
205  std::cout << "GoodParallelFor time: " << samples / sampleCount << std::endl;
206 }
207 
208 } // gts_examples
A empty Task that can be used as dummy or placeholder.
Definition: Task.h:355
A work-stealing task scheduler. The scheduler is executed by the WorkerPool it is initialized with.
Definition: MicroScheduler.h:81
bool initialize(WorkerPool *pWorkerPool)
Initializes the MicroScheduler and attaches it to pWorkPool, where each worker in pWorkPool will exec...
uint32_t workerCount() const
Get the worker count of the WorkerPool this scheduler is attached to.
An ID for owned objects. It indicates the owner's ID, an ID local to the owner, and an overall unique...
Definition: Utils.h:304
GTS_INLINE SubIdType localId() const
Definition: Utils.h:317
A construct that maps parallel-for behavior to a MicroScheduler.
Definition: ParallelFor.h:48
An iteration range over a 1D data set. Splits divide the range in two based unless the minimum size i...
Definition: Range1d.h:56
Recursively splits a range until it is no longer divisible.
Definition: Partitioners.h:46
A Task payload that embeds TFunc and TArgs into the Task's data. It makes it easy to construct a Task...
Definition: Task.h:120
GTS_INLINE int32_t addRef(int32_t count=1, gts::memory_order order=gts::memory_order::seq_cst)
Definition: Task.inl:84
GTS_INLINE void addChildTaskWithoutRef(Task *pChild)
Definition: Task.inl:30
void waitForAll()
A re-sizable array-like ADT. A subset of the STL Vector interface.
Definition: Vector.h:56
A collection of running Worker threads that a MicroScheduler can be run on.
Definition: WorkerPool.h:54
bool initialize(uint32_t threadCount=0)
Definition: 1_submissionOnSingleThread.h:54
Definition: 1_submissionOnSingleThread.h:48
static GTS_INLINE uint32_t getHardwareThreadCount()
Gets the number of logical processor on this system.
Definition: Thread.h:447
The context associated with the task being executed.
Definition: MicroSchedulerTypes.h:54
OwnedId workerId
The ID of the current Worker.
Definition: MicroSchedulerTypes.h:65