iter_ull.c

Source listing: libgomp work-share iteration routines (loops and sections) for unsigned long long iteration variables.
00001 /* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
00002    Contributed by Richard Henderson <rth@redhat.com>.
00003 
00004    This file is part of the GNU OpenMP Library (libgomp).
00005 
00006    Libgomp is free software; you can redistribute it and/or modify it
00007    under the terms of the GNU General Public License as published by
00008    the Free Software Foundation; either version 3, or (at your option)
00009    any later version.
00010 
00011    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
00012    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00013    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
00014    more details.
00015 
00016    Under Section 7 of GPL version 3, you are granted additional
00017    permissions described in the GCC Runtime Library Exception, version
00018    3.1, as published by the Free Software Foundation.
00019 
00020    You should have received a copy of the GNU General Public License and
00021    a copy of the GCC Runtime Library Exception along with this program;
00022    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023    <http://www.gnu.org/licenses/>.  */
00024 
00025 /* This file contains routines for managing work-share iteration, both
00026    for loops and sections.  */
00027 
00028 #include "libgomp.h"
00029 #include <stdlib.h>
00030 
/* Shorthand for libgomp's unsigned long long iteration type.  */
typedef unsigned long long gomp_ull;
00032 
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  /* A prior call flagged that this thread already received the very
     last iteration of the loop.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      /* Nonzero (done) iff the remaining range is already empty.  */
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, s0, e0, s, e;

      /* Each thread makes at most one trip; a second call means done.  */
      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  The else arm counts a
	 loop running downward; NOTE(review): nonzero ws->mode apparently
	 marks that case -- confirm against the mode setup in loop_ull.c.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      /* Round the per-thread share upward when it doesn't divide evenly.  */
      q += (q * nthreads != n);
      s0 = q * i;
      e0 = s0 + q;
      if (e0 > n)
	e0 = n;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      /* -1 records that this thread received the final iteration.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  static_trip counts how many
	 rounds of nthreads chunks have already been handed out.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      /* Advance to the next round, or mark done (-1) if this chunk
	 reached the end of the iteration space.  */
      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
00139 
00140 
00141 /* This function implements the DYNAMIC scheduling method.  Arguments are
00142    as for gomp_iter_ull_static_next.  This function must be called with
00143    ws->lock held.  */
00144 
00145 bool
00146 gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
00147 {
00148   struct gomp_thread *thr = gomp_thread ();
00149   struct gomp_work_share *ws = thr->ts.work_share;
00150   gomp_ull start, end, chunk, left;
00151 
00152   start = ws->next_ull;
00153   if (start == ws->end_ull)
00154     return false;
00155 
00156   chunk = ws->chunk_size_ull;
00157   left = ws->end_ull - start;
00158   if (__builtin_expect (ws->mode & 2, 0))
00159     {
00160       if (chunk < left)
00161     chunk = left;
00162     }
00163   else
00164     {
00165       if (chunk > left)
00166     chunk = left;
00167     }
00168   end = start + chunk;
00169 
00170   ws->next_ull = end;
00171   *pstart = start;
00172   *pend = end;
00173   return true;
00174 }
00175 
00176 
#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  /* Fast path: when bit 0 of ws->mode is set, one atomic fetch-and-add
     claims the next chunk.  NOTE(review): presumably the mode setup
     guarantees the repeated additions cannot wrap past end here --
     confirm against the ws->mode computation in loop_ull.c.  */
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  /* Forward direction: at or past end means no work left.  */
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Reverse direction (ws->mode bit 1 set): comparisons are
	     mirrored and the chunk end is clamped from below.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: claim [start, start + chunk) with a compare-and-swap
     retry loop, clamping the chunk to the iterations remaining.  */
  start = ws->next_ull;
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Another thread advanced next_ull; retry from its value.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
00251 
00252 
00253 /* This function implements the GUIDED scheduling method.  Arguments are
00254    as for gomp_iter_ull_static_next.  This function must be called with the
00255    work share lock held.  */
00256 
00257 bool
00258 gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
00259 {
00260   struct gomp_thread *thr = gomp_thread ();
00261   struct gomp_work_share *ws = thr->ts.work_share;
00262   struct gomp_team *team = thr->ts.team;
00263   gomp_ull nthreads = team ? team->nthreads : 1;
00264   gomp_ull n, q;
00265   gomp_ull start, end;
00266 
00267   if (ws->next_ull == ws->end_ull)
00268     return false;
00269 
00270   start = ws->next_ull;
00271   if (__builtin_expect (ws->mode, 0) == 0)
00272     n = (ws->end_ull - start) / ws->incr_ull;
00273   else
00274     n = (start - ws->end_ull) / -ws->incr_ull;
00275   q = (n + nthreads - 1) / nthreads;
00276 
00277   if (q < ws->chunk_size_ull)
00278     q = ws->chunk_size_ull;
00279   if (q <= n)
00280     end = start + q * ws->incr_ull;
00281   else
00282     end = ws->end_ull;
00283 
00284   ws->next_ull = end;
00285   *pstart = start;
00286   *pend = end;
00287   return true;
00288 }
00289 
#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = ws->next_ull;
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  /* Compare-and-swap retry loop: recompute the guided share from the
     currently observed start each time the CAS loses a race.  */
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Iterations remaining; the else arm handles a loop running
	 downward (nonzero ws->mode).  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      /* Take a 1/nthreads share of the remainder, rounded up, but
	 never smaller than the requested chunk size.  */
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Another thread advanced next_ull; retry from its value.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */

Generated on Fri Apr 5 05:38:09 2013 for Libgomp by  doxygen 1.4.7