kmp_dispatch.cpp

00001 /*
00002  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
00003  * $Revision: 42195 $
00004  * $Date: 2013-03-27 16:10:35 -0500 (Wed, 27 Mar 2013) $
00005  */
00006 
00007 /* <copyright>
00008     Copyright (c) 1997-2013 Intel Corporation.  All Rights Reserved.
00009 
00010     Redistribution and use in source and binary forms, with or without
00011     modification, are permitted provided that the following conditions
00012     are met:
00013 
00014       * Redistributions of source code must retain the above copyright
00015         notice, this list of conditions and the following disclaimer.
00016       * Redistributions in binary form must reproduce the above copyright
00017         notice, this list of conditions and the following disclaimer in the
00018         documentation and/or other materials provided with the distribution.
00019       * Neither the name of Intel Corporation nor the names of its
00020         contributors may be used to endorse or promote products derived
00021         from this software without specific prior written permission.
00022 
00023     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00024     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00025     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00026     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00027     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00028     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00029     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00030     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00031     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00032     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00033     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00034 
00035 
00036 ------------------------------------------------------------------------
00037 
00038     Portions of this software are protected under the following patents:
00039         U.S. Patent 5,812,852
00040         U.S. Patent 6,792,599
00041         U.S. Patent 7,069,556
00042         U.S. Patent 7,328,433
00043         U.S. Patent 7,500,242
00044 
00045 </copyright> */
00046 
00047 /*
00048  * Dynamic scheduling initialization and dispatch.
00049  *
00050  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
00051  *       it may change between parallel regions.  __kmp_max_nth
00052  *       is the largest value __kmp_nth may take, and 1 is the smallest.
00053  *
00054  */
00055 
00056 /* ------------------------------------------------------------------------ */
00057 /* ------------------------------------------------------------------------ */
00058 
00059 #include "kmp.h"
00060 #include "kmp_i18n.h"
00061 #include "kmp_str.h"
00062 #include "kmp_error.h"
00063 #if KMP_OS_WINDOWS && KMP_ARCH_X86
00064     #include <float.h>
00065 #endif
00066 
00067 #if OMPT_SUPPORT
00068 #include "ompt-internal.h"
00069 #endif
00070 
00071 /* ------------------------------------------------------------------------ */
00072 /* ------------------------------------------------------------------------ */
00073 
00074 #ifdef KMP_STATIC_STEAL_ENABLED
00075 
00076     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
00077     template< typename T >
00078     struct dispatch_private_infoXX_template {
00079         typedef typename traits_t< T >::unsigned_t  UT;
00080         typedef typename traits_t< T >::signed_t    ST;
00081         UT count;                // unsigned
00082         T  ub;
00083         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
00084         T  lb;
00085         ST st;                   // signed
00086         UT tc;                   // unsigned
00087         T  static_steal_counter; // for static_steal only; maybe better to put after ub
00088 
00089         /* parm[1-4] are used in different ways by different scheduling algorithms */
00090 
00091         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
00092         //    a) parm3 is properly aligned and
00093         //    b) all parm1-4 are in the same cache line.
00094         // Because parm1-4 are used together, performance seems to be better
00095         // if they are in the same cache line (not measured, though).
00096 
00097         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
00098             T  parm1;
00099             T  parm2;
00100             T  parm3;
00101             T  parm4;
00102         };
00103 
00104         UT ordered_lower; // unsigned
00105         UT ordered_upper; // unsigned
00106         #if KMP_OS_WINDOWS
00107         T  last_upper;
00108         #endif /* KMP_OS_WINDOWS */
00109     };
00110 
00111 #else /* KMP_STATIC_STEAL_ENABLED */
00112 
00113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
00114     template< typename T >
00115     struct dispatch_private_infoXX_template {
00116         typedef typename traits_t< T >::unsigned_t  UT;
00117         typedef typename traits_t< T >::signed_t    ST;
00118         T  lb;
00119         T  ub;
00120         ST st;            // signed
00121         UT tc;            // unsigned
00122 
00123         T  parm1;
00124         T  parm2;
00125         T  parm3;
00126         T  parm4;
00127 
00128         UT count;         // unsigned
00129 
00130         UT ordered_lower; // unsigned
00131         UT ordered_upper; // unsigned
00132         #if KMP_OS_WINDOWS
00133     T  last_upper;
00134         #endif /* KMP_OS_WINDOWS */
00135     };
00136 
00137 #endif /* KMP_STATIC_STEAL_ENABLED */
00138 
00139 // replaces dispatch_private_info structure and dispatch_private_info_t type
00140 template< typename T >
00141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
00142     // duplicate alignment here, otherwise size of structure is not correct in our compiler
00143     union KMP_ALIGN_CACHE private_info_tmpl {
00144         dispatch_private_infoXX_template< T > p;
00145         dispatch_private_info64_t             p64;
00146     } u;
00147     enum sched_type schedule;  /* scheduling algorithm */
00148     kmp_uint32      ordered;   /* ordered clause specified */
00149     kmp_uint32      ordered_bumped;
00150     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // padding to retain the structure size
00151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
00152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
00153     kmp_uint32      type_size;
00154     enum cons_type  pushed_ws;
00155 };
00156 
00157 
00158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
00159 template< typename UT >
00160 struct dispatch_shared_infoXX_template {
00161     /* chunk index under dynamic, number of idle threads under static-steal;
00162        iteration index otherwise */
00163     volatile UT     iteration;
00164     volatile UT     num_done;
00165     volatile UT     ordered_iteration;
00166     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
00167 };
00168 
00169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
00170 template< typename UT >
00171 struct dispatch_shared_info_template {
00172     // we need union here to keep the structure size
00173     union shared_info_tmpl {
00174         dispatch_shared_infoXX_template< UT >  s;
00175         dispatch_shared_info64_t               s64;
00176     } u;
00177     volatile kmp_uint32     buffer_index;
00178 };
00179 
00180 /* ------------------------------------------------------------------------ */
00181 /* ------------------------------------------------------------------------ */
00182 
00183 static void
00184 __kmp_static_delay( int arg )
00185 {
00186     /* Work around weird code-gen bug that causes assert to trip */
00187     #if KMP_ARCH_X86_64 && KMP_OS_LINUX
00188     #else
00189         KMP_ASSERT( arg >= 0 );
00190     #endif
00191 }
00192 
00193 static void
00194 __kmp_static_yield( int arg )
00195 {
00196     __kmp_yield( arg );
00197 }
00198 
00199 #undef USE_TEST_LOCKS
00200 
00201 // test_then_add template (general template should NOT be used)
00202 template< typename T >
00203 static __forceinline T
00204 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
00205 
00206 template<>
00207 __forceinline kmp_int32
00208 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
00209 {
00210     kmp_int32 r;
00211     r = KMP_TEST_THEN_ADD32( p, d );
00212     return r;
00213 }
00214 
00215 template<>
00216 __forceinline kmp_int64
00217 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
00218 {
00219     kmp_int64 r;
00220     r = KMP_TEST_THEN_ADD64( p, d );
00221     return r;
00222 }
00223 
00224 // test_then_inc_acq template (general template should NOT be used)
00225 template< typename T >
00226 static __forceinline T
00227 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
00228 
00229 template<>
00230 __forceinline kmp_int32
00231 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
00232 {
00233     kmp_int32 r;
00234     r = KMP_TEST_THEN_INC_ACQ32( p );
00235     return r;
00236 }
00237 
00238 template<>
00239 __forceinline kmp_int64
00240 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
00241 {
00242     kmp_int64 r;
00243     r = KMP_TEST_THEN_INC_ACQ64( p );
00244     return r;
00245 }
00246 
00247 // test_then_inc template (general template should NOT be used)
00248 template< typename T >
00249 static __forceinline T
00250 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
00251 
00252 template<>
00253 __forceinline kmp_int32
00254 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
00255 {
00256     kmp_int32 r;
00257     r = KMP_TEST_THEN_INC32( p );
00258     return r;
00259 }
00260 
00261 template<>
00262 __forceinline kmp_int64
00263 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
00264 {
00265     kmp_int64 r;
00266     r = KMP_TEST_THEN_INC64( p );
00267     return r;
00268 }
00269 
00270 // compare_and_swap template (general template should NOT be used)
00271 template< typename T >
00272 static __forceinline kmp_int32
00273 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
00274 
00275 template<>
00276 __forceinline kmp_int32
00277 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
00278 {
00279     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
00280 }
00281 
00282 template<>
00283 __forceinline kmp_int32
00284 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
00285 {
00286     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
00287 }
00288 
00289 /*
00290     Spin wait loop that first does pause, then yield.
00291     Waits until function returns non-zero when called with *spinner and check.
00292     Does NOT put threads to sleep.
00293 */
00294 template< typename UT >
00295 // ToDo: make inline function (move to header file for icl)
00296 static UT  // unsigned 4- or 8-byte type
00297 __kmp_wait_yield( volatile UT * spinner,
00298                   UT            checker,
00299                   kmp_uint32 (* pred)( UT, UT )
00300                   )
00301 {
00302     // note: we may not belong to a team at this point
00303     register volatile UT         * spin          = spinner;
00304     register          UT           check         = checker;
00305     register          kmp_uint32   spins;
00306     register          kmp_uint32 (*f) ( UT, UT ) = pred;
00307     register          UT           r;
00308 
00309     KMP_INIT_YIELD( spins );
00310     // main wait spin loop
00311     while(!f(r = *spin, check)) {
00312         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
00313            It causes problems with infinite recursion because of exit lock */
00314         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
00315             __kmp_abort_thread(); */
00316 
00317         __kmp_static_delay(TRUE);
00318 
00319         // If we are oversubscribed, or have waited a bit
00320         // (and KMP_LIBRARY=throughput), then yield.
00321         // The pause is in the following code.
00322         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
00323         KMP_YIELD_SPIN( spins );
00324     }
00325     return r;
00326 }
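/*
    Usage note: __kmp_wait_yield() is the generic spin-wait primitive used later in
    this file, e.g. to wait for an ordered predecessor or for a dispatch buffer to
    become free:

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > );
        __kmp_wait_yield< kmp_uint32 >( &sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > );

    It returns the last value read from *spinner once pred( value, checker ) becomes
    non-zero.
*/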
00327 
00328 template< typename UT >
00329 static kmp_uint32 __kmp_eq( UT value, UT checker) {
00330     return value == checker;
00331 }
00332 
00333 template< typename UT >
00334 static kmp_uint32 __kmp_neq( UT value, UT checker) {
00335     return value != checker;
00336 }
00337 
00338 template< typename UT >
00339 static kmp_uint32 __kmp_lt( UT value, UT checker) {
00340     return value < checker;
00341 }
00342 
00343 template< typename UT >
00344 static kmp_uint32 __kmp_ge( UT value, UT checker) {
00345     return value >= checker;
00346 }
00347 
00348 template< typename UT >
00349 static kmp_uint32 __kmp_le( UT value, UT checker) {
00350     return value <= checker;
00351 }
00352 
00353 
00354 /* ------------------------------------------------------------------------ */
00355 /* ------------------------------------------------------------------------ */
00356 
00357 static void
00358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
00359 {
00360     kmp_info_t *th;
00361 
00362     KMP_DEBUG_ASSERT( gtid_ref );
00363 
00364     if ( __kmp_env_consistency_check ) {
00365         th = __kmp_threads[*gtid_ref];
00366         if ( th -> th.th_root -> r.r_active
00367           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
00368             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
00369         }
00370     }
00371 }
00372 
00373 template< typename UT >
00374 static void
00375 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
00376 {
00377     typedef typename traits_t< UT >::signed_t    ST;
00378     dispatch_private_info_template< UT > * pr;
00379 
00380     int gtid = *gtid_ref;
00381 //    int  cid = *cid_ref;
00382     kmp_info_t *th = __kmp_threads[ gtid ];
00383     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
00384 
00385     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
00386     if ( __kmp_env_consistency_check ) {
00387         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
00388             ( th -> th.th_dispatch -> th_dispatch_pr_current );
00389         if ( pr -> pushed_ws != ct_none ) {
00390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
00391         }
00392     }
00393 
00394     if ( ! th -> th.th_team -> t.t_serialized ) {
00395         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
00396             ( th -> th.th_dispatch -> th_dispatch_sh_current );
00397         UT  lower;
00398 
00399         if ( ! __kmp_env_consistency_check ) {
00400                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
00401                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
00402         }
00403         lower = pr->u.p.ordered_lower;
00404 
00405         #if ! defined( KMP_GOMP_COMPAT )
00406             if ( __kmp_env_consistency_check ) {
00407                 if ( pr->ordered_bumped ) {
00408                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
00409                     __kmp_error_construct2(
00410                         kmp_i18n_msg_CnsMultipleNesting,
00411                         ct_ordered_in_pdo, loc_ref,
00412                         & p->stack_data[ p->w_top ]
00413                     );
00414                 }
00415             }
00416         #endif /* !defined(KMP_GOMP_COMPAT) */
00417 
00418         KMP_MB();
00419         #ifdef KMP_DEBUG
00420         {
00421             const char * buff;
00422             // create format specifiers before the debug output
00423             buff = __kmp_str_format(
00424                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
00425                 traits_t< UT >::spec, traits_t< UT >::spec );
00426             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
00427             __kmp_str_free( &buff );
00428         }
00429         #endif
00430 
00431         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
00432                                 );
00433         KMP_MB();  /* is this necessary? */
00434         #ifdef KMP_DEBUG
00435         {
00436             const char * buff;
00437             // create format specifiers before the debug output
00438             buff = __kmp_str_format(
00439                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
00440                 traits_t< UT >::spec, traits_t< UT >::spec );
00441             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
00442             __kmp_str_free( &buff );
00443         }
00444         #endif
00445     }
00446     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
00447 }
00448 
00449 static void
00450 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
00451 {
00452     kmp_info_t *th;
00453 
00454     if ( __kmp_env_consistency_check ) {
00455         th = __kmp_threads[*gtid_ref];
00456         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
00457             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
00458         }
00459     }
00460 }
00461 
00462 template< typename UT >
00463 static void
00464 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
00465 {
00466     typedef typename traits_t< UT >::signed_t    ST;
00467     dispatch_private_info_template< UT > * pr;
00468 
00469     int gtid = *gtid_ref;
00470 //    int  cid = *cid_ref;
00471     kmp_info_t *th = __kmp_threads[ gtid ];
00472     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
00473 
00474     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
00475     if ( __kmp_env_consistency_check ) {
00476         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
00477             ( th -> th.th_dispatch -> th_dispatch_pr_current );
00478         if ( pr -> pushed_ws != ct_none ) {
00479             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
00480         }
00481     }
00482 
00483     if ( ! th -> th.th_team -> t.t_serialized ) {
00484         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
00485             ( th -> th.th_dispatch -> th_dispatch_sh_current );
00486 
00487         if ( ! __kmp_env_consistency_check ) {
00488             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
00489                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
00490         }
00491 
00492         #if ! defined( KMP_GOMP_COMPAT )
00493             if ( __kmp_env_consistency_check ) {
00494                 if ( pr->ordered_bumped != 0 ) {
00495                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
00496                     /* How to test it? - OM */
00497                     __kmp_error_construct2(
00498                         kmp_i18n_msg_CnsMultipleNesting,
00499                         ct_ordered_in_pdo, loc_ref,
00500                         & p->stack_data[ p->w_top ]
00501                     );
00502                 }
00503             }
00504         #endif /* !defined(KMP_GOMP_COMPAT) */
00505 
00506         KMP_MB();       /* Flush all pending memory write invalidates.  */
00507 
00508         pr->ordered_bumped += 1;
00509 
00510         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
00511                         gtid, pr->ordered_bumped ) );
00512 
00513         KMP_MB();       /* Flush all pending memory write invalidates.  */
00514 
00515         /* TODO use general release procedure? */
00516         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
00517 
00518         KMP_MB();       /* Flush all pending memory write invalidates.  */
00519     }
00520     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
00521 }
00522 
00523 /* Computes and returns x to the power of y, where y must be a non-negative integer */
00524 template< typename UT >
00525 static __forceinline long double
00526 __kmp_pow(long double x, UT y) {
00527     long double s=1.0L;
00528 
00529     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
00530     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
00531     while(y) {
00532         if ( y & 1 )
00533             s *= x;
00534         x *= x;
00535         y >>= 1;
00536     }
00537     return s;
00538 }
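/*
    Worked example (numbers chosen purely for illustration): __kmp_pow() uses binary
    exponentiation, so it needs about log2(y) multiplications. For x = 0.75, y = 5 (binary 101):
        bit 0 set:   s = 1.0  * 0.75       = 0.75;         x becomes 0.5625
        bit 1 clear:                                        x becomes 0.31640625
        bit 2 set:   s = 0.75 * 0.31640625 = 0.2373046875
    which equals 0.75^5. The debug assert above restricts x to (0,1), matching its use
    below with the guided-scheduling base (2*nproc - 1)/(2*nproc).
*/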
00539 
00540 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
00541    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
00542    __forceinline appears to be broken here: if this function is force-inlined, the behavior is wrong
00543    (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
00544 */
00545 template< typename T >
00546 static __inline typename traits_t< T >::unsigned_t
00547 __kmp_dispatch_guided_remaining(
00548     T                                  tc,
00549     typename traits_t< T >::floating_t base,
00550     typename traits_t< T >::unsigned_t idx
00551 ) {
00552     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
00553        least for ICL 8.1, long double arithmetic may not really have 
00554        long double precision, even with /Qlong_double.  Currently, we
00555        workaround that in the caller code, by manipulating the FPCW for
00556        Windows* OS on IA-32 architecture.  The lack of precision is not
00557        expected to be a correctness issue, though.
00558     */
00559     typedef typename traits_t< T >::unsigned_t  UT;
00560 
00561     long double x = tc * __kmp_pow< UT >(base, idx);
00562     UT r = (UT) x;
00563     if ( x == r )
00564         return r;
00565     return r + 1;
00566 }
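/*
    Worked example (numbers chosen purely for illustration): the function returns
    ceil( tc * base^idx ). With tc = 1000 and nproc = 4, the guided-analytical base is
    x = 1 - 0.5/nproc = 0.875, so after idx = 3 chunks have been assigned:
        1000 * 0.875^3 = 669.921875  ->  670 iterations remain unassigned.
*/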
00567 
00568 // Parameters of the guided-iterative algorithm:
00569 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
00570 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
00571 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
00572 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
00573 static int guided_int_param = 2;
00574 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
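/*
    Worked example (numbers chosen purely for illustration): with the default n = 2,
    nproc = 4 and chunk = 7:
        p2 = 2 * 4 * (7 + 1) = 64      // switch to plain dynamic once ~64 iterations remain
        p3 = 1.0 / (2 * 4)   = 0.125   // each grab is roughly remaining * p3
    These values are stored below as pr->u.p.parm2 and (as a double) pr->u.p.parm3 in the
    kmp_sch_guided_iterative_chunked case.
*/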
00575 
00576 // UT - unsigned flavor of T, ST - signed flavor of T,
00577 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
00578 template< typename T >
00579 static void
00580 __kmp_dispatch_init(
00581     ident_t                        * loc,
00582     int                              gtid,
00583     enum sched_type                  schedule,
00584     T                                lb,
00585     T                                ub,
00586     typename traits_t< T >::signed_t st,
00587     typename traits_t< T >::signed_t chunk,
00588     int                              push_ws
00589 ) {
00590     typedef typename traits_t< T >::unsigned_t  UT;
00591     typedef typename traits_t< T >::signed_t    ST;
00592     typedef typename traits_t< T >::floating_t  DBL;
00593     static const int ___kmp_size_type = sizeof( UT );
00594 
00595     int                                            active;
00596     T                                              tc;
00597     kmp_info_t *                                   th;
00598     kmp_team_t *                                   team;
00599     kmp_uint32                                     my_buffer_index;
00600     dispatch_private_info_template< T >          * pr;
00601     dispatch_shared_info_template< UT > volatile * sh;
00602 
00603     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
00604     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
00605 
00606     if ( ! TCR_4( __kmp_init_parallel ) )
00607         __kmp_parallel_initialize();
00608 
00609     #ifdef KMP_DEBUG
00610     {
00611         const char * buff;
00612         // create format specifiers before the debug output
00613         buff = __kmp_str_format(
00614             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
00615             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
00616         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
00617         __kmp_str_free( &buff );
00618     }
00619     #endif
00620     /* setup data */
00621     th     = __kmp_threads[ gtid ];
00622     team   = th -> th.th_team;
00623     active = ! team -> t.t_serialized;
00624     th->th.th_ident = loc;
00625 
00626     if ( ! active ) {
00627         pr = reinterpret_cast< dispatch_private_info_template< T >* >
00628             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
00629     } else {
00630         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
00631                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
00632 
00633         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
00634 
00635         /* What happens when the number of threads changes? Do we need to resize the buffer? */
00636         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
00637             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
00638         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
00639             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
00640     }
00641 
00642     /* Pick up the nomerge/ordered bits from the scheduling type */
00643     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
00644         pr->nomerge = TRUE;
00645         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
00646     } else {
00647         pr->nomerge = FALSE;
00648     }
00649     pr->type_size = ___kmp_size_type; // remember the size of variables
00650     if ( kmp_ord_lower & schedule ) {
00651         pr->ordered = TRUE;
00652         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
00653     } else {
00654         pr->ordered = FALSE;
00655     }
00656     if ( schedule == kmp_sch_static ) {
00657         schedule = __kmp_static;
00658     } else {
00659         if ( schedule == kmp_sch_runtime ) {
00660             #if OMP_30_ENABLED
00661                 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
00662                 schedule = team -> t.t_sched.r_sched_type;
00663                 // Detail the schedule if needed (global controls are differentiated appropriately)
00664                 if ( schedule == kmp_sch_guided_chunked ) {
00665                     schedule = __kmp_guided;
00666                 } else if ( schedule == kmp_sch_static ) {
00667                     schedule = __kmp_static;
00668                 }
00669                 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
00670                 chunk = team -> t.t_sched.chunk;
00671             #else
00672                 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
00673                 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
00674                 schedule = r_sched.r_sched_type;
00675                 chunk    = r_sched.chunk;
00676             #endif
00677 
00678             #ifdef KMP_DEBUG
00679             {
00680                 const char * buff;
00681                 // create format specifiers before the debug output
00682                 buff = __kmp_str_format(
00683                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
00684                     traits_t< ST >::spec );
00685                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
00686                 __kmp_str_free( &buff );
00687             }
00688             #endif
00689         } else {
00690             if ( schedule == kmp_sch_guided_chunked ) {
00691                 schedule = __kmp_guided;
00692             }
00693             if ( chunk <= 0 ) {
00694                 chunk = KMP_DEFAULT_CHUNK;
00695             }
00696         }
00697 
00698         #if OMP_30_ENABLED
00699         if ( schedule == kmp_sch_auto ) {
00700             // mapping and differentiation: in the __kmp_do_serial_initialize()
00701             schedule = __kmp_auto;
00702             #ifdef KMP_DEBUG
00703             {
00704                 const char * buff;
00705                 // create format specifiers before the debug output
00706                 buff = __kmp_str_format(
00707                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
00708                     traits_t< ST >::spec );
00709                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
00710                 __kmp_str_free( &buff );
00711             }
00712             #endif
00713         }
00714         #endif // OMP_30_ENABLED
00715 
00716         /* guided analytical is not safe for too many threads */
00717         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
00718             schedule = kmp_sch_guided_iterative_chunked;
00719             KMP_WARNING( DispatchManyThreads );
00720         }
00721         pr->u.p.parm1 = chunk;
00722     }
00723     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
00724                 "unknown scheduling type" );
00725 
00726     pr->u.p.count = 0;
00727 
00728     if ( __kmp_env_consistency_check ) {
00729         if ( st == 0 ) {
00730             __kmp_error_construct(
00731                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
00732                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
00733             );
00734         }
00735     }
00736 
00737     tc = ( ub - lb + st );
00738     if ( st != 1 ) {
00739         if ( st < 0 ) {
00740             if ( lb < ub ) {
00741                 tc = 0;            // zero-trip
00742             } else {   // lb >= ub
00743                 tc = (ST)tc / st;  // convert to signed division
00744             }
00745         } else {       // st > 0
00746             if ( ub < lb ) {
00747                 tc = 0;            // zero-trip
00748             } else {   // ub >= lb
00749                 tc /= st;
00750             }
00751         }
00752     } else if ( ub < lb ) {        // st == 1
00753         tc = 0;                    // zero-trip
00754     }
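    /*
        Worked examples for the trip-count computation above (values chosen purely for
        illustration):
            lb = 0, ub = 9, st = 3    ->  tc = (9 - 0 + 3) / 3   = 4    (iterations 0,3,6,9)
            lb = 9, ub = 0, st = -3   ->  tc = (0 - 9 - 3) / -3  = 4    (iterations 9,6,3,0)
            lb = 0, ub = 9, st = 1    ->  tc = 9 - 0 + 1         = 10
            lb = 5, ub = 4, st = 1    ->  tc = 0                        (zero-trip)
    */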
00755 
00756     pr->u.p.lb = lb;
00757     pr->u.p.ub = ub;
00758     pr->u.p.st = st;
00759     pr->u.p.tc = tc;
00760 
00761     #if KMP_OS_WINDOWS
00762     pr->u.p.last_upper = ub + st;
00763     #endif /* KMP_OS_WINDOWS */
00764 
00765     /* NOTE: only active parallel regions have active ordered sections */
00766 
00767     if ( active ) {
00768         if ( pr->ordered == 0 ) {
00769             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
00770             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
00771         } else {
00772             pr->ordered_bumped = 0;
00773 
00774             pr->u.p.ordered_lower = 1;
00775             pr->u.p.ordered_upper = 0;
00776 
00777             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
00778             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
00779         }
00780     }
00781 
00782     if ( __kmp_env_consistency_check ) {
00783         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
00784         if ( push_ws ) {
00785             __kmp_push_workshare( gtid, ws, loc );
00786             pr->pushed_ws = ws;
00787         } else {
00788             __kmp_check_workshare( gtid, ws, loc );
00789             pr->pushed_ws = ct_none;
00790         }
00791     }
00792 
00793     switch ( schedule ) {
00794     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
00795     case kmp_sch_static_steal:
00796         {
00797             T nproc = team->t.t_nproc;
00798             T ntc, init;
00799 
00800             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
00801 
00802             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
00803             if ( nproc > 1 && ntc >= nproc ) {
00804                 T id = __kmp_tid_from_gtid(gtid);
00805                 T small_chunk, extras;
00806 
00807                 small_chunk = ntc / nproc;
00808                 extras = ntc % nproc;
00809 
00810                 init = id * small_chunk + ( id < extras ? id : extras );
00811                 pr->u.p.count = init;
00812                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
00813 
00814                 pr->u.p.parm2 = lb;
00815                 //pr->pfields.parm3 = 0; // it's not used in static_steal
00816                 pr->u.p.parm4 = id;
00817                 pr->u.p.st = st;
00818                 break;
00819             } else {
00820                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
00821                                gtid ) );
00822                 schedule = kmp_sch_static_balanced;
00823                 /* too few iterations: fall-through to kmp_sch_static_balanced */
00824             } // if
00825             /* FALL-THROUGH to static balanced */
00826         } // case
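    /*
        Worked example for the static_steal setup above (values chosen purely for
        illustration): with tc = 100, chunk = 10 (so ntc = 10 chunks) and nproc = 4:
            small_chunk = 10 / 4 = 2,   extras = 10 % 4 = 2
            id 0: count = 0, ub = 3     (chunks 0..2)
            id 1: count = 3, ub = 6     (chunks 3..5)
            id 2: count = 6, ub = 8     (chunks 6..7)
            id 3: count = 8, ub = 10    (chunks 8..9)
        i.e. the first 'extras' threads get one extra chunk; threads that finish early
        may later steal chunks from another thread's [count, ub) range.
    */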
00827     #endif
00828     case kmp_sch_static_balanced:
00829         {
00830             T nproc = team->t.t_nproc;
00831             T init, limit;
00832 
00833             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
00834                             gtid ) );
00835 
00836             if ( nproc > 1 ) {
00837                 T id = __kmp_tid_from_gtid(gtid);
00838 
00839                 if ( tc < nproc ) {
00840                     if ( id < tc ) {
00841                         init = id;
00842                         limit = id;
00843                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
00844                     } else {
00845                         pr->u.p.count = 1;  /* means no more chunks to execute */
00846                         pr->u.p.parm1 = FALSE;
00847                         break;
00848                     }
00849                 } else {
00850                     T small_chunk = tc / nproc;
00851                     T extras = tc % nproc;
00852                     init = id * small_chunk + (id < extras ? id : extras);
00853                     limit = init + small_chunk - (id < extras ? 0 : 1);
00854                     pr->u.p.parm1 = (id == nproc - 1);
00855                 }
00856             } else {
00857                 if ( tc > 0 ) {
00858                     init = 0;
00859                     limit = tc - 1;
00860                     pr->u.p.parm1 = TRUE;
00861                 } else {
00862                     // zero trip count
00863                     pr->u.p.count = 1;  /* means no more chunks to execute */
00864                     pr->u.p.parm1 = FALSE;
00865                     break;
00866                 }
00867             }
00868             if ( st == 1 ) {
00869                 pr->u.p.lb = lb + init;
00870                 pr->u.p.ub = lb + limit;
00871             } else {
00872                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
00873                 pr->u.p.lb = lb + init * st;
00874                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
00875                 if ( st > 0 ) {
00876                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
00877                 } else {
00878                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
00879                 }
00880             }
00881             if ( pr->ordered ) {
00882                 pr->u.p.ordered_lower = init;
00883                 pr->u.p.ordered_upper = limit;
00884             }
00885             break;
00886         } // case
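    /*
        Worked example for the static_balanced case above (values chosen purely for
        illustration): with tc = 10, nproc = 4, lb = 0 and st = 1:
            small_chunk = 2, extras = 2
            id 0: iterations 0..2     id 1: iterations 3..5
            id 2: iterations 6..7     id 3: iterations 8..9
        and parm1 (the lastprivate flag) is TRUE only for id == nproc - 1.
    */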
00887     case kmp_sch_guided_iterative_chunked :
00888         {
00889             int nproc = team->t.t_nproc;
00890             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
00891 
00892             if ( nproc > 1 ) {
00893                 if ( (2UL * chunk + 1 ) * nproc >= tc ) {
00894                     /* chunk size too large, switch to dynamic */
00895                     schedule = kmp_sch_dynamic_chunked;
00896                 } else {
00897                     // when remaining iterations become fewer than parm2, switch to dynamic
00898                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
00899                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
00900                 }
00901             } else {
00902                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
00903                 schedule = kmp_sch_static_greedy;
00904                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
00905                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
00906                 pr->u.p.parm1 = tc;
00907             } // if
00908         } // case
00909         break;
00910     case kmp_sch_guided_analytical_chunked:
00911         {
00912             int nproc = team->t.t_nproc;
00913             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
00914 
00915             if ( nproc > 1 ) {
00916                 if ( (2UL * chunk + 1 ) * nproc >= tc ) {
00917                     /* chunk size too large, switch to dynamic */
00918                     schedule = kmp_sch_dynamic_chunked;
00919                 } else {
00920                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
00921                     DBL x;
00922 
00923                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
00924                     /* Linux* OS already has 64-bit computation by default for
00925                long double, and on Windows* OS on Intel(R) 64,
00926                /Qlong_double doesn't work.  On Windows* OS 
00927                on IA-32 architecture, we need to set precision to
00928                64-bit instead of the default 53-bit. Even though long 
00929                double doesn't work on Windows* OS on Intel(R) 64, the
00930                resulting lack of precision is not expected to impact 
00931                the correctness of the algorithm, but this has not been
00932                mathematically proven.
00933                     */
00934                     // save original FPCW and set precision to 64-bit, as 
00935                     // Windows* OS on IA-32 architecture defaults to 53-bit
00936                     unsigned int oldFpcw = _control87(0,0x30000);
00937                     #endif
00938                     /* value used for comparison in solver for cross-over point */
00939                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
00940 
00941                     /* crossover point--chunk indexes equal to or greater than
00942                        this point switch to dynamic-style scheduling */
00943                     UT   cross;
00944 
00945                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
00946                     x = (long double)1.0 - (long double)0.5 / nproc;
00947 
00948                     #ifdef KMP_DEBUG
00949                     { // test natural alignment
00950                         struct _test_a {
00951                             char a;
00952                             union {
00953                                 char b;
00954                                 DBL  d;
00955                             };
00956                         } t;
00957                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
00958                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
00959                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
00960                     }
00961                     #endif // KMP_DEBUG
00962 
00963                     /* save the term in thread private dispatch structure */
00964                     *(DBL*)&pr->u.p.parm3 = x;
00965 
00966                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
00967                     {
00968                         UT          left, right, mid;
00969                         long double p;
00970 
00971                         /* estimate initial upper and lower bound */
00972 
00973                         /* it doesn't matter what value 'right' starts with, as long as it
00974                            is positive; it only affects the performance of the solver
00975                         */
00976                         right = 229;
00977                         p = __kmp_pow< UT >(x,right);
00978                         if ( p > target ) {
00979                             do{
00980                                 p *= p;
00981                                 right <<= 1;
00982                             } while(p>target && right < (1<<27));
00983                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
00984                         } else {
00985                             left = 0;
00986                         }
00987 
00988                         /* bisection root-finding method */
00989                         while ( left + 1 < right ) {
00990                             mid = (left + right) / 2;
00991                             if ( __kmp_pow< UT >(x,mid) > target ) {
00992                                 left = mid;
00993                             } else {
00994                                 right = mid;
00995                             }
00996                         } // while
00997                         cross = right;
00998                     }
00999                     /* assert sanity of computed crossover point */
01000                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
01001 
01002                     /* save the crossover point in thread private dispatch structure */
01003                     pr->u.p.parm2 = cross;
01004 
01005                     // C75803
01006                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
01007                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
01008                     #else
01009                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
01010                     #endif
01011                     /* dynamic-style scheduling offset */
01012                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
01013                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
01014                         // restore FPCW
01015                         _control87(oldFpcw,0x30000);
01016                     #endif
01017                 } // if
01018             } else {
01019                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
01020                                gtid ) );
01021                 schedule = kmp_sch_static_greedy;
01022                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
01023                 pr->u.p.parm1 = tc;
01024             } // if
01025         } // case
01026         break;
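    /*
        Worked example for the crossover computation above (values chosen purely for
        illustration): with nproc = 4, chunk = 7 and tc = 1000:
            x      = 1 - 0.5/4            = 0.875
            target = (2*7 + 1) * 4 / 1000 = 0.06
        The bisection finds the smallest cross with x^cross <= target:
            0.875^21 ~ 0.0606 > 0.06,   0.875^22 ~ 0.0530 <= 0.06   ->  cross = 22
        so chunk indexes >= 22 switch to dynamic-style scheduling with chunk size 7.
    */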
01027     case kmp_sch_static_greedy:
01028         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
01029             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
01030                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
01031                 tc;
01032         break;
01033     case kmp_sch_static_chunked :
01034     case kmp_sch_dynamic_chunked :
01035         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
01036         break;
01037     case kmp_sch_trapezoidal :
01038         {
01039             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
01040 
01041             T parm1, parm2, parm3, parm4;
01042             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
01043 
01044             parm1 = chunk;
01045 
01046             /* F : size of the first cycle */
01047             parm2 = ( tc / (2 * team->t.t_nproc) );
01048 
01049             if ( parm2 < 1 ) {
01050                 parm2 = 1;
01051             }
01052 
01053             /* L : size of the last cycle.  Make sure the last cycle
01054              *     is not larger than the first cycle.
01055              */
01056             if ( parm1 < 1 ) {
01057                 parm1 = 1;
01058             } else if ( parm1 > parm2 ) {
01059                 parm1 = parm2;
01060             }
01061 
01062             /* N : number of cycles */
01063             parm3 = ( parm2 + parm1 );
01064             parm3 = ( 2 * tc + parm3 - 1) / parm3;
01065 
01066             if ( parm3 < 2 ) {
01067                 parm3 = 2;
01068             }
01069 
01070             /* sigma : decreasing incr of the trapezoid */
01071             parm4 = ( parm3 - 1 );
01072             parm4 = ( parm2 - parm1 ) / parm4;
01073 
01074             // pointless check, because parm4 >= 0 always
01075             //if ( parm4 < 0 ) {
01076             //    parm4 = 0;
01077             //}
01078 
01079             pr->u.p.parm1 = parm1;
01080             pr->u.p.parm2 = parm2;
01081             pr->u.p.parm3 = parm3;
01082             pr->u.p.parm4 = parm4;
01083         } // case
01084         break;
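    /*
        Worked example for the trapezoid parameters above (values chosen purely for
        illustration): with tc = 1000, nproc = 4 and chunk = 1:
            parm2 (first cycle size)  = 1000 / (2*4)              = 125
            parm1 (last cycle size)   = chunk, clamped to [1,125] = 1
            parm3 (number of cycles)  = (2*1000 + 126 - 1) / 126  = 16
            parm4 (decrement)         = (125 - 1) / (16 - 1)      = 8
        so successive chunks are 125, 117, 109, ... shrinking by 8 each cycle, covering at
        least tc iterations (16 * (125 + 5) / 2 = 1040 >= 1000).
    */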
01085 
01086     default:
01087         {
01088             __kmp_msg(
01089                 kmp_ms_fatal,                        // Severity
01090                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
01091                 KMP_HNT( GetNewerLibrary ),          // Hint
01092                 __kmp_msg_null                       // Variadic argument list terminator
01093             );
01094         }
01095         break;
01096     } // switch
01097     pr->schedule = schedule;
01098     if ( active ) {
01099         /* The buffer is free to use when sh->buffer_index reaches my_buffer_index */
01100 
01101         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
01102                         gtid, my_buffer_index, sh->buffer_index) );
01103         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
01104                                         );
01105             // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
01106             // *always* 32-bit integers.
01107         KMP_MB();  /* is this necessary? */
01108         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
01109                         gtid, my_buffer_index, sh->buffer_index) );
01110 
01111         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
01112         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
01113     }; // if
01114     #ifdef KMP_DEBUG
01115     {
01116         const char * buff;
01117         // create format specifiers before the debug output
01118         buff = __kmp_str_format(
01119             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
01120             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
01121             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
01122             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
01123             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
01124             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
01125             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
01126         KD_TRACE(10, ( buff,
01127             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
01128             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
01129             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
01130             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
01131         __kmp_str_free( &buff );
01132     }
01133     #endif
01134     #if ( KMP_STATIC_STEAL_ENABLED )
01135     if ( ___kmp_size_type < 8 ) {
01136       // It cannot be guaranteed that after execution of a loop with some other schedule kind
01137       // all the parm3 variables will contain the same value.
01138       // Even if all parm3 values happened to be the same, a bad case would still exist, e.g. using 0 and 1
01139       // rather than a program-lifetime increment.
01140       // So a dedicated variable is required; 'static_steal_counter' is used.
01141       if( schedule == kmp_sch_static_steal ) {
01142         // Other threads will inspect this variable when searching for a victim.
01143         // This is a flag showing that, from now on, other threads may steal from this thread.
01144         volatile T * p = &pr->u.p.static_steal_counter;
01145         *p = *p + 1;
01146       }
01147     }
01148     #endif // ( KMP_STATIC_STEAL_ENABLED )
01149 #if OMPT_SUPPORT
01150     int  tid = __kmp_tid_from_gtid( gtid );
01151     if ((ompt_status == ompt_status_track_callback)) {
01152       if (ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
01153         ompt_callbacks.ompt_callback(ompt_event_loop_begin)
01154           (team->t.ompt_team_info.parallel_id,
01155        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
01156       }
01157     }
01158 #endif
01159 }
01160 
01161 /*
01162  * For ordered loops, either __kmp_dispatch_finish() should be called after
01163  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
01164  * every chunk of iterations.  If the ordered section(s) were not executed
01165  * for this iteration (or every iteration in this chunk), we need to set the
01166  * ordered iteration counters so that the next thread can proceed.
01167  */
01168 template< typename UT >
01169 static void
01170 __kmp_dispatch_finish( int gtid, ident_t *loc )
01171 {
01172     typedef typename traits_t< UT >::signed_t ST;
01173     kmp_info_t *th = __kmp_threads[ gtid ];
01174 
01175     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
01176     if ( ! th -> th.th_team -> t.t_serialized ) {
01177 
01178         dispatch_private_info_template< UT > * pr =
01179             reinterpret_cast< dispatch_private_info_template< UT >* >
01180             ( th->th.th_dispatch->th_dispatch_pr_current );
01181         dispatch_shared_info_template< UT > volatile * sh =
01182             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
01183             ( th->th.th_dispatch->th_dispatch_sh_current );
01184         KMP_DEBUG_ASSERT( pr );
01185         KMP_DEBUG_ASSERT( sh );
01186         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
01187                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
01188 
01189         if ( pr->ordered_bumped ) {
01190             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
01191                             gtid ) );
01192             pr->ordered_bumped = 0;
01193         } else {
01194             UT lower = pr->u.p.ordered_lower;
01195 
01196             #ifdef KMP_DEBUG
01197             {
01198                 const char * buff;
01199                 // create format specifiers before the debug output
01200                 buff = __kmp_str_format(
01201                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
01202                     traits_t< UT >::spec, traits_t< UT >::spec );
01203                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
01204                 __kmp_str_free( &buff );
01205             }
01206             #endif
01207 
01208             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
01209                                    );
01210             KMP_MB();  /* is this necessary? */
01211             #ifdef KMP_DEBUG
01212             {
01213                 const char * buff;
01214                 // create format specifiers before the debug output
01215                 buff = __kmp_str_format(
01216                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
01217                     traits_t< UT >::spec, traits_t< UT >::spec );
01218                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
01219                 __kmp_str_free( &buff );
01220             }
01221             #endif
01222 
01223             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
01224         } // if
01225     } // if
01226     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
01227 #if OMPT_SUPPORT
01228     kmp_info_t  *this_thr        = __kmp_threads[ gtid ];
01229     kmp_team_t  *team            = this_thr -> th.th_team;
01230     int  tid = __kmp_tid_from_gtid( gtid );
01231     if ((ompt_status == ompt_status_track_callback)) {
01232       if (ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
01233         ompt_callbacks.ompt_callback(ompt_event_loop_end)
01234       (team->t.ompt_team_info.parallel_id,
01235        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
01236       }
01237     }
01238 #endif
01239 }
01240 
01241 #ifdef KMP_GOMP_COMPAT
01242 
01243 template< typename UT >
01244 static void
01245 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
01246 {
01247     typedef typename traits_t< UT >::signed_t ST;
01248     kmp_info_t *th = __kmp_threads[ gtid ];
01249 
01250     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
01251     if ( ! th -> th.th_team -> t.t_serialized ) {
01252 //        int cid;
01253         dispatch_private_info_template< UT > * pr =
01254             reinterpret_cast< dispatch_private_info_template< UT >* >
01255             ( th->th.th_dispatch->th_dispatch_pr_current );
01256         dispatch_shared_info_template< UT > volatile * sh =
01257             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
01258             ( th->th.th_dispatch->th_dispatch_sh_current );
01259         KMP_DEBUG_ASSERT( pr );
01260         KMP_DEBUG_ASSERT( sh );
01261         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
01262                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
01263 
01264 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
01265             UT lower = pr->u.p.ordered_lower;
01266             UT upper = pr->u.p.ordered_upper;
01267             UT inc = upper - lower + 1;
01268 
01269             if ( pr->ordered_bumped == inc ) {
01270                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
01271                   gtid ) );
01272                 pr->ordered_bumped = 0;
01273             } else {
01274                 inc -= pr->ordered_bumped;
01275 
01276                 #ifdef KMP_DEBUG
01277                 {
01278                     const char * buff;
01279                     // create format specifiers before the debug output
01280                     buff = __kmp_str_format(
01281                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
01282                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
01283                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
01284                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
01285                     __kmp_str_free( &buff );
01286                 }
01287                 #endif
01288 
01289                 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower,
01290                                         __kmp_ge< UT > );
01291 
01292                 KMP_MB();  /* is this necessary? */
01293                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
01294                   gtid ) );
01295                 pr->ordered_bumped = 0;
01296 
01297                 #ifdef KMP_DEBUG
01298                 {
01299                     const char * buff;
01300                     // create format specifiers before the debug output
01301                     buff = __kmp_str_format(
01302                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
01303                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
01304                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
01305                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
01306                     __kmp_str_free( &buff );
01307                 }
01308                 #endif
01309 
01310                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
01311             }
01312 //        }
01313     }
01314     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
01315 }
01316 
01317 #endif /* KMP_GOMP_COMPAT */
01318 
01319 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
01320  * (no more work), then tell OMPT the loop is over. In some cases
01321  * kmp_dispatch_fini() is not called. */
01322 #if OMPT_SUPPORT
01323 #define OMPT_LOOP_END \
01324     if (status == 0) { \
01325       kmp_info_t  *this_thr        = __kmp_threads[ gtid ]; \
01326       kmp_team_t  *team            = this_thr -> th.th_team; \
01327       int  tid = __kmp_tid_from_gtid( gtid ); \
01328       if ( ompt_status == ompt_status_track_callback ) { \
01329         if (ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
01330           ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
01331             team->t.ompt_team_info.parallel_id, \
01332             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); \
01333         } \
01334       } \
01335     }
01336 #else
01337 #define OMPT_LOOP_END // no-op
01338 #endif
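      // Illustrative usage note (added; not from the original source): OMPT_LOOP_END is expanded
      // immediately before each "return status;" in __kmp_dispatch_next() below, e.g.
      //
      //     status = ...;      // 0 => no more chunks for this thread
      //     OMPT_LOOP_END;     // reports ompt_event_loop_end exactly when status == 0
      //     return status;
      //
      // so the loop-end event is delivered even on paths where __kmpc_dispatch_fini_*() is
      // never called.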
01339 
01340 template< typename T >
01341 static int
01342 __kmp_dispatch_next(
01343     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
01344 ) {
01345 
01346     typedef typename traits_t< T >::unsigned_t  UT;
01347     typedef typename traits_t< T >::signed_t    ST;
01348     typedef typename traits_t< T >::floating_t  DBL;
01349     static const int ___kmp_size_type = sizeof( UT );
01350 
01351     int                                   status;
01352     dispatch_private_info_template< T > * pr;
01353     kmp_info_t                          * th   = __kmp_threads[ gtid ];
01354     kmp_team_t                          * team = th -> th.th_team;
01355 
01356     #ifdef KMP_DEBUG
01357     {
01358         const char * buff;
01359         // create format specifiers before the debug output
01360         buff = __kmp_str_format(
01361             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
01362             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
01363         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
01364         __kmp_str_free( &buff );
01365     }
01366     #endif
01367 
01368     if ( team -> t.t_serialized ) {
01369         /* NOTE: serialize this dispatch because we are not at the active level */
01370         pr = reinterpret_cast< dispatch_private_info_template< T >* >
01371             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
01372         KMP_DEBUG_ASSERT( pr );
01373 
01374         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
01375             *p_lb = 0;
01376             *p_ub = 0;
01377             if ( p_st != 0 ) {
01378                 *p_st = 0;
01379             }
01380             if ( __kmp_env_consistency_check ) {
01381                 if ( pr->pushed_ws != ct_none ) {
01382                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
01383                 }
01384             }
01385         } else if ( pr->nomerge ) {
01386             kmp_int32 last;
01387             T         start;
01388             UT        limit, trip, init;
01389             ST        incr;
01390             T         chunk = pr->u.p.parm1;
01391 
01392             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
01393 
01394             init = chunk * pr->u.p.count++;
01395             trip = pr->u.p.tc - 1;
01396 
01397             if ( (status = (init <= trip)) == 0 ) {
01398                 *p_lb = 0;
01399                 *p_ub = 0;
01400                 if ( p_st != 0 ) *p_st = 0;
01401                 if ( __kmp_env_consistency_check ) {
01402                     if ( pr->pushed_ws != ct_none ) {
01403                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
01404                     }
01405                 }
01406             } else {
01407                 start = pr->u.p.lb;
01408                 limit = chunk + init - 1;
01409                 incr  = pr->u.p.st;
01410 
01411                 if ( (last = (limit >= trip)) != 0 ) {
01412                     limit = trip;
01413                     #if KMP_OS_WINDOWS
01414                     pr->u.p.last_upper = pr->u.p.ub;
01415                     #endif /* KMP_OS_WINDOWS */
01416                 }
01417                 if ( p_last ) {
01418                     *p_last = last;
01419                 }
01420                 if ( p_st != 0 ) {
01421                     *p_st = incr;
01422                 }
01423                 if ( incr == 1 ) {
01424                     *p_lb = start + init;
01425                     *p_ub = start + limit;
01426                 } else {
01427                     *p_lb = start + init * incr;
01428                     *p_ub = start + limit * incr;
01429                 }
01430 
01431                 if ( pr->ordered ) {
01432                     pr->u.p.ordered_lower = init;
01433                     pr->u.p.ordered_upper = limit;
01434                     #ifdef KMP_DEBUG
01435                     {
01436                         const char * buff;
01437                         // create format specifiers before the debug output
01438                         buff = __kmp_str_format(
01439                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01440                             traits_t< UT >::spec, traits_t< UT >::spec );
01441                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01442                         __kmp_str_free( &buff );
01443                     }
01444                     #endif
01445                 } // if
01446             } // if
01447         } else {
01448             pr->u.p.tc = 0;
01449 
01450             *p_lb = pr->u.p.lb;
01451             *p_ub = pr->u.p.ub;
01452             #if KMP_OS_WINDOWS
01453             pr->u.p.last_upper = *p_ub;
01454             #endif /* KMP_OS_WINDOWS */
01455 
01456             if ( p_st != 0 ) {
01457                 *p_st = pr->u.p.st;
01458             }
01459             if ( p_last ) {
01460                 *p_last = TRUE;
01461             }
01462         } // if
01463         #ifdef KMP_DEBUG
01464         {
01465             const char * buff;
01466             // create format specifiers before the debug output
01467             buff = __kmp_str_format(
01468                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
01469                 "p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
01470                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
01471             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status) );
01472             __kmp_str_free( &buff );
01473         }
01474         #endif
01475         OMPT_LOOP_END;
01476         return status;
01477     } else {
01478         kmp_int32 last = 0;
01479         dispatch_shared_info_template< UT > *sh;
01480         T         start;
01481         ST        incr;
01482         UT        limit, trip, init;
01483 
01484         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
01485                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
01486 
01487         pr = reinterpret_cast< dispatch_private_info_template< T >* >
01488             ( th->th.th_dispatch->th_dispatch_pr_current );
01489         KMP_DEBUG_ASSERT( pr );
01490         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
01491             ( th->th.th_dispatch->th_dispatch_sh_current );
01492         KMP_DEBUG_ASSERT( sh );
01493 
01494         if ( pr->u.p.tc == 0 ) {
01495             // zero trip count
01496             status = 0;
01497         } else {
01498             switch (pr->schedule) {
01499             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
01500             case kmp_sch_static_steal:
01501                 {
01502                     T chunk = pr->u.p.parm1;
01503 
01504                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
01505 
01506                     trip = pr->u.p.tc - 1;
01507 
01508                     if ( ___kmp_size_type > 4 ) {
01509                         // Other threads do not look at this thread's data,
01510                         //  so no volatile cast is necessary here.
01511                         init   = ( pr->u.p.count )++;
01512                         status = ( init < pr->u.p.ub );
01513                     } else {
01514                         typedef union {
01515                             struct {
01516                                 UT count;
01517                                 T  ub;
01518                             } p;
01519                             kmp_int64 b;
01520                         } union_i4;
01521                         // All operations on 'count' or 'ub' must be combined atomically together.
01522                         // stealing implemented only for 4-byte indexes
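                              // Added explanation (a sketch of the invariant the code below relies on):
                              // packing 'count' and 'ub' into a single 64-bit word lets one CAS update
                              // the pair atomically, so the owner taking its next chunk and a thief
                              // shrinking 'ub' can never observe a torn { count, ub } combination.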
01523                         {
01524                             union_i4 vold, vnew;
01525                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
01526                             vnew = vold;
01527                             vnew.p.count++;
01528                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
01529                                         ( volatile kmp_int64* )&pr->u.p.count,
01530                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
01531                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
01532                                 KMP_CPU_PAUSE();
01533                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
01534                                 vnew = vold;
01535                                 vnew.p.count++;
01536                             }
01537                             vnew = vold;
01538                             init   = vnew.p.count;
01539                             status = ( init < vnew.p.ub ) ;
01540                         }
01541 
01542                         if( !status ) {
01543                             kmp_info_t   **other_threads = team->t.t_threads;
01544                             int          while_limit = 10;
01545                             int          while_index = 0;
01546 
01547                             // TODO: algorithm of searching for a victim
01548                             // should be cleaned up and measured
01549                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
01550                                 union_i4  vold, vnew;
01551                                 kmp_int32 remaining; // kmp_int32 because stealing is only implemented for 4-byte (KMP_I4) indexes
01552                                 T         victimIdx    = pr->u.p.parm4;
01553                                 T         oldVictimIdx = victimIdx;
01554                                 dispatch_private_info_template< T > * victim;
01555 
01556                                 do {
01557                                     if( !victimIdx ) {
01558                                         victimIdx = team->t.t_nproc - 1;
01559                                     } else {
01560                                         --victimIdx;
01561                                     }
01562                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
01563                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
01564                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
01565                                 // TODO: think about a proper place of this test
01566                                 if ( ( !victim ) ||
01567                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
01568                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
01569                                     // TODO: delay would be nice
01570                                     continue;
01571                                     // the victim is not ready yet to participate in stealing
01572                                     // because the victim is still in kmp_init_dispatch
01573                                 }
01574                                 if ( oldVictimIdx == victimIdx ) {
01575                                     break;
01576                                 }
01577                                 pr->u.p.parm4 = victimIdx;
01578 
01579                                 while( 1 ) {
01580                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
01581                                     vnew = vold;
01582 
01583                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * chunk <= trip );
01584                                     if ( vnew.p.count >= vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
01585                                         break;
01586                                     }
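                                          // Added note: steal roughly a quarter of the victim's remaining
                                          // chunks (remaining >= 4 here, so at least one chunk is taken).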
01587                                     vnew.p.ub -= (remaining >> 2);
01588                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * chunk <= trip);
01589                                     #pragma warning( push )
01590                                     // disable warning on pointless comparison of unsigned with 0
01591                                     #pragma warning( disable: 186 )
01592                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
01593                                     #pragma warning( pop )
01594                                     // TODO: Should this be acquire or release?
01595                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
01596                                             ( volatile kmp_int64 * )&victim->u.p.count,
01597                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
01598                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
01599                                         status = 1;
01600                                         while_index = 0;
01601                                         // now update own count and ub
01602                                         #if KMP_ARCH_X86 
01603                                         // stealing executed on non-KMP_ARCH_X86 only
01604                                             // Atomic 64-bit write on ia32 is
01605                                             // unavailable, so we do this in steps.
01606                                             //     This code is not tested.
01607                                             init = vold.p.count;
01608                                             pr->u.p.ub = 0;
01609                                             pr->u.p.count = init + 1;
01610                                             pr->u.p.ub = vnew.p.count;
01611                                         #else
01612                                             init = vnew.p.ub;
01613                                             vold.p.count = init + 1;
01614                                             // TODO: is it safe and enough?
01615                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
01616                                         #endif // KMP_ARCH_X86
01617                                         break;
01618                                     } // if
01619                                 KMP_CPU_PAUSE();
01620                                 } // while (1)
01621                             } // while
01622                         } // if
01623                     } // if
01624                     if ( !status ) {
01625                         *p_lb = 0;
01626                         *p_ub = 0;
01627                         if ( p_st != 0 ) *p_st = 0;
01628                     } else {
01629                         start = pr->u.p.parm2;
01630                         init *= chunk;
01631                         limit = chunk + init - 1;
01632                         incr  = pr->u.p.st;
01633 
01634                         KMP_DEBUG_ASSERT(init <= trip);
01635                         if ( (last = (limit >= trip)) != 0 )
01636                             limit = trip;
01637                         if ( p_last ) {
01638                             *p_last = last;
01639                         }
01640                         if ( p_st != 0 ) *p_st = incr;
01641 
01642                         if ( incr == 1 ) {
01643                             *p_lb = start + init;
01644                             *p_ub = start + limit;
01645                         } else {
01646                             *p_lb = start + init * incr;
01647                             *p_ub = start + limit * incr;
01648                         }
01649 
01650                         if ( pr->ordered ) {
01651                             pr->u.p.ordered_lower = init;
01652                             pr->u.p.ordered_upper = limit;
01653                             #ifdef KMP_DEBUG
01654                             {
01655                                 const char * buff;
01656                                 // create format specifiers before the debug output
01657                                 buff = __kmp_str_format(
01658                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01659                                     traits_t< UT >::spec, traits_t< UT >::spec );
01660                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01661                                 __kmp_str_free( &buff );
01662                             }
01663                             #endif
01664                         } // if
01665                     } // if
01666                     break;
01667                 } // case
01668             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
01669             case kmp_sch_static_balanced:
01670                 {
01671                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
01672                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
01673                         pr->u.p.count = 1;
01674                         *p_lb = pr->u.p.lb;
01675                         *p_ub = pr->u.p.ub;
01676                         last = pr->u.p.parm1;
01677                         if ( p_last ) {
01678                             *p_last = last;
01679                         }
01680                         if ( p_st )
01681                             *p_st = pr->u.p.st;
01682                     } else {  /* no iterations to do */
01683                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
01684                     }
01685                     if ( pr->ordered ) {
01686                         #ifdef KMP_DEBUG
01687                         {
01688                             const char * buff;
01689                             // create format specifiers before the debug output
01690                             buff = __kmp_str_format(
01691                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01692                                 traits_t< UT >::spec, traits_t< UT >::spec );
01693                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01694                             __kmp_str_free( &buff );
01695                         }
01696                         #endif
01697                     } // if
01698                 } // case
01699                 break;
01700             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
01701             case kmp_sch_static_chunked:
01702                 {
01703                     T parm1;
01704 
01705                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
01706                                    gtid ) );
01707                     parm1 = pr->u.p.parm1;
01708 
01709                     trip  = pr->u.p.tc - 1;
01710                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
01711 
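                          // Added note: classic cyclic (static, chunked) distribution -- thread 'tid'
                          // takes chunks tid, tid + nproc, tid + 2*nproc, ... of parm1 iterations each;
                          // u.p.count is advanced by t_nproc below each time a chunk is handed out.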
01712                     if ( (status = (init <= trip)) != 0 ) {
01713                         start = pr->u.p.lb;
01714                         incr  = pr->u.p.st;
01715                         limit = parm1 + init - 1;
01716 
01717                         if ( (last = (limit >= trip)) != 0 )
01718                             limit = trip;
01719 
01720                         if ( p_last ) {
01721                             *p_last = last;
01722                         }
01723                         if ( p_st != 0 ) *p_st = incr;
01724 
01725                         pr->u.p.count += team->t.t_nproc;
01726 
01727                         if ( incr == 1 ) {
01728                             *p_lb = start + init;
01729                             *p_ub = start + limit;
01730                         }
01731                         else {
01732                             *p_lb = start + init * incr;
01733                             *p_ub = start + limit * incr;
01734                         }
01735 
01736                         if ( pr->ordered ) {
01737                             pr->u.p.ordered_lower = init;
01738                             pr->u.p.ordered_upper = limit;
01739                             #ifdef KMP_DEBUG
01740                             {
01741                                 const char * buff;
01742                                 // create format specifiers before the debug output
01743                                 buff = __kmp_str_format(
01744                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01745                                     traits_t< UT >::spec, traits_t< UT >::spec );
01746                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01747                                 __kmp_str_free( &buff );
01748                             }
01749                             #endif
01750                         } // if
01751                     } // if
01752                 } // case
01753                 break;
01754 
01755             case kmp_sch_dynamic_chunked:
01756                 {
01757                     T chunk = pr->u.p.parm1;
01758 
01759                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
01760                                    gtid ) );
01761 
01762                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
01763                     trip = pr->u.p.tc - 1;
01764 
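                          // Worked example (added; numbers are illustrative only): with chunk == 4 and
                          // tc == 10 (so trip == 9), successive winners of the fetch-and-increment get
                          // init = 0, 4, 8, 12, ...; once init > trip the caller sees status == 0.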
01765                     if ( (status = (init <= trip)) == 0 ) {
01766                         *p_lb = 0;
01767                         *p_ub = 0;
01768                         if ( p_st != 0 ) *p_st = 0;
01769                     } else {
01770                         start = pr->u.p.lb;
01771                         limit = chunk + init - 1;
01772                         incr  = pr->u.p.st;
01773 
01774                         if ( (last = (limit >= trip)) != 0 )
01775                             limit = trip;
01776                         if ( p_last ) {
01777                             *p_last = last;
01778                         }
01779                         if ( p_st != 0 ) *p_st = incr;
01780 
01781                         if ( incr == 1 ) {
01782                             *p_lb = start + init;
01783                             *p_ub = start + limit;
01784                         } else {
01785                             *p_lb = start + init * incr;
01786                             *p_ub = start + limit * incr;
01787                         }
01788 
01789                         if ( pr->ordered ) {
01790                             pr->u.p.ordered_lower = init;
01791                             pr->u.p.ordered_upper = limit;
01792                             #ifdef KMP_DEBUG
01793                             {
01794                                 const char * buff;
01795                                 // create format specifiers before the debug output
01796                                 buff = __kmp_str_format(
01797                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01798                                     traits_t< UT >::spec, traits_t< UT >::spec );
01799                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01800                                 __kmp_str_free( &buff );
01801                             }
01802                             #endif
01803                         } // if
01804                     } // if
01805                 } // case
01806                 break;
01807 
01808             case kmp_sch_guided_iterative_chunked:
01809                 {
01810                     T  chunkspec = pr->u.p.parm1;
01811                     KD_TRACE(100,
01812                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
01813                     trip  = pr->u.p.tc;
01814                     // Start atomic part of calculations
01815                     while(1) {
01816                         ST  remaining;             // signed, because can be < 0
01817                         init = sh->u.s.iteration;  // shared value
01818                         remaining = trip - init;
01819                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
01820                             // nothing to do, don't try atomic op
01821                             status = 0;
01822                             break;
01823                         }
01824                         if ( remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
01825                             // use dynamic-style schedule
01826                             // atomically increment iterations, get old value
01827                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
01828                             remaining = trip - init;
01829                             if (remaining <= 0) {
01830                                 status = 0;    // all iterations got by other threads
01831                             } else {
01832                                 // got some iterations to work on
01833                                 status = 1;
01834                                 if ( remaining > chunkspec ) {
01835                                     limit = init + chunkspec - 1;
01836                                 } else {
01837                                     last = 1;   // the last chunk
01838                                     limit = init + remaining - 1;
01839                                 } // if
01840                             } // if
01841                             break;
01842                         } // if
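                              // Added sketch (assumption: parm3 was initialized to 1/(K*nproc) and is
                              // stored bit-wise in the T-sized parm slot, hence the *(double*) reinterpret
                              // below): while plenty of work remains, each grab claims about
                              // remaining/(K*nproc) iterations by CAS-ing the shared iteration counter
                              // from 'init' to 'limit'; if the CAS fails, another thread advanced the
                              // counter and the loop simply recomputes from the fresh shared value.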
01843                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
01844                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
01845                             // CAS was successful, chunk obtained
01846                             status = 1;
01847                             --limit;
01848                             break;
01849                         } // if
01850                     } // while
01851                     if ( status != 0 ) {
01852                         start = pr->u.p.lb;
01853                         incr = pr->u.p.st;
01854                         if ( p_st != NULL )
01855                             *p_st = incr;
01856                         if ( p_last != NULL )
01857                             *p_last = last;
01858                         *p_lb = start + init * incr;
01859                         *p_ub = start + limit * incr;
01860                         if ( pr->ordered ) {
01861                             pr->u.p.ordered_lower = init;
01862                             pr->u.p.ordered_upper = limit;
01863                             #ifdef KMP_DEBUG
01864                             {
01865                                 const char * buff;
01866                                 // create format specifiers before the debug output
01867                                 buff = __kmp_str_format(
01868                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01869                                     traits_t< UT >::spec, traits_t< UT >::spec );
01870                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01871                                 __kmp_str_free( &buff );
01872                             }
01873                             #endif
01874                         } // if
01875                     } else {
01876                         *p_lb = 0;
01877                         *p_ub = 0;
01878                         if ( p_st != NULL )
01879                             *p_st = 0;
01880                     } // if
01881                 } // case
01882                 break;
01883 
01884             case kmp_sch_guided_analytical_chunked:
01885                 {
01886                     T   chunkspec = pr->u.p.parm1;
01887                     UT chunkIdx;
01888     #if KMP_OS_WINDOWS && KMP_ARCH_X86
01889                     /* for storing original FPCW value for Windows* OS on
01890                        IA-32 architecture 8-byte version */
01891                     unsigned int oldFpcw;
01892                     int fpcwSet = 0;
01893     #endif
01894                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
01895                                    gtid ) );
01896 
01897                     trip  = pr->u.p.tc;
01898 
01899                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
01900                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * team->t.t_nproc < trip);
01901 
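                          // Added overview (hedged; the parm2/parm3 meanings are inferred from their
                          // use below): chunks are numbered by the shared iteration counter. While
                          // chunkIdx < parm2 the chunk bounds come from the analytically computed
                          // guided schedule (__kmp_dispatch_guided_remaining with the ratio in parm3);
                          // once chunkIdx reaches parm2 the remainder is handed out dynamic-style in
                          // fixed chunks of chunkspec iterations.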
01902                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
01903                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
01904                         if ( chunkIdx >= pr->u.p.parm2 ) {
01905                             --trip;
01906                             /* use dynamic-style scheduling */
01907                             init = chunkIdx * chunkspec + pr->u.p.count;
01908                             /* need to verify init > 0 in case of overflow in the above calculation */
01909                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
01910                                 limit = init + chunkspec -1;
01911 
01912                                 if ( (last = (limit >= trip)) != 0 )
01913                                     limit = trip;
01914                             }
01915                             break;
01916                         } else {
01917                             /* use exponential-style scheduling */
01918                             /* The following check is to workaround the lack of long double precision on Windows* OS.
01919                                This check works around the possible effect that init != 0 for chunkIdx == 0.
01920                              */
01921     #if KMP_OS_WINDOWS && KMP_ARCH_X86
01922                             /* If we haven't already done so, save original
01923                                FPCW and set precision to 64-bit, as Windows* OS
01924                                on IA-32 architecture defaults to 53-bit */
01925                             if ( !fpcwSet ) {
01926                                 oldFpcw = _control87(0,0x30000);
01927                                 fpcwSet = 0x30000;
01928                             }
01929     #endif
01930                             if ( chunkIdx ) {
01931                                 init = __kmp_dispatch_guided_remaining< T >(
01932                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
01933                                 KMP_DEBUG_ASSERT(init);
01934                                 init = trip - init;
01935                             } else
01936                                 init = 0;
01937                             limit = trip - __kmp_dispatch_guided_remaining< T >(
01938                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
01939                             KMP_ASSERT(init <= limit);
01940                             if ( init < limit ) {
01941                                 KMP_DEBUG_ASSERT(limit <= trip);
01942                                 --limit;
01943                                 status = 1;
01944                                 break;
01945                             } // if
01946                         } // if
01947                     } // while (1)
01948     #if KMP_OS_WINDOWS && KMP_ARCH_X86
01949                     /* restore FPCW if necessary */
01950                     if ( oldFpcw & fpcwSet != 0 )
01951                         _control87(oldFpcw,0x30000);
01952     #endif
01953                     if ( status != 0 ) {
01954                         start = pr->u.p.lb;
01955                         incr = pr->u.p.st;
01956                         if ( p_st != NULL )
01957                             *p_st = incr;
01958                         if ( p_last != NULL )
01959                             *p_last = last;
01960                         *p_lb = start + init * incr;
01961                         *p_ub = start + limit * incr;
01962                         if ( pr->ordered ) {
01963                             pr->u.p.ordered_lower = init;
01964                             pr->u.p.ordered_upper = limit;
01965                             #ifdef KMP_DEBUG
01966                             {
01967                                 const char * buff;
01968                                 // create format specifiers before the debug output
01969                                 buff = __kmp_str_format(
01970                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
01971                                     traits_t< UT >::spec, traits_t< UT >::spec );
01972                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
01973                                 __kmp_str_free( &buff );
01974                             }
01975                             #endif
01976                         }
01977                     } else {
01978                         *p_lb = 0;
01979                         *p_ub = 0;
01980                         if ( p_st != NULL )
01981                             *p_st = 0;
01982                     }
01983                 } // case
01984                 break;
01985 
01986             case kmp_sch_trapezoidal:
01987                 {
01988                     UT   index;
01989                     T    parm2 = pr->u.p.parm2;
01990                     T    parm3 = pr->u.p.parm3;
01991                     T    parm4 = pr->u.p.parm4;
01992                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
01993                                    gtid ) );
01994 
01995                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
01996 
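                          // Added derivation (hedged; assumes parm2 is the first/largest chunk size and
                          // parm4 the per-chunk decrement set up in __kmp_dispatch_init): chunk k has
                          // size parm2 - k*parm4, so the first iteration of chunk 'index' is the
                          // arithmetic-series sum
                          //     sum_{k=0..index-1} (parm2 - k*parm4) = index*(2*parm2 - (index-1)*parm4)/2,
                          // which is exactly 'init' below; 'limit' is the same sum over index+1 chunks,
                          // minus one.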
01997                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
01998                     trip = pr->u.p.tc - 1;
01999 
02000                     if ( (status = (index < parm3 && init <= trip)) == 0 ) {
02001                         *p_lb = 0;
02002                         *p_ub = 0;
02003                         if ( p_st != 0 ) *p_st = 0;
02004                     } else {
02005                         start = pr->u.p.lb;
02006                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
02007                         incr  = pr->u.p.st;
02008 
02009                         if ( (last = (limit >= trip)) != 0 )
02010                             limit = trip;
02011 
02012                         if ( p_last != 0 ) {
02013                             *p_last = last;
02014                         }
02015                         if ( p_st != 0 ) *p_st = incr;
02016 
02017                         if ( incr == 1 ) {
02018                             *p_lb = start + init;
02019                             *p_ub = start + limit;
02020                         } else {
02021                             *p_lb = start + init * incr;
02022                             *p_ub = start + limit * incr;
02023                         }
02024 
02025                         if ( pr->ordered ) {
02026                             pr->u.p.ordered_lower = init;
02027                             pr->u.p.ordered_upper = limit;
02028                             #ifdef KMP_DEBUG
02029                             {
02030                                 const char * buff;
02031                                 // create format specifiers before the debug output
02032                                 buff = __kmp_str_format(
02033                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
02034                                     traits_t< UT >::spec, traits_t< UT >::spec );
02035                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
02036                                 __kmp_str_free( &buff );
02037                             }
02038                             #endif
02039                         } // if
02040                     } // if
02041                 } // case
02042                 break;
02043             } // switch
02044         } // if tc == 0;
02045 
02046         if ( status == 0 ) {
02047             UT   num_done;
02048 
02049             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
02050             #ifdef KMP_DEBUG
02051             {
02052                 const char * buff;
02053                 // create format specifiers before the debug output
02054                 buff = __kmp_str_format(
02055                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
02056                     traits_t< UT >::spec );
02057                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
02058                 __kmp_str_free( &buff );
02059             }
02060             #endif
02061 
02062             if ( num_done == team->t.t_nproc-1 ) {
02063                 /* NOTE: release this buffer to be reused */
02064 
02065                 KMP_MB();       /* Flush all pending memory write invalidates.  */
02066 
02067                 sh->u.s.num_done = 0;
02068                 sh->u.s.iteration = 0;
02069 
02070                 /* TODO replace with general release procedure? */
02071                 if ( pr->ordered ) {
02072                     sh->u.s.ordered_iteration = 0;
02073                 }
02074 
02075                 KMP_MB();       /* Flush all pending memory write invalidates.  */
02076 
02077                 sh -> buffer_index += KMP_MAX_DISP_BUF;
02078                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
02079                                 gtid, sh->buffer_index) );
02080 
02081                 KMP_MB();       /* Flush all pending memory write invalidates.  */
02082 
02083             } // if
02084             if ( __kmp_env_consistency_check ) {
02085                 if ( pr->pushed_ws != ct_none ) {
02086                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
02087                 }
02088             }
02089 
02090             th -> th.th_dispatch -> th_deo_fcn = NULL;
02091             th -> th.th_dispatch -> th_dxo_fcn = NULL;
02092             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
02093             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
02094         } // if (status == 0)
02095 #if KMP_OS_WINDOWS
02096         else if ( last ) {
02097             pr->u.p.last_upper = pr->u.p.ub;
02098         }
02099 #endif /* KMP_OS_WINDOWS */
02100     } // if
02101 
02102     #ifdef KMP_DEBUG
02103     {
02104         const char * buff;
02105         // create format specifiers before the debug output
02106         buff = __kmp_str_format(
02107             "__kmp_dispatch_next: T#%%d normal case: " \
02108             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
02109             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
02110         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
02111         __kmp_str_free( &buff );
02112     }
02113     #endif
02114     OMPT_LOOP_END;
02115     return status;
02116 }
02117 
02118 //-----------------------------------------------------------------------------------------
02119 // Dispatch routines
02120 //    Transfer call to template< type T >
02121 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
02122 //                         T lb, T ub, ST st, ST chunk )
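      // Illustrative call sequence (added; a hedged sketch of how a compiler or user typically
      // drives these entry points for a dynamically scheduled loop -- not code from this file):
      //
      //     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, chunk );
      //     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
      //         for ( kmp_int32 i = lb; i <= ub; i += st )
      //             body( i );
      //     }
      //     // __kmpc_dispatch_fini_4() is typically only emitted for ordered loops;
      //     // see the OMPT_LOOP_END note above.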
02123 extern "C" {
02124 
02140 void
02141 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02142                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
02143 {
02144     KMP_DEBUG_ASSERT( __kmp_init_serial );
02145     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
02146 }
02150 void
02151 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02152                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
02153 {
02154     KMP_DEBUG_ASSERT( __kmp_init_serial );
02155     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
02156 }
02157 
02161 void
02162 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02163                         kmp_int64 lb, kmp_int64 ub,
02164                         kmp_int64 st, kmp_int64 chunk )
02165 {
02166     KMP_DEBUG_ASSERT( __kmp_init_serial );
02167     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
02168 }
02169 
02173 void
02174 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02175                          kmp_uint64 lb, kmp_uint64 ub,
02176                          kmp_int64 st, kmp_int64 chunk )
02177 {
02178     KMP_DEBUG_ASSERT( __kmp_init_serial );
02179     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
02180 }
02181 
02194 int
02195 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
02196                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
02197 {
02198     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
02199 }
02200 
02204 int
02205 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
02206                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
02207 {
02208     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
02209 }
02210 
02214 int
02215 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
02216                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
02217 {
02218     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
02219 }
02220 
02224 int
02225 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
02226                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
02227 {
02228     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
02229 }
02230 
02237 void
02238 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
02239 {
02240     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
02241 }
02242 
02246 void
02247 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
02248 {
02249     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
02250 }
02251 
02255 void
02256 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
02257 {
02258     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
02259 }
02260 
02264 void
02265 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
02266 {
02267     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
02268 }
02271 //-----------------------------------------------------------------------------------------
02272 // Non-template routines from kmp_dispatch.cpp used in other sources
02273 
02274 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
02275     return value == checker;
02276 }
02277 
02278 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
02279     return value != checker;
02280 }
02281 
02282 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
02283     return value < checker;
02284 }
02285 
02286 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
02287     return value >= checker;
02288 }
02289 
02290 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
02291     return value <= checker;
02292 }
02293 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
02294     return value == checker;
02295 }
02296 
02297 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
02298     return value != checker;
02299 }
02300 
02301 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
02302     return value < checker;
02303 }
02304 
02305 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
02306     return value >= checker;
02307 }
02308 
02309 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
02310     return value <= checker;
02311 }
02312 
02313 kmp_uint32
02314 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
02315                    kmp_uint32            checker,
02316                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
02317                    , void        * obj    // Higher-level synchronization object, or NULL.
02318                    )
02319 {
02320     // note: we may not belong to a team at this point
02321     register volatile kmp_uint32         * spin          = spinner;
02322     register          kmp_uint32           check         = checker;
02323     register          kmp_uint32   spins;
02324     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
02325     register          kmp_uint32           r;
02326 
02327     KMP_INIT_YIELD( spins );
02328     // main wait spin loop
02329     while(!f(r = TCR_4(*spin), check)) {
02330         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
02331            It causes problems with infinite recursion because of exit lock */
02332         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
02333             __kmp_abort_thread(); */
02334 
02335         __kmp_static_delay(TRUE);
02336 
02337         /* if we have waited a bit, or are oversubscribed, yield */
02338         /* pause is in the following code */
02339         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
02340         KMP_YIELD_SPIN( spins );
02341     }
02342     
02343     return r;
02344 }
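      // Example (added; hypothetical caller): spin until a shared counter satisfies a predicate,
      // yielding when oversubscribed -- e.g. waiting for an ordered iteration to be released:
      //
      //     kmp_uint32 seen = __kmp_wait_yield_4( &sh->u.s.ordered_iteration, lower,
      //                                           __kmp_ge_4, NULL );
      //
      // The predicate is re-evaluated against a fresh read of *spinner on each pass and the
      // last observed value is returned.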
02345 
02346 kmp_uint64
02347 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
02348                     kmp_uint64            checker,
02349                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
02350                     , void        * obj    // Higher-level synchronization object, or NULL.
02351                     )
02352 {
02353     // note: we may not belong to a team at this point
02354     register volatile kmp_uint64         * spin          = spinner;
02355     register          kmp_uint64           check         = checker;
02356     register          kmp_uint32   spins;
02357     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
02358     register          kmp_uint64           r;
02359 
02360     KMP_INIT_YIELD( spins );
02361     // main wait spin loop
02362     while(!f(r = *spin, check))
02363     {
02364         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
02365            It causes problems with infinite recursion because of exit lock */
02366         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
02367             __kmp_abort_thread(); */
02368         __kmp_static_delay(TRUE);
02369 
02370         // if we are oversubscribed,
02371         // or have waited a bit (and KMP_LIBRARY=throughput), then yield
02372         // pause is in the following code
02373         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
02374         KMP_YIELD_SPIN( spins );
02375     }
02376 
02377     return r;
02378 }
02379 
02380 } // extern "C"
02381 
02382 #ifdef KMP_GOMP_COMPAT
02383 
02384 void
02385 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02386                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
02387                            kmp_int32 chunk, int push_ws )
02388 {
02389     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
02390                                       push_ws );
02391 }
02392 
02393 void
02394 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02395                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
02396                             kmp_int32 chunk, int push_ws )
02397 {
02398     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
02399                                        push_ws );
02400 }
02401 
02402 void
02403 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02404                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
02405                            kmp_int64 chunk, int push_ws )
02406 {
02407     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
02408                                       push_ws );
02409 }
02410 
02411 void
02412 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
02413                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
02414                             kmp_int64 chunk, int push_ws )
02415 {
02416     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
02417                                        push_ws );
02418 }
02419 
02420 void
02421 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
02422 {
02423     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
02424 }
02425 
02426 void
02427 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
02428 {
02429     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
02430 }
02431 
02432 void
02433 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
02434 {
02435     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
02436 }
02437 
02438 void
02439 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
02440 {
02441     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
02442 }
02443 
02444 #endif /* KMP_GOMP_COMPAT */
02445 
02446 /* ------------------------------------------------------------------------ */
02447 /* ------------------------------------------------------------------------ */
02448 
