libflame  revision_anchor
Functions
FLASH_Queue_main_prototypes.h File Reference

(r)

Go to the source code of this file.

Functions

void FLASH_Queue_begin (void)
void FLASH_Queue_end (void)
unsigned int FLASH_Queue_stack_depth (void)
FLA_Error FLASH_Queue_enable (void)
FLA_Error FLASH_Queue_disable (void)
FLA_Bool FLASH_Queue_get_enabled (void)
void FLASH_Queue_set_num_threads (unsigned int n_threads)
unsigned int FLASH_Queue_get_num_threads (void)
void FLASH_Queue_init (void)
void FLASH_Queue_finalize (void)
unsigned int FLASH_Queue_get_num_tasks (void)
void FLASH_Queue_set_verbose_output (FLASH_Verbose verbose)
FLASH_Verbose FLASH_Queue_get_verbose_output (void)
void FLASH_Queue_set_sorting (FLA_Bool sorting)
FLA_Bool FLASH_Queue_get_sorting (void)
void FLASH_Queue_set_caching (FLA_Bool caching)
FLA_Bool FLASH_Queue_get_caching (void)
void FLASH_Queue_set_work_stealing (FLA_Bool work_stealing)
FLA_Bool FLASH_Queue_get_work_stealing (void)
void FLASH_Queue_set_data_affinity (FLASH_Data_aff data_affinity)
FLASH_Data_aff FLASH_Queue_get_data_affinity (void)
double FLASH_Queue_get_total_time (void)
double FLASH_Queue_get_parallel_time (void)
void FLASH_Queue_exec (void)
void FLASH_Queue_set_parallel_time (double dtime)
void FLASH_Queue_set_block_size (dim_t size)
dim_t FLASH_Queue_get_block_size (void)
void FLASH_Queue_set_cache_size (dim_t size)
dim_t FLASH_Queue_get_cache_size (void)
void FLASH_Queue_set_cache_line_size (dim_t size)
dim_t FLASH_Queue_get_cache_line_size (void)
void FLASH_Queue_set_cores_per_cache (int cores)
int FLASH_Queue_get_cores_per_cache (void)
void FLASH_Queue_set_cores_per_queue (int cores)
int FLASH_Queue_get_cores_per_queue (void)
void FLASH_Queue_reset (void)
FLASH_TaskFLASH_Queue_get_head_task (void)
FLASH_TaskFLASH_Queue_get_tail_task (void)
void FLASH_Queue_push (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args,...)
void FLASH_Queue_push_input (FLA_Obj obj, FLASH_Task *t)
void FLASH_Queue_push_output (FLA_Obj obj, FLASH_Task *t)
FLASH_TaskFLASH_Task_alloc (void *func, void *cntl, char *name, FLA_Bool enabled_gpu, int n_int_args, int n_fla_args, int n_input_args, int n_output_args)
void FLASH_Task_free (FLASH_Task *t)
void FLASH_Queue_exec_task (FLASH_Task *t)
void FLASH_Queue_verbose_output (void)
void FLASH_Queue_init_tasks (void *arg)
void FLASH_Queue_wait_enqueue (FLASH_Task *t, void *arg)
FLASH_TaskFLASH_Queue_wait_dequeue (int queue, int cache, void *arg)
FLASH_TaskFLASH_Queue_wait_dequeue_block (int queue, int cache, void *arg)
void FLASH_Queue_update_cache (FLASH_Task *t, void *arg)
void FLASH_Queue_update_cache_block (FLA_Obj obj, int cache, FLA_Bool output, void *arg)
void FLASH_Queue_prefetch (int cache, void *arg)
void FLASH_Queue_prefetch_block (FLA_Obj obj)
FLASH_TaskFLASH_Queue_work_stealing (int queue, void *arg)
void FLASH_Queue_create_gpu (int thread, void *arg)
void FLASH_Queue_destroy_gpu (int thread, void *arg)
FLA_Bool FLASH_Queue_exec_gpu (FLASH_Task *t, void *arg)
FLA_Bool FLASH_Queue_check_gpu (FLASH_Task *t, void *arg)
FLA_Bool FLASH_Queue_check_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_update_gpu (FLASH_Task *t, void **input_arg, void **output_arg, void *arg)
void FLASH_Queue_update_block_gpu (FLA_Obj obj, void **buffer_gpu, int thread, void *arg)
void FLASH_Queue_mark_gpu (FLASH_Task *t, void *arg)
void FLASH_Queue_invalidate_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_flush_block_gpu (FLA_Obj obj, int thread, void *arg)
void FLASH_Queue_flush_gpu (int thread, void *arg)
void FLASH_Queue_exec_parallel (void *arg)
void * FLASH_Queue_exec_parallel_function (void *arg)
FLASH_TaskFLASH_Task_update_dependencies (FLASH_Task *t, void *arg)
FLASH_TaskFLASH_Task_update_binding (FLASH_Task *t, FLASH_Task *r, void *arg)
void FLASH_Task_free_parallel (FLASH_Task *t, void *arg)
void FLASH_Queue_exec_simulation (void *arg)

Function Documentation

void FLASH_Queue_begin ( void  )
FLA_Bool FLASH_Queue_check_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_check_gpu().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool r_val = TRUE;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on the GPU.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   if ( k < gpu_n_blocks )
   {
      // Request this block if it is dirty.
      if ( !args->gpu[thread * gpu_n_blocks + k].clean )
      {
         args->gpu[thread * gpu_n_blocks + k].request = TRUE;

         r_val = FALSE;
      }
   }

   // Check the victim block.
   if ( obj.base == args->victim[thread].obj.base )
      r_val = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return r_val;
}
FLA_Bool FLASH_Queue_check_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   int i, j, k;
   int thread        = t->thread;
   int n_input_args  = t->n_input_args;
   int n_output_args = t->n_output_args;
   int n_threads     = FLASH_Queue_get_num_threads();
   FLA_Bool r_val    = TRUE;
   FLA_Bool t_val;
   FLA_Bool duplicate;
   FLA_Obj  obj;

   // Check the input and output arguments on the GPUs.
   for ( i = 0; i < n_input_args + n_output_args; i++ )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      // Find the correct input or output argument.
      if ( i < n_input_args )
      {
         obj = t->input_arg[i];

         for ( j = 0; j < n_output_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }
         
         for ( j = 0; j < i && !duplicate; j++ )
         {
            if ( obj.base == t->input_arg[j].base )
               duplicate = TRUE;
         }
      }
      else
      {
         obj = t->output_arg[i - n_input_args];

         for ( j = 0; j < i - n_input_args && !duplicate; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
               duplicate = TRUE;
         }        
      }

      // If the block has not been processed before.
      if ( !duplicate )
      {     
         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
            
            // Clear each block in macroblock.
            for ( jj = 0; jj < n; jj++ )
            {
               for ( kk = 0; kk < m; kk++ )
               {
                  obj = *( buf + jj * cs + kk );
                  
                  t_val = TRUE;
                  
                  // Check to see if the block is dirty on another GPU.
                  for ( k = 0; k < n_threads && t_val; k++ )
                     if ( k != thread )
                        t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
                  
                  r_val = r_val && t_val;
               }
            }
         }
         else
         {
            t_val = TRUE;
            
            // Check to see if the block is dirty on another GPU.
            for ( k = 0; k < n_threads && t_val; k++ )
               if ( k != thread )
                  t_val = t_val && FLASH_Queue_check_block_gpu( obj, k, arg );
            
            r_val = r_val && t_val;
         }
      }
   }

   return r_val;
}
void FLASH_Queue_create_gpu ( int  thread,
void *  arg 
)

References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), and FLASH_Queue_variables::gpu.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i;
   dim_t gpu_n_blocks     = FLASH_Queue_get_gpu_num_blocks();
   dim_t block_size       = args->block_size;
   FLA_Datatype datatype  = args->datatype;

   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;
   
   // Bind thread to GPU.
   FLASH_Queue_bind_gpu( thread );

   // Allocate the memory on the GPU for all the blocks a priori.
   for ( i = 0; i < gpu_n_blocks; i++ )
      FLASH_Queue_alloc_gpu( block_size, datatype, &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
   
   return;
}
void FLASH_Queue_destroy_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and FLA_Obj_gpu_struct::obj.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu gpu_obj;

   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

   // Examine every block left on the GPU.
   for ( i = 0; i < gpu_n_blocks; i++ )
   {
      gpu_obj = args->gpu[thread * gpu_n_blocks + i];

      // Flush the blocks that are dirty.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
         FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

      // Free the memory on the GPU for all the blocks.
      FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
   }

   return;
}

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

{
#ifdef FLA_ENABLE_SUPERMATRIX
   if ( flash_queue_stack == 0 )
   {
      // Disable if not begin parallel region yet.
      flash_queue_enabled = FALSE;   
      return FLA_SUCCESS;      
   }
   else
   {
      // Cannot change status during parallel region.
      return FLA_FAILURE;
   }
#else
   // Allow disabling enqueuing even when SuperMatrix is not configured.
   flash_queue_enabled = FALSE;   
   return FLA_SUCCESS;
#endif
}

Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().

{
#ifdef FLA_ENABLE_SUPERMATRIX
   if ( flash_queue_stack == 0 )
   {
      // Enable if not begin parallel region yet.
      flash_queue_enabled = TRUE;   
      return FLA_SUCCESS;
   }
   else
   {
      // Cannot change status during parallel region.
      return FLA_FAILURE;
   }
#else
   // Raise an exception when SuperMatrix is not configured.
   FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );
   return FLA_FAILURE;
#endif
}
void FLASH_Queue_end ( void  )
void FLASH_Queue_exec ( void  )

References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.

Referenced by FLASH_Queue_end().

{
   int         n_tasks    = FLASH_Queue_get_num_tasks();
   int         i;
   double      dtime;

   // All the necessary variables for the SuperMatrix mechanism.
   FLASH_Queue_vars args;

   // If the queue is empty, return early.
   if ( n_tasks == 0 )
      return;

   // Turn off all multiple queue implementations.
   FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
   FLASH_Queue_set_work_stealing( FALSE );
   // Do not use cache affinity yet.
   FLASH_Queue_set_caching( FALSE );

   // Allocate memory for task queues.
   args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
   args.n_ready = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.n_wait = ( int* ) FLA_shmalloc( sizeof( int ) );
   args.pc = ( int* ) FLA_shmalloc( sizeof( int ) );

   // Initialize data.
   if ( FLA_is_owner() )
   {
      args.n_wait[0] = 0;
      args.pc[0] = 0;
   }

   Synch_all();

   // Initialize tasks with critical information.
   FLASH_Queue_init_tasks( ( void* ) &args );

   // Display verbose output before free all tasks.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Start timing the parallel execution.
   dtime = RCCE_wtime();

   FLASH_Queue_exec_parallel_function( ( void* ) &args );

   // End timing the parallel execution.
   dtime = RCCE_wtime() - dtime;
   FLASH_Queue_set_parallel_time( dtime );

   // Free all tasks sequentially.
   for ( i = 0; i < n_tasks; i++ )
      FLASH_Task_free( args.task_queue[i] );

   // Free data.
   FLA_free( args.task_queue );
   FLA_shfree( args.n_ready );
   FLA_shfree( args.wait_queue );
   FLA_shfree( args.n_wait );
   FLA_shfree( args.pc );

   // Reset values for next call to FLASH_Queue_exec().
   FLASH_Queue_reset();

   return;
}
FLA_Bool FLASH_Queue_exec_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_parallel_function().

{
   void** input_arg;
   void** output_arg;

   if ( t == NULL )
      return TRUE;

   // If not using the GPU, then execute on CPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
   {
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Check if all the operands are ready and up to date.
   if ( !FLASH_Queue_check_gpu( t, arg ) )
   {
      FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
      int queue = t->queue;
      t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif
      // Reenqueue the task if the blocks are not all flushed.
      FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***    
#endif

      return FALSE;
   }

   // If GPU is enabled, but the task is not supported for GPU execution.
   if ( !t->enabled_gpu )
   {
      int i, j, k;
      int thread        = t->thread;
      int n_input_args  = t->n_input_args;
      int n_output_args = t->n_output_args;
      int n_threads     = FLASH_Queue_get_num_threads();
      FLA_Bool duplicate;
      FLA_Obj  obj;

      // Check the blocks on each GPU.
      for ( k = 0; k < n_threads; k++ )
      {
         // Check the input and output arguments on the GPUs.
         for ( i = 0; i < n_input_args + n_output_args; i++ )
         {
            // Check for duplicate blocks.
            duplicate = FALSE;
            
            // Find the correct input or output argument.
            if ( i < n_input_args )
            {
               obj = t->input_arg[i];
               
               for ( j = 0; j < n_output_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }
               
               for ( j = 0; j < i && !duplicate; j++ )
               {
                  if ( obj.base == t->input_arg[j].base )
                     duplicate = TRUE;
               }
            }
            else
            {
               obj = t->output_arg[i - n_input_args];
               
               for ( j = 0; j < i - n_input_args && !duplicate; j++ )
               {
                  if ( obj.base == t->output_arg[j].base )
                     duplicate = TRUE;
               }        
            }
            
            // If the block has not been processed before.
            if ( !duplicate )
            {     
               // Macroblock is used.
               if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               {
                  dim_t    jj, kk;
                  dim_t    m    = FLA_Obj_length( obj );
                  dim_t    n    = FLA_Obj_width( obj );
                  dim_t    cs   = FLA_Obj_col_stride( obj );
                  FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
                  
                  // Clear each block in macroblock.
                  for ( jj = 0; jj < n; jj++ )
                  {
                     for ( kk = 0; kk < m; kk++ )
                     {
                        obj = *( buf + jj * cs + kk );
                        
                        // Flush the block to main memory if it is on the GPU.
                        if ( k == thread )
                           FLASH_Queue_flush_block_gpu( obj, k, arg );
                        
                        // Invalidate output block on all GPUs.
                        if ( i >= n_input_args )
                           FLASH_Queue_invalidate_block_gpu( obj, k, arg );
                     }
                  }
               }
               else
               {
                  // Flush the block to main memory if it is on the GPU.
                  if ( k == thread )
                     FLASH_Queue_flush_block_gpu( obj, k, arg );
                  
                  // Invalidate output block on all GPUs.
                  if ( i >= n_input_args )
                     FLASH_Queue_invalidate_block_gpu( obj, k, arg );
               }
            }
         }
      }

      // Execute the task on CPU instead of GPU.
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Gather the pointers for the data on the GPU.
   input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
   output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

   // Bring all the blocks to GPU.
   FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );

   // Execute the task on GPU.
   FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );

   // Mark all the output blocks as dirty.
   FLASH_Queue_mark_gpu( t, arg );

   // Free memory.
   FLA_free( input_arg );
   FLA_free( output_arg );

   return TRUE;
}
void FLASH_Queue_exec_parallel ( void *  arg)

References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_get_num_threads().

Referenced by FLASH_Queue_exec().

{
   int   i;
   int   n_threads = FLASH_Queue_get_num_threads();
   void* (*thread_entry_point)( void* );

   // Allocate the thread structures array. Here, an array of FLASH_Thread
   // structures of length n_threads is allocated and the fields of each
   // structure set to appropriate values.
   FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

   // Initialize the thread structures array.
   for ( i = 0; i < n_threads; i++ )
   {
      // Save the thread's identifier.
      thread[i].id = i;

      // Save the pointer to the necessary variables with the thread.
      thread[i].args = arg;

      // The pthread object, if it was even compiled into the FLASH_Thread
      // structure, will be initialized by the pthread implementation when we
      // call pthread_create() and does not need to be touched at this time.
   }

   // Determine which function to send threads to.
   thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // An OpenMP parallel for region spawns n_threads threads. Each thread
   // executes the work function with a different FLASH_Thread argument.
   // An implicit synchronization point exists at the end of the curly
   // brace scope.
   #pragma omp parallel for \
           private( i ) \
           shared( thread, n_threads, thread_entry_point ) \
           schedule( static, 1 ) \
           num_threads( n_threads )
   for ( i = 0; i < n_threads; ++i )
   {
      thread_entry_point( ( void* ) &thread[i] );
   }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // Create each POSIX thread needed in addition to the main thread.
   for ( i = 1; i < n_threads; i++ )
   {
      int pthread_e_val;

      // Create thread i with default attributes.
      pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                      NULL,
                                      thread_entry_point,
                                      ( void* ) &thread[i] );

#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

   // The main thread is assigned the role of thread 0. Here we manually
   // execute it as a worker thread.
   thread_entry_point( ( void* ) &thread[0] );

   // Wait for non-main threads to finish.
   for ( i = 1; i < n_threads; i++ )
   {
      // These two variables are declared local to this for loop since this
      // is the only place they are needed, and since they would show up as
      // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
      // Strangely, the Intel compiler produces code that results in an
      // "unaligned access" runtime message if thread_status is declared as
      // an int. Declaring it as a long or void* appears to force the
      // compiler (not surprisingly) into aligning it to an 8-byte boundary.
      int   pthread_e_val;
      void* thread_status;

      // Wait for thread i to invoke its respective pthread_exit().
      // The return value passed to pthread_exit() is provided to us
      // via status, if one was given.
      pthread_e_val = pthread_join( thread[i].pthread_obj,
                                    ( void** ) &thread_status );
      
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }
   
#endif

   FLA_free( thread );

   return;
}
void * FLASH_Queue_exec_parallel_function ( void *  arg)

References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int           i         = RCCE_ue();
   int           queue     = 0;
   int           cache     = 0;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool      condition;
   FLA_Bool      available;
   FLASH_Task*   t = NULL;
   
   // Do not let extraneous cores execute.
   if ( i < n_threads )
      condition = TRUE;
   else
      condition = FALSE;

   // Loop until all the tasks have committed.
   while ( condition )
   {
      RCCE_acquire_lock( 0 );
      
      // Obtain task to execute.
      t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
      
      RCCE_release_lock( 0 );

      // Dequeued a task from the waiting queue.
      available = ( t != NULL );

      if ( available )
      {
         // Save the thread and cache that executes the task.
         t->thread = i;
         t->cache = cache;

         // Execute the task.
         FLASH_Queue_exec_task( t );
         
         // Update task dependencies.
         FLASH_Task_update_dependencies( t, ( void* ) args );
      }

      RCCE_acquire_lock( 0 );

      // Terminate loop.
      if ( args->pc[0] >= n_tasks )
         condition = FALSE;

      RCCE_release_lock( 0 );
   }

   return ( void* ) NULL;
}
void FLASH_Queue_exec_simulation ( void *  arg)

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, FLASH_Dep_s::task, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int           i, j;
   int           queue;
   int           cache;
   int           n_stages  = 0;
   int           n_queues  = args->n_queues;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   task;
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // An array to hold tasks to be executed during of simulation.
#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
   FLASH_Task* exec_array[n_threads];
#endif

   for ( i = 0; i < n_threads; i++ )
   {
      // Initialize all exec_array to NULL.
      exec_array[i] = NULL;

      // Prefetch blocks into the cache before execution.
      if ( i % n_cores == 0 )
         FLASH_Queue_prefetch( i, arg );
   }

   // Loop until all the tasks have committed.
   while ( args->pc < n_tasks )
   {
      for ( i = 0; i < n_threads; i++ )
      {
         // Update waiting queue with ready tasks.
         t = exec_array[i];
         
         if ( t != NULL )
         {
            // Check each dependent task.
            d = t->dep_arg_head;
            
            for ( j = 0; j < t->n_dep_args; j++ )
            {
               task = d->task;              
               task->n_ready--;
               
               // Place newly ready tasks on waiting queue.
               if ( task->n_ready == 0 )
               {
                  FLASH_Queue_wait_enqueue( task, arg );
               }
               
               // Go to the next dep.
               d = d->next_dep;
            }

            // Free the task.
            FLASH_Task_free( t );
         }
      }      
      
      n_stages++;
      if ( !verbose )
         printf( "%7d", n_stages );
      
      // Move ready tasks from the waiting queue to execution queue.
      for ( i = 0; i < n_threads; i++ )
      {
         // Determine to which queue this thread belongs.
         queue = i / ( n_threads / n_queues );

         // Determine to which cache this thread belongs.
         cache = i / n_cores;

         // Dequeue a task.
     t = FLASH_Queue_wait_dequeue( queue, cache, arg );

         // Save the task for execution.
         exec_array[i] = t;
         
         if ( t != NULL )
         {
            // Save the thread and cache that executes the task.
            t->thread = i;
            t->cache = cache;

            // Increment program counter.
            args->pc++;
         }
      }

      // Execute independent tasks.
      for ( i = 0; i < n_threads; i++ )
      {
         t = exec_array[i];
         FLASH_Queue_update_cache( t, arg );
         FLASH_Queue_exec_task( t );
         
         if ( !verbose )
            printf( "%7s", ( t == NULL ? "     " : t->name ) );

         // Free the task if this is the last stage.
         if ( args->pc == n_tasks && t != NULL )
            FLASH_Task_free( t );
      }
      
      if ( !verbose ) 
         printf( "\n" );
   }

   if ( !verbose )
      printf( "\n" );

#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLA_free( exec_array );
#endif

   return;
}

References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   // Define local function pointer types.

   // LAPACK-level
   typedef FLA_Error(*flash_lu_piv_macro_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
   typedef FLA_Error(*flash_apply_pivots_macro_p)(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl);
   typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl);
   typedef FLA_Error(*flash_lu_piv_copy_p)(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl);
   typedef FLA_Error(*flash_trsm_piv_p)(FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl);
   typedef FLA_Error(*flash_sa_lu_p)(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl);
   typedef FLA_Error(*flash_sa_fs_p)(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl);
   typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl);
   typedef FLA_Error(*flash_trinv_p)(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl);
   typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl);
   typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl);
   typedef FLA_Error(*flash_sylv_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl);
   typedef FLA_Error(*flash_lyap_p)(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl);
   typedef FLA_Error(*flash_qrut_macro_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qrutc_p)(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl);
   typedef FLA_Error(*flash_qr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl);
   typedef FLA_Error(*flash_lqut_macro_p)(FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl);
   typedef FLA_Error(*flash_caqr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl);
   typedef FLA_Error(*flash_uddateut_p)(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl);
   typedef FLA_Error(*flash_apqut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl);
   typedef FLA_Error(*flash_apq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl);
   typedef FLA_Error(*flash_apcaq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl);
   typedef FLA_Error(*flash_apqudut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl);
   typedef FLA_Error(*flash_eig_gest_p)(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl);

   // Level-3 BLAS
   typedef FLA_Error(*flash_gemm_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl);
   typedef FLA_Error(*flash_hemm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl);
   typedef FLA_Error(*flash_herk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl);
   typedef FLA_Error(*flash_her2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl);
   typedef FLA_Error(*flash_symm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl);
   typedef FLA_Error(*flash_syrk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl);
   typedef FLA_Error(*flash_syr2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl);
   typedef FLA_Error(*flash_trmm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl);
   typedef FLA_Error(*flash_trsm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl);

   // Level-2 BLAS
   typedef FLA_Error(*flash_gemv_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl);
   typedef FLA_Error(*flash_trsv_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl);

   // Level-1 BLAS
   typedef FLA_Error(*flash_axpy_p)(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl);
   typedef FLA_Error(*flash_axpyt_p)(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl);
   typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl);
   typedef FLA_Error(*flash_copyt_p)(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl);
   typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl);
   typedef FLA_Error(*flash_scal_p)(FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl);
   typedef FLA_Error(*flash_scalr_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl);

   // Base
   typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl);
   typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl);

   // Only execute task if it is not NULL.
   if ( t == NULL )
      return;

   // Now "switch" between the various possible task functions.

   // FLA_LU_piv_macro
   if ( t->func == (void *) FLA_LU_piv_macro_task )
   {
      flash_lu_piv_macro_p func;
      func = (flash_lu_piv_macro_p) t->func;
      
      func(               t->output_arg[0],
                          t->output_arg[1],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Apply_pivots_macro
   else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
   {
      flash_apply_pivots_macro_p func;
      func = (flash_apply_pivots_macro_p) t->func;
      
      func( ( FLA_Side  )    t->int_arg[0],
            ( FLA_Trans )    t->int_arg[1],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_appiv_t* ) t->cntl );
   }
   // FLA_LU_piv
   else if ( t->func == (void *) FLA_LU_piv_task )
   {
      flash_lu_piv_p func;
      func = (flash_lu_piv_p) t->func;
      
      func(               t->output_arg[0],
                          t->fla_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_LU_piv_copy
   else if ( t->func == (void *) FLA_LU_piv_copy_task )
   {
      flash_lu_piv_copy_p func;
      func = (flash_lu_piv_copy_p) t->func;

      func(               t->output_arg[0],
                          t->fla_arg[0],
                          t->output_arg[1],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Trsm_piv
   else if ( t->func == (void *) FLA_Trsm_piv_task )
   {
      flash_trsm_piv_p func;
      func = (flash_trsm_piv_p) t->func;

      func(                 t->input_arg[0],
                            t->output_arg[0],
                            t->fla_arg[0],
            ( fla_trsm_t* ) t->cntl );
   }
   // FLA_SA_LU
   else if ( t->func == (void *) FLA_SA_LU_task )
   {
      flash_sa_lu_p func;
      func = (flash_sa_lu_p) t->func;
      
      func(               t->output_arg[1],
                          t->output_arg[0],
                          t->fla_arg[0],
                          t->fla_arg[1],
                          t->int_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_SA_FS
   else if ( t->func == (void *) FLA_SA_FS_task )
   {
      flash_sa_fs_p func;
      func = (flash_sa_fs_p) t->func;
         
      func(                 t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],                          
                            t->output_arg[1],
                            t->output_arg[0],
                            t->int_arg[0],
            ( fla_gemm_t* ) t->cntl );
   }
   // FLA_LU_nopiv
   else if ( t->func == (void *) FLA_LU_nopiv_task )
   {
      flash_lu_nopiv_p func;
      func = (flash_lu_nopiv_p) t->func;

      func(               t->output_arg[0],
            ( fla_lu_t* ) t->cntl );
   }
   // FLA_Trinv
   else if ( t->func == (void *) FLA_Trinv_task )
   {
      flash_trinv_p func;
      func = (flash_trinv_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Diag     ) t->int_arg[1],
                             t->output_arg[0],
            ( fla_trinv_t* ) t->cntl );
   }
   // FLA_Ttmm
   else if ( t->func == (void *) FLA_Ttmm_task )
   {   
      flash_ttmm_p func;
      func = (flash_ttmm_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
                            t->output_arg[0],
            ( fla_ttmm_t* ) t->cntl );
   }
   // FLA_Chol
   else if ( t->func == (void *) FLA_Chol_task )
   {   
      flash_chol_p func;
      func = (flash_chol_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
                            t->output_arg[0],
            ( fla_chol_t* ) t->cntl );
   }
   // FLA_Sylv
   else if ( t->func == (void *) FLA_Sylv_task )
   {   
      flash_sylv_p func;
      func = (flash_sylv_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->output_arg[0],
                            t->fla_arg[1],
            ( fla_sylv_t* ) t->cntl );
   }
   // FLA_Lyap
   else if ( t->func == (void *) FLA_Lyap_task )
   {   
      flash_lyap_p func;
      func = (flash_lyap_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
                            t->fla_arg[1],
            ( fla_lyap_t* ) t->cntl );
   }
   // FLA_QR_UT_macro
   else if ( t->func == (void *) FLA_QR_UT_macro_task )
   {   
      flash_qrut_macro_p func;
      func = (flash_qrut_macro_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR_UT
   else if ( t->func == (void *) FLA_QR_UT_task )
   {   
      flash_qrut_p func;
      func = (flash_qrut_p) t->func;
      
      func(                 t->output_arg[0],
                            t->fla_arg[0],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR_UT_copy
   else if ( t->func == (void *) FLA_QR_UT_copy_task )
   {   
      flash_qrutc_p func;
      func = (flash_qrutc_p) t->func;

      func(                 t->output_arg[0],
                            t->fla_arg[0],
                            t->output_arg[1],
            ( fla_qrut_t* ) t->cntl );
   }
   // FLA_QR2_UT
   else if ( t->func == (void *) FLA_QR2_UT_task )
   {   
      flash_qr2ut_p func;
      func = (flash_qr2ut_p) t->func;
         
      func(                 t->output_arg[1],
                            t->output_arg[0],
                            t->fla_arg[0],
           ( fla_qr2ut_t* ) t->cntl );
   }
   // FLA_LQ_UT_macro
   else if ( t->func == (void *) FLA_LQ_UT_macro_task )
   {   
      flash_lqut_macro_p func;
      func = (flash_lqut_macro_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
            ( fla_lqut_t* ) t->cntl );
   }
   // FLA_CAQR2_UT
   else if ( t->func == (void *) FLA_CAQR2_UT_task )
   {   
      flash_caqr2ut_p func;
      func = (flash_caqr2ut_p) t->func;
         
      func(                 t->output_arg[1],
                            t->output_arg[0],
                            t->fla_arg[0],
         ( fla_caqr2ut_t* ) t->cntl );
   }
   // FLA_UDdate_UT
   else if ( t->func == (void *) FLA_UDdate_UT_task )
   {   
      flash_uddateut_p func;
      func = (flash_uddateut_p) t->func;
      
      func(                 t->output_arg[0],
                            t->output_arg[1],
                            t->output_arg[2],
                            t->output_arg[3],
        ( fla_uddateut_t* ) t->cntl );
   }
   // FLA_Apply_Q_UT
   else if ( t->func == (void *) FLA_Apply_Q_UT_task )
   {   
      flash_apqut_p func;
      func = (flash_apqut_p) t->func;
      
      func( ( FLA_Side     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
            ( FLA_Direct   ) t->int_arg[2],
            ( FLA_Store    ) t->int_arg[3],
                             t->input_arg[0],
                             t->fla_arg[0],
                             t->output_arg[1],
                             t->output_arg[0],
            ( fla_apqut_t* ) t->cntl );
   }
   // FLA_Apply_Q2_UT
   else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
   {   
      flash_apq2ut_p func;
      func = (flash_apq2ut_p) t->func;
      
      func( ( FLA_Side      ) t->int_arg[0],
            ( FLA_Trans     ) t->int_arg[1],
            ( FLA_Direct    ) t->int_arg[2],
            ( FLA_Store     ) t->int_arg[3],
                              t->input_arg[0],
                              t->fla_arg[0],
                              t->output_arg[2],
                              t->output_arg[1],
                              t->output_arg[0],
            ( fla_apq2ut_t* ) t->cntl );
   }
   // FLA_Apply_CAQ2_UT
   else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
   {   
      flash_apcaq2ut_p func;
      func = (flash_apcaq2ut_p) t->func;
      
      func( ( FLA_Side      ) t->int_arg[0],
            ( FLA_Trans     ) t->int_arg[1],
            ( FLA_Direct    ) t->int_arg[2],
            ( FLA_Store     ) t->int_arg[3],
                              t->input_arg[0],
                              t->fla_arg[0],
                              t->output_arg[2],
                              t->output_arg[1],
                              t->output_arg[0],
          ( fla_apcaq2ut_t* ) t->cntl );
   }
   // FLA_Apply_QUD_UT
   else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
   {   
      flash_apqudut_p func;
      func = (flash_apqudut_p) t->func;
      
      func( ( FLA_Side       ) t->int_arg[0],
            ( FLA_Trans      ) t->int_arg[1],
            ( FLA_Direct     ) t->int_arg[2],
            ( FLA_Store      ) t->int_arg[3],
                               t->input_arg[0],
                               t->output_arg[0],
                               t->output_arg[1],
                               t->input_arg[1],
                               t->output_arg[2],
                               t->input_arg[2],
                               t->output_arg[3],
            ( fla_apqudut_t* ) t->cntl );
   }
   // FLA_Eig_gest
   else if ( t->func == (void *) FLA_Eig_gest_task )
   {   
      flash_eig_gest_p func;
      func = (flash_eig_gest_p) t->func;
      
      func( ( FLA_Inv         ) t->int_arg[0],
            ( FLA_Uplo        ) t->int_arg[1],
                                t->output_arg[1],
                                t->output_arg[0],
                                t->input_arg[0],
            ( fla_eig_gest_t* ) t->cntl );
   }
   // FLA_Gemm
   else if ( t->func == (void *) FLA_Gemm_task )
   {
      flash_gemm_p func;
      func = (flash_gemm_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_gemm_t* ) t->cntl );
   }
   // FLA_Hemm
   else if ( t->func == (void *) FLA_Hemm_task )
   {
      flash_hemm_p func;
      func = (flash_hemm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_hemm_t* ) t->cntl );
   }
   // FLA_Herk
   else if ( t->func == (void *) FLA_Herk_task )
   {
      flash_herk_p func;
      func = (flash_herk_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_herk_t* ) t->cntl );
   }
   // FLA_Her2k
   else if ( t->func == (void *) FLA_Her2k_task )
   {
      flash_her2k_p func;
      func = (flash_her2k_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
            ( fla_her2k_t* ) t->cntl );
   }
   // FLA_Symm
   else if ( t->func == (void *) FLA_Symm_task )
   {
      flash_symm_p func;
      func = (flash_symm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_symm_t* ) t->cntl );
   }
   // FLA_Syrk
   else if ( t->func == (void *) FLA_Syrk_task )
   {
      flash_syrk_p func;
      func = (flash_syrk_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_syrk_t* ) t->cntl );
   }
   // FLA_Syr2k
   else if ( t->func == (void *) FLA_Syr2k_task )
   {
      flash_syr2k_p func;
      func = (flash_syr2k_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
            ( FLA_Trans    ) t->int_arg[1],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->input_arg[1],
                             t->fla_arg[1],
                             t->output_arg[0],
            ( fla_syr2k_t* ) t->cntl );
   }
   // FLA_Trmm
   else if ( t->func == (void *) FLA_Trmm_task )
   {
      flash_trmm_p func;
      func = (flash_trmm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trmm_t* ) t->cntl );
   }
   // FLA_Trsm
   else if ( t->func == (void *) FLA_Trsm_task )
   {
      flash_trsm_p func;
      func = (flash_trsm_p) t->func;
      
      func( ( FLA_Side    ) t->int_arg[0],
            ( FLA_Uplo    ) t->int_arg[1],
            ( FLA_Trans   ) t->int_arg[2],
            ( FLA_Diag    ) t->int_arg[3],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trsm_t* ) t->cntl );
   }
   // FLA_Gemv
   else if ( t->func == (void *) FLA_Gemv_task )
   {
      flash_gemv_p func;
      func = (flash_gemv_p) t->func;
      
      func( ( FLA_Trans   ) t->int_arg[0],
                            t->fla_arg[0],
                            t->input_arg[0],
                            t->input_arg[1],
                            t->fla_arg[1],
                            t->output_arg[0],
            ( fla_gemv_t* ) t->cntl );
   }
   // FLA_Trsv
   else if ( t->func == (void *) FLA_Trsv_task )
   {
      flash_trsv_p func;
      func = (flash_trsv_p) t->func;
      
      func( ( FLA_Uplo    ) t->int_arg[0],
            ( FLA_Trans   ) t->int_arg[1],
            ( FLA_Diag    ) t->int_arg[2],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_trsv_t* ) t->cntl );
   }
   // FLA_Axpy
   else if ( t->func == (void *) FLA_Axpy_task )
   {
      flash_axpy_p func;
      func = (flash_axpy_p) t->func;
         
      func(                 t->fla_arg[0],
                            t->input_arg[0],
                            t->output_arg[0],
            ( fla_axpy_t* ) t->cntl );
   }
   // FLA_Axpyt
   else if ( t->func == (void *) FLA_Axpyt_task )
   {
      flash_axpyt_p func;
      func = (flash_axpyt_p) t->func;
      
      func( ( FLA_Trans    ) t->int_arg[0],
                             t->fla_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_axpyt_t* ) t->cntl );
   }
   // FLA_Copy
   else if ( t->func == (void *) FLA_Copy_task )
   {
      flash_copy_p func;
      func = (flash_copy_p) t->func;
         
      func(                 t->input_arg[0],
                            t->output_arg[0],
            ( fla_copy_t* ) t->cntl );
   }
   // FLA_Copyt
   else if ( t->func == (void *) FLA_Copyt_task )
   {
      flash_copyt_p func;
      func = (flash_copyt_p) t->func;
      
      func( ( FLA_Trans    ) t->int_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_copyt_t* ) t->cntl );
   }
   // FLA_Copyr
   else if ( t->func == (void *) FLA_Copyr_task )
   {
      flash_copyr_p func;
      func = (flash_copyr_p) t->func;
      
      func( ( FLA_Uplo     ) t->int_arg[0],
                             t->input_arg[0],
                             t->output_arg[0],
            ( fla_copyr_t* ) t->cntl );
   }
   // FLA_Scal
   else if ( t->func == (void *) FLA_Scal_task )
   {
      flash_scal_p func;
      func = (flash_scal_p) t->func;

      func(                 t->fla_arg[0],
                            t->output_arg[0],
            ( fla_scal_t* ) t->cntl );
   }
   // FLA_Scalr
   else if ( t->func == (void *) FLA_Scalr_task )
   {
      flash_scalr_p func;
      func = (flash_scalr_p) t->func;

      func( ( FLA_Uplo     ) t->int_arg[0],
                             t->fla_arg[0],
                             t->output_arg[0],
            ( fla_scalr_t* ) t->cntl );
   }
   // FLA_Obj_create_buffer
   else if ( t->func == (void *) FLA_Obj_create_buffer_task )
   {
      flash_obj_create_buffer_p func;
      func = (flash_obj_create_buffer_p) t->func;

      func( ( dim_t       ) t->int_arg[0],
            ( dim_t       ) t->int_arg[1],
                            t->output_arg[0],
                            t->cntl );
   }
   // FLA_Obj_free_buffer
   else if ( t->func == (void *) FLA_Obj_free_buffer_task )
   {
      flash_obj_free_buffer_p func;
      func = (flash_obj_free_buffer_p) t->func;

      func(                 t->output_arg[0],
                            t->cntl );
   }
   else
   {
      FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
   }
   
   return;
}
void FLASH_Queue_finalize ( void  )

References FLASH_Queue_finalize_gpu().

Referenced by FLA_Finalize().

{
   // Exit early if we're not already initialized.
   if ( flash_queue_initialized == FALSE )
      return;

   // Clear the initialized flag.
   flash_queue_initialized = FALSE;

#ifdef FLA_ENABLE_GPU
   FLASH_Queue_finalize_gpu();
#endif

   return;
}
void FLASH_Queue_flush_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on the GPU.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;
   
   // The block is owned by the GPU.
   if ( k < gpu_n_blocks )
   {
      // Save the block that will be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];
      
      // If the block is dirty, then flush it.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean )
         transfer = TRUE;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.
   if ( !transfer )
      return;

   // Flush the block outside the critical section.
   FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );   

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
   
   // Locate the position of the block on the GPU.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;
   
   if ( k < gpu_n_blocks )
   {
      // Update the bits for the flushed block.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }
   
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
void FLASH_Queue_flush_gpu ( int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   int n_transfer = 0;
   FLA_Obj_gpu gpu_obj;
   
   // Exit if not using GPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
      return;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   for ( k = 0; k < gpu_n_blocks; k++ )
   {
      // Save the block that might be flushed.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];
      
      // Flush the block if it is dirty and requested.
      if ( gpu_obj.obj.base != NULL && !gpu_obj.clean && gpu_obj.request )
      {
         // Save the block for data transfer outside the critical section.
         args->gpu_log[thread * gpu_n_blocks + n_transfer] = gpu_obj;
         n_transfer++;
      }
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Exit early if a flush is not required.   
   if ( n_transfer == 0 )
      return;
   
   // Flush the block outside the critical section.
   for ( i = 0; i < n_transfer; i++ )
   {
      gpu_obj = args->gpu_log[thread * gpu_n_blocks + i];
      FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Update the bits for each block that is flushed.
   for ( i = 0; i < n_transfer; i++ )
   {
      // Locate the position of the block on the GPU.
      for ( k = 0; k < gpu_n_blocks; k++ )
         if ( args->gpu_log[thread * gpu_n_blocks + i].obj.base == 
              args->gpu[thread * gpu_n_blocks + k].obj.base )
            break;
      
      if ( k < gpu_n_blocks )
      {
         // The block is now clean.
         args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
         args->gpu[thread * gpu_n_blocks + k].request = FALSE;
      }
   }
   
#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}

Referenced by FLASH_Queue_exec().

{
   return flash_queue_block_size;
}

Referenced by FLASH_Queue_prefetch_block().

{
   return flash_queue_cache_line_size;
}

Referenced by FLASH_Queue_exec().

{
   return flash_queue_cache_size;
}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   return flash_queue_cores_per_cache;
}

Referenced by FLASH_Queue_exec().

{
   return flash_queue_cores_per_queue;
}

Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().

{ 
   return flash_queue_data_affinity;
}
unsigned int FLASH_Queue_get_num_tasks ( void  )
unsigned int FLASH_Queue_get_num_threads ( void  )
double FLASH_Queue_get_parallel_time ( void  )
{
   // Only return time if out of parallel region.
   if ( flash_queue_stack == 0 )
      return flash_queue_parallel_time;

   return 0.0;
}

Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().

{ 
   return flash_queue_sorting;
}

References FLASH_Queue_s::tail.

Referenced by FLASH_Queue_init_tasks().

{
   return _tq.tail;
}
double FLASH_Queue_get_total_time ( void  )
{
   // Only return time if out of parallel region.
   if ( flash_queue_stack == 0 )
      return flash_queue_total_time;

   return 0.0;
}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().

{ 
   return flash_queue_verbose;
}

Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().

{
   return flash_queue_work_stealing;
}
void FLASH_Queue_init ( void  )

References FLASH_Queue_init_gpu(), and FLASH_Queue_reset().

Referenced by FLA_Init().

{
   // Exit early if we're already initialized.
   if ( flash_queue_initialized == TRUE )
      return;
   
   // Reset all the initial values.
   FLASH_Queue_reset();

   // Set the initialized flag.
   flash_queue_initialized = TRUE;

#ifdef FLA_ENABLE_GPU
   FLASH_Queue_init_gpu();
#endif

   return;
}
void FLASH_Queue_init_tasks ( void *  arg)

References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.

Referenced by FLASH_Queue_exec().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int            i, j;
   int            n_tasks = FLASH_Queue_get_num_tasks();
   int            n_ready = 0;
   int            height;
   FLASH_Task*    t;
   FLASH_Dep*     d;

   // Grab the tail of the task queue.
   t = FLASH_Queue_get_tail_task();

   for ( i = n_tasks - 1; i >= 0; i-- )
   {
      // Save all the task pointers.
      args->task_queue[i] = t;

      // Only use a single queue implementation.
      t->queue = 0;

      // Determine the height of each task in the DAG.
      height = 0;
      d = t->dep_arg_head;

      // Take the maximum height of dependent tasks.
      for ( j = 0; j < t->n_dep_args; j++ )
      {
         height = max( height, d->task->height );
         d = d->next_dep;
      }

      t->height = height + 1;

      // Since freeing a task is always a leaf, we want to force it to execute 
      // earlier by giving it a greater height in order to reclaim memory.
      if ( t->func == (void *) FLA_Obj_free_buffer_task )
         t->height += n_tasks;

      // Find all ready tasks.
      t->n_ready += t->n_input_args + t->n_output_args + 
                    t->n_macro_args + t->n_war_args;

      if ( t->n_ready == 0 )
      {
         // Save the number of ready and available tasks.
         n_ready++;
      }

      if ( FLA_is_owner() )
      {
         // Record all the ready values.
         args->n_ready[i] = t->n_ready;
      }

      // Go to the previous task.
      t = t->prev_task;
   }

   // Only allow the first core to enqueue the initial ready tasks.
   if ( !FLA_is_owner() )
      return;
   
   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();
   
   for ( i = 0; i < n_tasks && n_ready > 0; i++ )
   {
      if ( t->n_ready == 0 )
      {
         RCCE_acquire_lock( 0 );

         // Enqueue all the ready and available tasks.
         FLASH_Queue_wait_enqueue( t, arg );

         RCCE_release_lock( 0 );

         // Decrement the number of ready tasks left to be enqueued.
         n_ready--;
      }
      
      // Go to the next task.
      t = t->next_task;
   }

   return;
}
void FLASH_Queue_invalidate_block_gpu ( FLA_Obj  obj,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.

Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int j, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Locate the position of the block on the GPU.
   for ( k = 0; k < gpu_n_blocks; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;
   
   // The block is owned by other GPU.
   if ( k < gpu_n_blocks )
   {
      // Invalidate the block.
      args->gpu[thread * gpu_n_blocks + k].obj.base = NULL;

      args->gpu[thread * gpu_n_blocks + k].clean    = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request  = FALSE;

      // Save the block that will be invalidated.
      gpu_obj = args->gpu[thread * gpu_n_blocks + k];
      
      // Shift all the blocks for the invalidated block.
      for ( j = k; j < gpu_n_blocks - 1; j++ )
         args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j + 1];
      
      // Move to the LRU block.
      args->gpu[thread * gpu_n_blocks + gpu_n_blocks - 1] = gpu_obj;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return;
}
void FLASH_Queue_mark_gpu ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int thread = t->thread;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool duplicate;
   FLA_Obj  obj;

   // Mark all the output blocks on the GPU as dirty.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      obj = t->output_arg[i];

      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( obj.base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
#ifdef FLA_ENABLE_MULTITHREADING
         FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

         // Locate the position of the block on the GPU.
         for ( k = 0; k < gpu_n_blocks; k++ )
            if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
               break;
         
         if ( k < gpu_n_blocks )
         {
            // Change the bits for the new dirty block.
            args->gpu[thread * gpu_n_blocks + k].clean   = FALSE;
            args->gpu[thread * gpu_n_blocks + k].request = FALSE;
         }

#ifdef FLA_ENABLE_MULTITHREADING
         FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
      }
   }

   return;
}
void FLASH_Queue_prefetch ( int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;   
   int i;
   int size = args->size;
   FLA_Obj obj;

   // Prefetch blocks in opposite order to maintain LRU.
   for ( i = size - 1; i >= 0; i-- )
   {
      obj = args->prefetch[i];

      // Only prefetch if it is a valid block.
      if ( obj.base != NULL )
      {
         // Prefetch the block.
         FLASH_Queue_prefetch_block( obj );
         
         // Record the prefetched block in the cache.
         args->cache[cache * size + i] = obj;
      }
   }

   return;
}

References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), scomplex::real, and dcomplex::real.

Referenced by FLASH_Queue_prefetch().

{
   int          i, inc;
   int          line_size = FLASH_Queue_get_cache_line_size();
   int          elem_size = FLA_Obj_elem_size( obj );
   int          length    = FLA_Obj_length( obj );
   int          width     = FLA_Obj_width( obj );
   FLA_Datatype datatype  = FLA_Obj_datatype( obj );

   // Determine stride to prefetch block into cache.
   inc = line_size / elem_size;

   // Switch between the four different datatypes.
   switch ( datatype )
   {
      case FLA_FLOAT:
      {
         float *buffer = ( float * ) FLA_FLOAT_PTR( obj );
         float access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         // Prevent dead code elimination.
         access += 1.0;

         break;
      }
      case FLA_DOUBLE:
      {
         double *buffer = ( double * ) FLA_DOUBLE_PTR( obj );
         double access;
         
         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         // Prevent dead code elimination.
         access += 1.0;

         break;
      }
      case FLA_COMPLEX:
      {
         scomplex *buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
         scomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         // Prevent dead code elimination.
         access.real += 1.0;

         break;
      }
      case FLA_DOUBLE_COMPLEX:
      {
         dcomplex *buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
         dcomplex access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         // Prevent dead code elimination.
         access.real += 1.0;

         break;
      }
      case FLA_INT:
      {
         int *buffer = ( int * ) FLA_INT_PTR( obj );
         int access;

         // Access each cache line of the block.
         for ( i = 0; i < length * width; i += inc )
            access = buffer[i];

         // Prevent dead code elimination.
         access += 1.0;

         break;
      }
      default:
         // This default case should never execute.
         FLA_Check_error_code( FLA_INVALID_DATATYPE );
   }

   return;
}
void FLASH_Queue_push ( void *  func,
void *  cntl,
char *  name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args,
  ... 
)

References FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Queue_push_output(), FLASH_Task_alloc(), FLASH_Queue_s::head, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::queue, FLASH_Queue_s::tail, and FLA_Obj_struct::write_task.

{
   int         i;
   va_list     var_arg_list;
   FLASH_Task* t;
   FLA_Obj     obj;

   // Allocate a new FLA_Task and populate its fields with appropriate values.
   t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
                         n_int_args, n_fla_args,
                         n_input_args, n_output_args );
   
   // Initialize variable argument environment. In case you're wondering, the
   // second argument in this macro invocation of va_start() is supposed to be
   // the parameter that immediately preceeds the variable argument list
   // (ie: the ... above ).
   va_start( var_arg_list, n_output_args );

   // Extract the integer arguments.
   for ( i = 0; i < n_int_args; i++ )
      t->int_arg[i] = va_arg( var_arg_list, int );
   
   // Extract the FLA_Obj arguments.
   for ( i = 0; i < n_fla_args; i++ )
      t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );

   // Extract the input FLA_Obj arguments.
   for ( i = 0; i < n_input_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->input_arg[i] = obj;

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Dependence analysis for each input block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );

         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_input_arg.
         t->n_macro_args += m * n - 1;
      }      
      else // Regular block.
      {
         // Dependence analysis for input operand.
         FLASH_Queue_push_input( obj, t );
      }
   }

   // Extract the output FLA_Obj arguments.
   for ( i = 0; i < n_output_args; i++ )
   {
      obj = va_arg( var_arg_list, FLA_Obj );
      t->output_arg[i] = obj;

      // Only assign data affinity to the first output block.
      if ( i == 0 )
      {
         FLA_Obj buf = obj;

         // Use the top left block of the macroblock.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
            buf = *FLASH_OBJ_PTR_AT( obj );
         
         if ( buf.base->write_task == NULL )
            t->queue = flash_queue_n_write_blocks;
         else 
            t->queue = buf.base->write_task->queue;
      }

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Dependence analysis for each output block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );

         // Set the number of blocks in the macroblock subtracted by one
         // since we do not want to recount an operand for each n_output_arg.
         t->n_macro_args += m * n - 1;
      }      
      else // Regular block.
      {
         // Dependence analysis for output operand.
         FLASH_Queue_push_output( obj, t );
      }
   }      

   // Finalize the variable argument environment.
   va_end( var_arg_list );
  
   // Add the task to the tail of the queue (and the head if queue is empty).
   if ( _tq.n_tasks == 0 )
   {
      _tq.head = t;
      _tq.tail = t;
   }
   else
   {
      t->prev_task = _tq.tail;
      _tq.tail->next_task = t;
      _tq.tail            = t;

      // Determine the index of the task in the task queue.
      t->order = t->prev_task->order + 1;
   }
   
   // Increment the number of tasks.
   _tq.n_tasks++;

   return;
}
void FLASH_Queue_push_input ( FLA_Obj  obj,
FLASH_Task t 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_push().

{
   FLASH_Task* task;
   FLASH_Dep*  d;

   // Find dependence information.
   if ( obj.base->write_task == NULL )
   {
      t->n_ready--;
      
      // Add to number of blocks read if not written and not read before.
      if ( obj.base->n_read_tasks == 0 )
      {
         // Identify each read block with an id for freeing.
         obj.base->n_read_blocks = flash_queue_n_read_blocks;
         
         flash_queue_n_read_blocks++;            
      }
   }
   else
   { // Flow dependence.
      task = obj.base->write_task;
      
      d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
      
      d->task     = t;
      d->next_dep = NULL;
      
      if ( task->n_dep_args == 0 )
      {
         task->dep_arg_head = d;
         task->dep_arg_tail = d;
      }
      else
      {
         task->dep_arg_tail->next_dep = d;
         task->dep_arg_tail           = d;
      }
      
      task->n_dep_args++;
   }
   
   // Add task to the read task in the object if not already there.
   if ( obj.base->n_read_tasks == 0 ||
        obj.base->read_task_tail->task != t )
   { // Anti-dependence potentially.
      d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
      
      d->task     = t;
      d->next_dep = NULL;
      
      if ( obj.base->n_read_tasks == 0 )
      {
         obj.base->read_task_head = d;
         obj.base->read_task_tail = d;
      }
      else
      {
         obj.base->read_task_tail->next_dep = d;
         obj.base->read_task_tail           = d;
      }
      
      obj.base->n_read_tasks++;
   }      
   
   return;
}
void FLASH_Queue_push_output ( FLA_Obj  obj,
FLASH_Task t 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_free(), FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_push().

{
   int         i;
   FLASH_Task* task;
   FLASH_Dep*  d;
   FLASH_Dep*  next_dep;

   // Assign tasks to threads with data affinity.
   if ( obj.base->write_task == NULL )
   {
      t->n_ready--;
      
      // Save index in which this output block is first encountered.
      obj.base->n_write_blocks = flash_queue_n_write_blocks;
      
      // Number of blocks written if not written before.
      flash_queue_n_write_blocks++;
      
      // Add to number of blocks read if not written or read before.
      if ( obj.base->n_read_tasks == 0 )
      {
         // Identify each read block with an id for freeing.
         obj.base->n_read_blocks = flash_queue_n_read_blocks;
         
         flash_queue_n_read_blocks++;
      }
   }
   else
   { // Flow dependence potentially.
      // The last task to overwrite this block is not itself.
      if ( obj.base->write_task != t )
      {
         // Create dependency from task that last wrote the block.
         task = obj.base->write_task;
         
         d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
         
         d->task     = t;
         d->next_dep = NULL;
         
         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }
         
         task->n_dep_args++;
      }
      else
      {
         // No need to notify task twice for output block already seen.
         t->n_ready--;
      }
   }
   
   // Clear read task for next set of reads and record the anti-dependence.
   d = obj.base->read_task_head;
   
   for ( i = 0; i < obj.base->n_read_tasks; i++ )
   {
      task     = d->task;
      next_dep = d->next_dep;
      
      // If the last task to read is not the current task, add dependence.
      if ( task != t )
      {
         d->task     = t;
         d->next_dep = NULL;
         
         if ( task->n_dep_args == 0 )
         {
            task->dep_arg_head = d;
            task->dep_arg_tail = d;
         }
         else
         {
            task->dep_arg_tail->next_dep = d;
            task->dep_arg_tail           = d;
         }
         
         task->n_dep_args++;
         
         t->n_war_args++;
      }  
      else
      {
         FLA_free( d );
      }
      
      d = next_dep;
   }
   
   obj.base->n_read_tasks   = 0;
   obj.base->read_task_head = NULL;
   obj.base->read_task_tail = NULL;
   
   // Record this task as the last to write to this block.
   obj.base->write_task = t;
   
   return;
}
void FLASH_Queue_reset ( void  )

References FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_init().

{
   // Clear the other fields of the FLASH_Queue structure.
   _tq.n_tasks = 0;
   _tq.head    = NULL;
   _tq.tail    = NULL;

   // Reset the number of blocks.
   flash_queue_n_read_blocks  = 0;
   flash_queue_n_write_blocks = 0;

   return;
}

Referenced by FLASH_Obj_create_hierarchy().

{
   // Only adjust the block size if the new block is larger.
   if ( flash_queue_block_size < size )
      flash_queue_block_size = size;

   return;
}
{
   flash_queue_cache_line_size = size;

   return;
}
{
   flash_queue_cache_size = size;

   return;
}
void FLASH_Queue_set_caching ( FLA_Bool  caching)

Referenced by FLASH_Queue_exec().

{ 
   flash_queue_caching = caching; 

   return;
}
void FLASH_Queue_set_cores_per_cache ( int  cores)
{
   flash_queue_cores_per_cache = cores;

   return;
}
void FLASH_Queue_set_cores_per_queue ( int  cores)
{
   flash_queue_cores_per_queue = cores;

   return;
}

Referenced by FLASH_Queue_exec().

{ 
   flash_queue_data_affinity = data_affinity; 

   return;
}
void FLASH_Queue_set_num_threads ( unsigned int  n_threads)

References FLA_Check_num_threads().

{
   FLA_Error e_val;

   // Verify that the number of threads is positive. 
   e_val = FLA_Check_num_threads( n_threads );
   FLA_Check_error_code( e_val );

   // Keep track of the number of threads internally.
   flash_queue_n_threads = n_threads;

#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // No additional action is necessary to set the number of OpenMP threads
   // since setting the number of threads is handled at the parallel for loop
   // with a num_threads() clause. This gives the user more flexibility since
   // he can use the OMP_NUM_THREADS environment variable or the
   // omp_set_num_threads() function to set the global number of OpenMP threads
   // independently of the number of SuperMatrix threads.
   
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // No additional action is necessary to set the number of pthreads
   // since setting the number of threads is handled entirely on our end.

#endif

   return;
}
void FLASH_Queue_set_parallel_time ( double  dtime)

Referenced by FLASH_Queue_exec().

{
   flash_queue_parallel_time = dtime;

   return;
}
void FLASH_Queue_set_sorting ( FLA_Bool  sorting)
{ 
   flash_queue_sorting = sorting; 

   return;
}
{ 
   flash_queue_verbose = verbose;

   return;
}
void FLASH_Queue_set_work_stealing ( FLA_Bool  work_stealing)

Referenced by FLASH_Queue_exec().

{
   flash_queue_work_stealing = work_stealing;

   return;
}
unsigned int FLASH_Queue_stack_depth ( void  )
void FLASH_Queue_update_block_gpu ( FLA_Obj  obj,
void **  buffer_gpu,
int  thread,
void *  arg 
)

References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.

Referenced by FLASH_Queue_update_gpu().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int j, k;
   dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool transfer = FALSE;
   FLA_Bool evict = FALSE;
   FLA_Obj_gpu evict_obj;   
   FLA_Obj_gpu gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
   
   // Locate the position of the block on GPU.
   for ( k = 0; k < gpu_n_blocks - 1; k++ )
      if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
         break;

   // Save the pointer to the data on the GPU.
   buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;

   // Save the victim block.
   evict_obj = args->gpu[thread * gpu_n_blocks + k];

   // The block is not already in the GPU.
   if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
   {
      // Save for data transfer outside of critical section.
      transfer = TRUE;

      // Save for eviction outside of critical section.
      if ( evict_obj.obj.base != NULL && !evict_obj.clean )
      {
         evict = TRUE;
         args->victim[thread] = evict_obj;
      }      

      // Save the block in the data structure.
      args->gpu[thread * gpu_n_blocks + k].obj = obj;

      // Make sure the new block is clean.
      args->gpu[thread * gpu_n_blocks + k].clean   = TRUE;
      args->gpu[thread * gpu_n_blocks + k].request = FALSE;
   }       
   
   // Use the block on the GPU that is a hit or LRU.
   gpu_obj = args->gpu[thread * gpu_n_blocks + k];

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
   
   // Place the block on the cache as the most recently used.
   args->gpu[thread * gpu_n_blocks] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Evict and flush the LRU dirty block.
   if ( evict )
   {
      FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
      
      args->victim[thread].obj.base = NULL;
      
#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
   }

   // Move the block to the GPU.
   if ( transfer )
      FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );

   return;
}
void FLASH_Queue_update_cache ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, and FLASH_Task_s::output_arg.

Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().

{
   int      i, j;
   FLA_Bool duplicate;
   FLA_Obj  obj;

   if ( t == NULL )
      return;
   
   // Updating the input blocks.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < t->n_output_args && !duplicate; j++ )
      {
     if ( t->input_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
      }

      for ( j = 0; j < i && !duplicate; j++ )
      {
     if ( t->input_arg[i].base == t->input_arg[j].base )
        duplicate = TRUE;
      }

      // If the input block has not been processed before.
      if ( !duplicate )
      {
         obj = t->input_arg[i];
         
         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

            // Dependence analysis for each input block in macroblock.
            for ( jj = 0; jj < n; jj++ )
               for ( kk = 0; kk < m; kk++ )
                  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ), 
                                                  t->cache, FALSE, arg );
         }
         else // Regular block.
         {
            FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
         }
      }
   }

   // Updating the output blocks.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < i && !duplicate; j++ )
      {
     if ( t->output_arg[i].base == t->output_arg[j].base )
        duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
         obj = t->output_arg[i];

         // Macroblock is used.
         if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
         {
            dim_t    jj, kk;
            dim_t    m    = FLA_Obj_length( obj );
            dim_t    n    = FLA_Obj_width( obj );
            dim_t    cs   = FLA_Obj_col_stride( obj );
            FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

            // Dependence analysis for each input block in macroblock.
            for ( jj = 0; jj < n; jj++ )
               for ( kk = 0; kk < m; kk++ )
                  FLASH_Queue_update_cache_block( *( buf + jj * cs + kk ),
                                                  t->cache, TRUE, arg );
         }
         else // Regular block.
         {
            FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
         }
      }
   }   
  
   return;
}
void FLASH_Queue_update_cache_block ( FLA_Obj  obj,
int  cache,
FLA_Bool  output,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.

Referenced by FLASH_Queue_update_cache().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int i, j, k;
   int n_caches = args->n_caches;
   int size     = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***      
#endif

   // Locate the position of the block in the cache.
   for ( k = 0; k < size - 1; k++ )
   {
      if ( obj.base == args->cache[cache * size + k].base )
     break;
   }

   // Shift all the previous tasks for LRU replacement.
   for ( j = k; j > 0; j-- )
      args->cache[cache * size + j] = args->cache[cache * size + j - 1];

   // Place the block on the cache as the most recently used.
   args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING      
   FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

   // Write invalidate if updating with output block.
   if ( output )
   {
      for ( i = 0; i < n_caches; i++ )
      {
         if ( i != cache )
         {
#ifdef FLA_ENABLE_MULTITHREADING
        FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***      
#endif
        // Locate the position of the block in the cache.
        for ( k = 0; k < size; k++ )
        {
           if ( obj.base == args->cache[i * size + k].base )
          break;
        }

        // The block is owned by other thread.
        if ( k < size )
        {
           // Shift all the blocks for the invalidated block.
           for ( j = k; j < size - 1; j++ )
          args->cache[i * size + j] = args->cache[i * size + j + 1];

           // Invalidate the block.
           args->cache[i * size + size - 1].base = NULL;
        }
#ifdef FLA_ENABLE_MULTITHREADING      
        FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
         }
      }
   }

   return;
}
void FLASH_Queue_update_gpu ( FLASH_Task t,
void **  input_arg,
void **  output_arg,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_exec_gpu().

{
   int i, j, k;
   int thread    = t->thread;
   int n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool duplicate;

   // None of the arguments can be macroblocks yet.
   // Complicating factor is copying macroblock to contiguous memory on GPU.
   
   // Bring the input arguments to the GPU.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < t->n_output_args && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            duplicate = TRUE;
      }

      // If the input block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->input_arg[i], input_arg + i, thread, arg );
      }
      else
      {
         input_arg[i] = NULL;
      }
   }

   // Bring the output arguments to the GPU.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      // Check for duplicate blocks.
      duplicate = FALSE;

      for ( j = 0; j < i && !duplicate; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            duplicate = TRUE;
      }

      // If the output block has not been processed before.
      if ( !duplicate )
      {
         FLASH_Queue_update_block_gpu( t->output_arg[i], output_arg + i, thread, arg );                                       
         
         // Invalidate output blocks on all other GPUs.
         for ( k = 0; k < n_threads; k++ )
            if ( k != thread )
               FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
      }
      else
      {
         output_arg[i] = NULL;
      }
   }

   // Check to see if there are any duplicates.
   for ( i = t->n_input_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->output_arg[j].base )
            input_arg[i] = output_arg[j];
      }

      for ( j = 0; j < i && input_arg[i] == NULL; j++ )
      {
         if ( t->input_arg[i].base == t->input_arg[j].base )
            input_arg[i] = input_arg[j];
      }
   }

   // Check to see if there are any duplicates.
   for ( i = t->n_output_args - 1; i >= 0; i-- )
   {
      for ( j = 0; j < i && output_arg[i] == NULL; j++ )
      {
         if ( t->output_arg[i].base == t->output_arg[j].base )
            output_arg[i] = output_arg[j];
      }
   }

   return;
}
void FLASH_Queue_verbose_output ( void  )

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec().

{
   int           i, j, k;
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // Grab the head of the task queue.
   t = FLASH_Queue_get_head_task();
   
   if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
   {
      // Iterate over linked list of tasks.
      for ( i = 0; i < n_tasks; i++ )
      {
         printf( "%d\t%s\t", t->order, t->name );
         
         for ( j = 0; j < t->n_output_args; j++ )
            printf( "%lu[%u,%u] ", t->output_arg[j].base->id,
                    t->output_arg[j].base->m_index, 
                    t->output_arg[j].base->n_index );
         
         printf( ":= " );
         
         for ( j = 0; j < t->n_output_args; j++ )
            printf( "%lu[%u,%u] ", t->output_arg[j].base->id,
                    t->output_arg[j].base->m_index, 
                    t->output_arg[j].base->n_index );
         
         for ( j = 0; j < t->n_input_args; j++ )
            printf( "%lu[%u,%u] ", t->input_arg[j].base->id,
                    t->input_arg[j].base->m_index, 
                    t->input_arg[j].base->n_index );

         printf( "\n" );
         
         // Go to the next task.
         t = t->next_task;
      }

      printf( "\n" );
   }
   else
   {
      printf( "digraph SuperMatrix {\n" );

      if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
      {
         // Iterate over linked list of tasks.
         for ( i = 0; i < n_tasks; i++ )
         {
            printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);
            
            d = t->dep_arg_head;
            for ( j = 0; j < t->n_dep_args; j++ )
            {
               printf( "%d;", d->task->order );
               d = d->next_dep;
            }

            printf( "};\n" );
            
            // Go to the next task.
            t = t->next_task;
         }
      }
      else
      {
         // Iterate over all the threads.
         for ( k = 0; k < n_threads; k++ )
         {
            printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );

            // Iterate over linked list of tasks.
            for ( i = 0; i < n_tasks; i++ )
            {  
               if ( t->queue == k )
                  printf( "%d [label=\"%s\"];\n", t->order, t->name );
               
               // Go to the next task.
               t = t->next_task;               
            }

            printf( "}\n" );

            // Grab the head of the task queue.
            t = FLASH_Queue_get_head_task();               
         }

         // Iterate over linked list of tasks.
         for ( i = 0; i < n_tasks; i++ )
         {
            printf( "%d -> {", t->order );
            
            d = t->dep_arg_head;
            for ( j = 0; j < t->n_dep_args; j++ )
            {
               printf( "%d;", d->task->order );
               d = d->next_dep;
            }

            printf( "};\n" );
            
            // Go to the next task.
            t = t->next_task;
         }
      }

      printf( "}\n\n" );
   }
   
   return;
}
FLASH_Task * FLASH_Queue_wait_dequeue ( int  queue,
int  cache,
void *  arg 
)
FLASH_Task* FLASH_Queue_wait_dequeue_block ( int  queue,
int  cache,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_wait_dequeue().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         i, j, k;
   int         size    = args->size;
   int         n_tasks = args->wait_queue[queue].n_tasks;
   FLA_Bool    enabled = FALSE;
   FLASH_Task* t;
   FLA_Obj     obj;
   FLA_Obj     mem;

#ifdef FLA_ENABLE_GPU
   enabled = FLASH_Queue_get_enabled_gpu();

   // If using GPUs, then only check GPU and not the cache.
   if ( enabled )
      size = FLASH_Queue_get_gpu_num_blocks();
#endif

   t = args->wait_queue[queue].head;
   
   // Check if any of the output blocks are in the cache.
   for ( i = 0; i < n_tasks; i++ )
   {
      for ( j = 0; j < size; j++ )
      {
         // Initialize the memory just in case.
         mem.base = NULL;
         
         // Determine if using GPU or not.
         if ( enabled )
         {
#ifdef FLA_ENABLE_GPU
            mem = args->gpu[cache * size + j].obj;
#endif
         }
         else
         {
            mem = args->cache[cache * size + j];
         }

     for ( k = 0; k < t->n_output_args; k++ )
     {  
            obj = t->output_arg[k];

            if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
               obj = *FLASH_OBJ_PTR_AT( obj );

            // Return the task if its output block is in cache.
            if ( mem.base == obj.base )
            {
               t->hit = TRUE;
               return t;
            }
     }
      }
      t = t->next_wait;
   }
   
   return args->wait_queue[queue].head;
}
void FLASH_Queue_wait_enqueue ( FLASH_Task t,
void *  arg 
)
FLASH_Task* FLASH_Queue_work_stealing ( int  queue,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         q;
   int         n_queues = args->n_queues;
   FLASH_Task* t = NULL;

   // Do not perform work stealing if there is only one queue.
   if ( n_queues == 1 )
      return t;

   // Find a random queue not equal to the current queue.
   do
   {
#ifdef FLA_ENABLE_WINDOWS_BUILD
      rand_s( &q );
      q = q % n_queues;
#else
#ifdef FLA_ENABLE_TIDSP
      q = rand() % n_queues;
#else
      q = lrand48() % n_queues;
#endif
#endif
   }
   while ( q == queue );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
#endif

   // If there are tasks that this thread can steal.
   if ( args->wait_queue[q].n_tasks > 0 )
   {
      // Dequeue the last task.
      t = args->wait_queue[q].tail;

      if ( args->wait_queue[q].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[q].head = NULL;
         args->wait_queue[q].tail = NULL;
      }
      else
      {
         // Adjust pointers in waiting queue.
         args->wait_queue[q].tail = t->prev_wait;
         args->wait_queue[q].tail->next_wait = NULL;
      }

      // Reset waiting queue data about the stolen task.
      t->queue = queue;
      t->prev_wait = NULL;
      t->next_wait = NULL;
      
      args->wait_queue[q].n_tasks--;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->run_lock[q]) ); // R ***
#endif

   return t;
}
FLASH_Task* FLASH_Task_alloc ( void *  func,
void *  cntl,
char *  name,
FLA_Bool  enabled_gpu,
int  n_int_args,
int  n_fla_args,
int  n_input_args,
int  n_output_args 
)

References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.

Referenced by FLASH_Queue_push().

{
   FLASH_Task* t;

   // Allocate space for the task structure t.
   t             = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );

   // Allocate space for the task's integer arguments.
   t->int_arg    = (int *) FLA_malloc( n_int_args * sizeof(int) );

   // Allocate space for the task's FLA_Obj arguments.
   t->fla_arg    = (FLA_Obj *) FLA_malloc( n_fla_args * sizeof(FLA_Obj) );

   // Allocate space for the task's input FLA_Obj arguments.
   t->input_arg  = (FLA_Obj *) FLA_malloc( n_input_args * sizeof(FLA_Obj) );

   // Allocate space for the task's output FLA_Obj arguments.
   t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );
   
   // Initialize other fields of the structure.
   t->n_ready       = 0;
   t->order         = 0;
   t->queue         = 0;
   t->height        = 0;
   t->thread        = 0;
   t->cache         = 0;
   t->hit           = FALSE;

   t->func          = func;
   t->cntl          = cntl;
   t->name          = name;
   t->enabled_gpu   = enabled_gpu;
   t->n_int_args    = n_int_args;
   t->n_fla_args    = n_fla_args;
   t->n_input_args  = n_input_args;
   t->n_output_args = n_output_args;

   t->n_macro_args  = 0;
   t->n_war_args    = 0;
   t->n_dep_args    = 0;
   t->dep_arg_head  = NULL;
   t->dep_arg_tail  = NULL;
   t->prev_task     = NULL;
   t->next_task     = NULL;
   t->prev_wait     = NULL;
   t->next_wait     = NULL;
   
   // Return a pointer to the initialized structure.
   return t;
}
void FLASH_Task_free ( FLASH_Task t)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().

{
   int        i, j, k;
   FLA_Obj    obj;
   FLASH_Dep* d;
   FLASH_Dep* next_dep;

   // Clearing the last write task in each output block.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];
      
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
         
         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               ( buf + jj * cs + kk )->base->write_task = NULL;
      }
      else // Clear regular block.
      {
         obj.base->write_task = NULL;
      }
   }
   
   // Cleaning the last read tasks in each input block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
         {
            for ( kk = 0; kk < m; kk++ )
            {
               obj = *( buf + jj * cs + kk );
               
               k = obj.base->n_read_tasks;
               d = obj.base->read_task_head;

               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;

               for ( j = 0; j < k; j++ )
               {
                  next_dep = d->next_dep;
                  FLA_free( d );
                  d = next_dep;
               }
            }
         }
      }      
      else // Regular block.
      {     
         k = obj.base->n_read_tasks;
         d = obj.base->read_task_head;
         
         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;
         
         for ( j = 0; j < k; j++ )
         {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
         }
      }
   }

   // Free the dep_arg field of t.
   d = t->dep_arg_head;

   for ( i = 0; i < t->n_dep_args; i++ )
   {
      next_dep = d->next_dep;
      FLA_free( d );
      d = next_dep;
   }   

   // Free the int_arg field of t.
   FLA_free( t->int_arg );
   
   // Free the fla_arg field of t.
   FLA_free( t->fla_arg );

   // Free the input_arg field of t.
   FLA_free( t->input_arg );

   // Free the output_arg field of t.
   FLA_free( t->output_arg );

   // Finally, free the struct itself.
   FLA_free( t );

   return;
}
void FLASH_Task_free_parallel ( FLASH_Task t,
void *  arg 
)

References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;   
   int        i, j, k;
   int        thread;
   int        n_threads = FLASH_Queue_get_num_threads();
   FLASH_Dep* d;
   FLASH_Dep* next_dep;
   FLA_Obj    obj;

   // Clearing the last write task in each output block.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];
      
      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );
         
         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
            for ( kk = 0; kk < m; kk++ )
               ( buf + jj * cs + kk )->base->write_task = NULL;
      }
      else // Clear regular block.
      {
         obj.base->write_task = NULL;
      }
   }
   
   // Cleaning the last read tasks in each input block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      // Macroblock is used.
      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         dim_t    jj, kk;
         dim_t    m    = FLA_Obj_length( obj );
         dim_t    n    = FLA_Obj_width( obj );
         dim_t    cs   = FLA_Obj_col_stride( obj );
         FLA_Obj* buf  = FLASH_OBJ_PTR_AT( obj );

         // Clear each block in macroblock.
         for ( jj = 0; jj < n; jj++ )
         {
            for ( kk = 0; kk < m; kk++ )
            {
               obj = *( buf + jj * cs + kk );

               thread = obj.base->n_read_blocks % n_threads;

               FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

               k = obj.base->n_read_tasks;
               d = obj.base->read_task_head;

               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;

               FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

               for ( j = 0; j < k; j++ )
               {
                  next_dep = d->next_dep;
                  FLA_free( d );
                  d = next_dep;
               }
            }
         }
      }
      else // Regular block.
      {
         thread = obj.base->n_read_blocks % n_threads;

         FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

         k = obj.base->n_read_tasks;
         d = obj.base->read_task_head;

         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;

         FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

         for ( j = 0; j < k; j++ )
         {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
         }
      }
   }
   
   // Free the dep_arg field of t.
   d = t->dep_arg_head;

   for ( i = 0; i < t->n_dep_args; i++ )
   {
      next_dep = d->next_dep;
      FLA_free( d );
      d = next_dep;
   }

   // Free the int_arg field of t.
   FLA_free( t->int_arg );
   
   // Free the fla_arg field of t.
   FLA_free( t->fla_arg );

   // Free the input_arg field of t.
   FLA_free( t->input_arg );

   // Free the output_arg field of t.
   FLA_free( t->output_arg );

   // Finally, free the struct itself.
   FLA_free( t );

   return;
}
FLASH_Task* FLASH_Task_update_binding ( FLASH_Task t,
FLASH_Task r,
void *  arg 
)

References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.

Referenced by FLASH_Task_update_dependencies().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int queue;

   if ( r == NULL )
   {
      // There are no tasks on waiting queue, so bind the first task.
      r = t;
      r->hit = TRUE;
   }
   else
   {
      // Swap the binded task for the new ready task.
      if ( !r->hit || ( FLASH_Queue_get_sorting() && r->height < t->height ) )
      {
         queue = r->queue;
         r->hit = FALSE;
         
         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***

         // Place swapped task back onto waiting queue.
         FLASH_Queue_wait_enqueue( r, arg );
         
         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
         
         // Bind the new ready task.
         r = t;
         r->hit = TRUE;
      }
      else // Keep the binded task and enqueue new ready task.
      {
         queue = t->queue;
         
         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
         
         FLASH_Queue_wait_enqueue( t, arg );
         
         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
      }
   }

   return r;
}

References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.

Referenced by FLASH_Queue_exec_parallel_function().

{
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         i;
   int         n_threads = FLASH_Queue_get_num_threads();
   int         thread;
   FLA_Bool    available;
   FLASH_Task* task;
   FLASH_Task* r = NULL;
   FLASH_Dep*  d = t->dep_arg_head;
   
   // Check each dependent task.
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      task = d->task;

      // Use the remaining locks except for the first one.
      thread = ( n_threads > 1 ? task->order % ( n_threads - 1 ) + 1 : 0 );

      RCCE_acquire_lock( thread );
      
      args->n_ready[task->order]--;
      available = ( args->n_ready[task->order] == 0 );
      
      RCCE_release_lock( thread );

      // Place newly ready tasks on waiting queue.      
      if ( available )
      {
         RCCE_acquire_lock( 0 );
         
         FLASH_Queue_wait_enqueue( task, arg );
         
         RCCE_release_lock( 0 );
      }
      
      // Go to the next dep.
      d = d->next_dep;
   }

   return r;
}