libflame
revision_anchor
|
Go to the source code of this file.
void FLASH_Queue_begin | ( | void | ) |
References FLA_Clock().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
   // Opens one (possibly nested) parallel region by bumping the nesting
   // counter flash_queue_stack.

#ifdef FLA_ENABLE_SUPERMATRIX
   // Only the outermost region (counter still zero) records the clock value
   // from which the total execution time is later derived.
   if ( flash_queue_stack == 0 )
      flash_queue_total_time = FLA_Clock();
#endif

   // Record that one more region is now open.
   flash_queue_stack += 1;

   return;
}
FLA_Bool FLASH_Queue_check_block_gpu | ( | FLA_Obj | obj, |
int | thread, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_check_gpu().
{
   // Reports whether block obj can be used without waiting on the GPU that
   // belongs to the given thread. Returns FALSE when that GPU holds the
   // block in a dirty state (a flush is then requested) or when the block is
   // that GPU's current victim; returns TRUE otherwise.
   FLASH_Queue_vars* vars     = ( FLASH_Queue_vars* ) arg;
   dim_t             n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool          usable   = TRUE;
   int               slot     = 0;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

   // Scan this GPU's slots until one holding the block is found, or until
   // the slots are exhausted.
   while ( slot < n_blocks &&
           obj.base != vars->gpu[thread * n_blocks + slot].obj.base )
      slot++;

   // The block is resident but dirty: ask for it and report "not ready".
   if ( slot < n_blocks && !vars->gpu[thread * n_blocks + slot].clean )
   {
      vars->gpu[thread * n_blocks + slot].request = TRUE;
      usable = FALSE;
   }

   // The block is this GPU's victim block: also report "not ready".
   if ( vars->victim[thread].obj.base == obj.base )
      usable = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

   return usable;
}
FLA_Bool FLASH_Queue_check_gpu | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
   // Checks every distinct operand block of task t against all GPUs other
   // than the one owned by t->thread, using FLASH_Queue_check_block_gpu().
   // Returns TRUE only if every such check succeeded.
   int      owner     = t->thread;
   int      n_in      = t->n_input_args;
   int      n_out     = t->n_output_args;
   int      n_threads = FLASH_Queue_get_num_threads();
   int      a, p, g;
   FLA_Bool all_ready = TRUE;
   FLA_Bool blk_ready;
   FLA_Bool seen;
   FLA_Obj  obj;

   // Inputs are visited first, then outputs.
   for ( a = 0; a < n_in + n_out; a++ )
   {
      seen = FALSE;

      if ( a >= n_in )
      {
         // Output operand: a duplicate if an earlier output shares its base.
         obj = t->output_arg[a - n_in];

         for ( p = 0; p < a - n_in && !seen; p++ )
            if ( t->output_arg[p].base == obj.base )
               seen = TRUE;
      }
      else
      {
         // Input operand: a duplicate if any output, or an earlier input,
         // shares its base.
         obj = t->input_arg[a];

         for ( p = 0; p < n_out && !seen; p++ )
            if ( t->output_arg[p].base == obj.base )
               seen = TRUE;

         for ( p = 0; p < a && !seen; p++ )
            if ( t->input_arg[p].base == obj.base )
               seen = TRUE;
      }

      // Duplicates are not checked a second time.
      if ( !seen )
      {
         if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
         {
            // Plain block: stop at the first GPU that reports not ready.
            blk_ready = TRUE;

            for ( g = 0; g < n_threads && blk_ready; g++ )
               if ( g != owner )
                  blk_ready = blk_ready && FLASH_Queue_check_block_gpu( obj, g, arg );

            all_ready = all_ready && blk_ready;
         }
         else
         {
            // Macroblock: walk its elements column by column and check each
            // contained block individually.
            dim_t    col, row;
            dim_t    n_rows = FLA_Obj_length( obj );
            dim_t    n_cols = FLA_Obj_width( obj );
            dim_t    stride = FLA_Obj_col_stride( obj );
            FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

            for ( col = 0; col < n_cols; col++ )
            {
               for ( row = 0; row < n_rows; row++ )
               {
                  FLA_Obj blk = *( blocks + col * stride + row );

                  blk_ready = TRUE;

                  for ( g = 0; g < n_threads && blk_ready; g++ )
                     if ( g != owner )
                        blk_ready = blk_ready && FLASH_Queue_check_block_gpu( blk, g, arg );

                  all_ready = all_ready && blk_ready;
               }
            }
         }
      }
   }

   return all_ready;
}
void FLASH_Queue_create_gpu | ( | int | thread, |
void * | arg | ||
) |
References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), and FLASH_Queue_variables::gpu.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Binds the calling thread to its GPU and allocates one GPU buffer for
   // each of that thread's block slots. Does nothing beyond reading its
   // parameters when GPU execution is disabled.
   FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
   dim_t             n_blocks  = FLASH_Queue_get_gpu_num_blocks();
   dim_t             blk_size  = vars->block_size;
   FLA_Datatype      elem_type = vars->datatype;
   int               slot;

   if ( FLASH_Queue_get_enabled_gpu() )
   {
      // Associate this thread with its GPU before allocating on it.
      FLASH_Queue_bind_gpu( thread );

      // Allocate every slot's buffer up front.
      for ( slot = 0; slot < n_blocks; slot++ )
         FLASH_Queue_alloc_gpu( blk_size,
                                elem_type,
                                &(vars->gpu[thread * n_blocks + slot].buffer_gpu) );
   }

   return;
}
void FLASH_Queue_destroy_gpu | ( | int | thread, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and FLA_Obj_gpu_struct::obj.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Releases every GPU buffer slot that belongs to the given thread. A slot
   // that still holds a dirty block is read back first. Does nothing when
   // GPU execution is disabled.
   FLASH_Queue_vars* vars     = ( FLASH_Queue_vars* ) arg;
   dim_t             n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu       entry;
   int               slot;

   if ( FLASH_Queue_get_enabled_gpu() )
   {
      for ( slot = 0; slot < n_blocks; slot++ )
      {
         // Work on a snapshot of the slot's bookkeeping record.
         entry = vars->gpu[thread * n_blocks + slot];

         // An occupied slot whose contents are not clean is flushed.
         if ( !entry.clean && entry.obj.base != NULL )
            FLASH_Queue_read_gpu( entry.obj, entry.buffer_gpu );

         // The slot's GPU buffer is released regardless of its state.
         FLASH_Queue_free_gpu( entry.buffer_gpu );
      }
   }

   return;
}
FLA_Error FLASH_Queue_disable | ( | void | ) |
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
   // Turns task enqueuing off. Returns FLA_SUCCESS when the flag was
   // cleared, or FLA_FAILURE when SuperMatrix is configured and a parallel
   // region is currently open.

#ifdef FLA_ENABLE_SUPERMATRIX
   // The status cannot change during a parallel region.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;
#endif

   // Reached outside any parallel region, and also whenever SuperMatrix is
   // not configured (disabling is always allowed in that case).
   flash_queue_enabled = FALSE;

   return FLA_SUCCESS;
}
FLA_Error FLASH_Queue_enable | ( | void | ) |
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
   // Turns task enqueuing on. Returns FLA_SUCCESS when the flag was set, or
   // FLA_FAILURE when a parallel region is open or SuperMatrix is not
   // configured.

#ifdef FLA_ENABLE_SUPERMATRIX
   // The status cannot change during a parallel region.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;

   flash_queue_enabled = TRUE;

   return FLA_SUCCESS;
#else
   // Without SuperMatrix there is nothing to enable: report the condition
   // through the library's error check, then fail.
   FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );

   return FLA_FAILURE;
#endif
}
void FLASH_Queue_end | ( | void | ) |
References FLA_Clock(), and FLASH_Queue_exec().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
   // Closes one parallel region. When the outermost region closes (and
   // SuperMatrix is configured) the queued tasks are executed and the total
   // execution time is finalized.

   // Leave the current nesting level.
   flash_queue_stack -= 1;

#ifdef FLA_ENABLE_SUPERMATRIX
   // Inner regions only adjust the counter.
   if ( flash_queue_stack != 0 )
      return;

   // Outermost region: run everything that was enqueued.
   FLASH_Queue_exec();

   // Turn the stored start value into an elapsed time.
   flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
#endif

   return;
}
void FLASH_Queue_exec | ( | void | ) |
References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.
Referenced by FLASH_Queue_end().
{
   // Runs all enqueued tasks: sets up the shared bookkeeping arrays,
   // initializes the tasks, times the call to the worker function, then
   // frees the tasks and the bookkeeping and resets the queue.
   FLASH_Queue_vars vars;     // State handed to the worker function.
   double           t_start;
   int              idx;
   int              n_tasks = FLASH_Queue_get_num_tasks();

   // An empty queue needs no work.
   if ( n_tasks == 0 )
      return;

   // Multiple-queue features and cache affinity are switched off here.
   FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
   FLASH_Queue_set_work_stealing( FALSE );
   FLASH_Queue_set_caching( FALSE );

   // Per-task arrays plus two single-int cells (waiting count and program
   // counter). Only task_queue comes from FLA_malloc(); the rest use
   // FLA_shmalloc().
   vars.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
   vars.n_ready    = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   vars.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   vars.n_wait     = ( int* ) FLA_shmalloc( sizeof( int ) );
   vars.pc         = ( int* ) FLA_shmalloc( sizeof( int ) );

   // Only the owner zeroes the two counters; everyone then synchronizes.
   if ( FLA_is_owner() )
   {
      *vars.n_wait = 0;
      *vars.pc     = 0;
   }

   Synch_all();

   // Fill in the per-task information.
   FLASH_Queue_init_tasks( ( void* ) &vars );

   // Verbose output has to happen while the tasks still exist.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Time the parallel execution and store the elapsed value.
   t_start = RCCE_wtime();

   FLASH_Queue_exec_parallel_function( ( void* ) &vars );

   FLASH_Queue_set_parallel_time( RCCE_wtime() - t_start );

   // Release the tasks one after another, in queue order.
   idx = 0;
   while ( idx < n_tasks )
   {
      FLASH_Task_free( vars.task_queue[idx] );
      idx++;
   }

   // Release the bookkeeping storage with the matching deallocators.
   FLA_free( vars.task_queue );
   FLA_shfree( vars.n_ready );
   FLA_shfree( vars.wait_queue );
   FLA_shfree( vars.n_wait );
   FLA_shfree( vars.pc );

   // Leave the queue ready for the next call.
   FLASH_Queue_reset();

   return;
}
FLA_Bool FLASH_Queue_exec_gpu | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Executes task t, on the GPU when possible. Returns TRUE when the task
   // was executed (or t was NULL), and FALSE when the task was put back on
   // the waiting queue because FLASH_Queue_check_gpu() reported that its
   // operands are not ready.
   void** gpu_inputs;
   void** gpu_outputs;

   // No task: nothing to do.
   if ( t == NULL )
      return TRUE;

   // GPU execution is disabled: run the task on the CPU.
   if ( !FLASH_Queue_get_enabled_gpu() )
   {
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // Some operand is not ready yet: re-enqueue the task and report that.
   if ( !FLASH_Queue_check_gpu( t, arg ) )
   {
      FLASH_Queue_vars* vars = ( FLASH_Queue_vars* ) arg;
      int               q    = t->queue;

      t->hit = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(vars->run_lock[q]) ); // R ***
#endif

      FLASH_Queue_wait_enqueue( t, arg );

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(vars->run_lock[q]) ); // R ***
#endif

      return FALSE;
   }

   // GPU is enabled but this task is not marked for GPU execution: flush
   // and invalidate its blocks as needed, then run it on the CPU.
   if ( !t->enabled_gpu )
   {
      int      owner     = t->thread;
      int      n_in      = t->n_input_args;
      int      n_out     = t->n_output_args;
      int      n_threads = FLASH_Queue_get_num_threads();
      int      gpu, a, p;
      FLA_Bool seen;
      FLA_Obj  obj;

      // Visit every GPU, and for each one every operand (inputs first).
      for ( gpu = 0; gpu < n_threads; gpu++ )
      {
         for ( a = 0; a < n_in + n_out; a++ )
         {
            seen = FALSE;

            if ( a >= n_in )
            {
               // Output operand: duplicate of an earlier output?
               obj = t->output_arg[a - n_in];

               for ( p = 0; p < a - n_in && !seen; p++ )
                  if ( t->output_arg[p].base == obj.base )
                     seen = TRUE;
            }
            else
            {
               // Input operand: duplicate of any output or earlier input?
               obj = t->input_arg[a];

               for ( p = 0; p < n_out && !seen; p++ )
                  if ( t->output_arg[p].base == obj.base )
                     seen = TRUE;

               for ( p = 0; p < a && !seen; p++ )
                  if ( t->input_arg[p].base == obj.base )
                     seen = TRUE;
            }

            // Only blocks not handled before are processed.
            if ( !seen )
            {
               if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
               {
                  // Plain block: flush on the task's own GPU only.
                  if ( gpu == owner )
                     FLASH_Queue_flush_block_gpu( obj, gpu, arg );

                  // Output blocks are invalidated on every GPU.
                  if ( a >= n_in )
                     FLASH_Queue_invalidate_block_gpu( obj, gpu, arg );
               }
               else
               {
                  // Macroblock: apply the same two steps to each contained
                  // block, walking column by column.
                  dim_t    col, row;
                  dim_t    n_rows = FLA_Obj_length( obj );
                  dim_t    n_cols = FLA_Obj_width( obj );
                  dim_t    stride = FLA_Obj_col_stride( obj );
                  FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

                  for ( col = 0; col < n_cols; col++ )
                  {
                     for ( row = 0; row < n_rows; row++ )
                     {
                        FLA_Obj blk = *( blocks + col * stride + row );

                        if ( gpu == owner )
                           FLASH_Queue_flush_block_gpu( blk, gpu, arg );

                        if ( a >= n_in )
                           FLASH_Queue_invalidate_block_gpu( blk, gpu, arg );
                     }
                  }
               }
            }
         }
      }

      // Run on the CPU instead of the GPU.
      FLASH_Queue_exec_task( t );

      return TRUE;
   }

   // GPU path: one pointer slot per input and per output operand.
   gpu_inputs  = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
   gpu_outputs = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );

   // Bring the blocks to the GPU and collect their device pointers.
   FLASH_Queue_update_gpu( t, gpu_inputs, gpu_outputs, arg );

   // Execute on the GPU.
   FLASH_Queue_exec_task_gpu( t, gpu_inputs, gpu_outputs );

   // Output blocks are now dirty on the GPU.
   FLASH_Queue_mark_gpu( t, arg );

   // Release the temporary pointer arrays.
   FLA_free( gpu_inputs );
   FLA_free( gpu_outputs );

   return TRUE;
}
void FLASH_Queue_exec_parallel | ( | void * | arg | ) |
References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_get_num_threads().
Referenced by FLASH_Queue_exec().
{
   // Launches n_threads workers, each running
   // FLASH_Queue_exec_parallel_function() on its own FLASH_Thread record,
   // and returns once all of them have finished. The launch mechanism is
   // selected at compile time by FLA_MULTITHREADING_MODEL (OpenMP or POSIX
   // threads); if it matches neither value, no worker is started and the
   // records are simply allocated and freed.
   //
   // arg is stored unchanged in every record's args field.
   int   i;
   int   n_threads = FLASH_Queue_get_num_threads();
   void* (*thread_entry_point)( void* );

   // Allocate the thread structures array. Here, an array of FLASH_Thread
   // structures of length n_threads is allocated and the fields of each
   // structure set to appropriate values.
   FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );

   // Initialize the thread structures array.
   for ( i = 0; i < n_threads; i++ )
   {
      // Save the thread's identifier.
      thread[i].id = i;

      // Save the pointer to the necessary variables with the thread.
      thread[i].args = arg;

      // The pthread object, if it was even compiled into the FLASH_Thread
      // structure, will be initialized by the pthread implementation when we
      // call pthread_create() and does not need to be touched at this time.
   }

   // Determine which function to send threads to.
   thread_entry_point = FLASH_Queue_exec_parallel_function;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

   // An OpenMP parallel for region spawns n_threads threads. Each thread
   // executes the work function with a different FLASH_Thread argument.
   // An implicit synchronization point exists at the end of the curly
   // brace scope.
   #pragma omp parallel for \
           private( i ) \
           shared( thread, n_threads, thread_entry_point ) \
           schedule( static, 1 ) \
           num_threads( n_threads )
   for ( i = 0; i < n_threads; ++i )
   {
      thread_entry_point( ( void* ) &thread[i] );
   }

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

   // Create each POSIX thread needed in addition to the main thread.
   // Index 0 is skipped here because the main thread takes that role below.
   for ( i = 1; i < n_threads; i++ )
   {
      int pthread_e_val;

      // Create thread i with default attributes.
      pthread_e_val = pthread_create( &(thread[i].pthread_obj),
                                      NULL,
                                      thread_entry_point,
                                      ( void* ) &thread[i] );

      // The creation result is only examined when internal error checking
      // is compiled in.
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

   // The main thread is assigned the role of thread 0. Here we manually
   // execute it as a worker thread.
   thread_entry_point( ( void* ) &thread[0] );

   // Wait for non-main threads to finish.
   for ( i = 1; i < n_threads; i++ )
   {
      // These two variables are declared local to this for loop since this
      // is the only place they are needed, and since they would show up as
      // unused variables if FLA_MULTITHREADING_MODEL == FLA_PTHREADS.
      // NOTE(review): this loop is itself inside the FLA_PTHREADS branch, so
      // the remark above presumably means the OpenMP configuration -- confirm.
      // Strangely, the Intel compiler produces code that results in an
      // "unaligned access" runtime message if thread_status is declared as
      // an int. Declaring it as a long or void* appears to force the
      // compiler (not surprisingly) into aligning it to an 8-byte boundary.
      int   pthread_e_val;
      void* thread_status;

      // Wait for thread i to invoke its respective pthread_exit().
      // The return value passed to pthread_exit() is provided to us
      // via status, if one was given.
      pthread_e_val = pthread_join( thread[i].pthread_obj,
                                    ( void** ) &thread_status );

      // As above, the join result is only examined when internal error
      // checking is compiled in.
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
      FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
      FLA_Check_error_code( e_val );
#endif
   }

#endif

   // All workers are done; release the thread structures array.
   FLA_free( thread );

   return;
}
void * FLASH_Queue_exec_parallel_function | ( | void * | arg | ) |
References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().
{
   // Worker loop: repeatedly dequeues a task under lock 0, executes it and
   // updates its dependents, until the shared program counter args->pc[0]
   // has reached the number of tasks. Callers whose RCCE_ue() value is not
   // below the thread count skip the loop entirely. Always returns NULL.
   FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
   int               core      = RCCE_ue();
   int               queue     = 0;
   int               cache     = 0;
   int               n_tasks   = FLASH_Queue_get_num_tasks();
   int               n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool          running;
   FLASH_Task*       task      = NULL;

   // Extraneous cores never enter the loop.
   running = ( core < n_threads ? TRUE : FALSE );

   while ( running )
   {
      // Fetch the next task while holding lock 0.
      RCCE_acquire_lock( 0 );
      task = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) vars );
      RCCE_release_lock( 0 );

      // A NULL result means no task was dequeued on this pass.
      if ( task != NULL )
      {
         // Record who executes the task.
         task->thread = core;
         task->cache  = cache;

         FLASH_Queue_exec_task( task );

         // Let the tasks that depend on this one know it is done.
         FLASH_Task_update_dependencies( task, ( void* ) vars );
      }

      // Read the termination condition while holding lock 0.
      RCCE_acquire_lock( 0 );
      if ( vars->pc[0] >= n_tasks )
         running = FALSE;
      RCCE_release_lock( 0 );
   }

   return ( void* ) NULL;
}
void FLASH_Queue_exec_simulation | ( | void * | arg | ) |
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, FLASH_Dep_s::task, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec().
{
   // Executes the queued tasks on a single caller in "stages". Each stage
   // (1) retires the tasks of the previous stage, enqueueing any dependent
   // task whose n_ready count drops to zero, (2) dequeues at most one task
   // per simulated thread, and (3) executes the dequeued tasks one after
   // another. The loop ends once args->pc has reached the number of tasks.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int           i, j;
   int           queue;
   int           cache;
   int           n_stages  = 0;
   int           n_queues  = args->n_queues;
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_cores   = FLASH_Queue_get_cores_per_cache();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   task;
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // An array holding the task assigned to each simulated thread during the
   // current stage. Heap-allocated on Windows builds, otherwise a
   // variable-length array.
#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
   FLASH_Task* exec_array[n_threads];
#endif

   for ( i = 0; i < n_threads; i++ )
   {
      // Initialize all exec_array to NULL.
      exec_array[i] = NULL;

      // Prefetch blocks into the cache before execution. Only the first
      // thread of each group of n_cores threads issues the prefetch.
      // NOTE(review): assumes n_cores is nonzero -- confirm.
      if ( i % n_cores == 0 )
         FLASH_Queue_prefetch( i, arg );
   }

   // Loop until all the tasks have committed.
   while ( args->pc < n_tasks )
   {
      // Phase 1: retire the tasks executed in the previous stage.
      for ( i = 0; i < n_threads; i++ )
      {
         // Update waiting queue with ready tasks.
         t = exec_array[i];

         if ( t != NULL )
         {
            // Check each dependent task.
            d = t->dep_arg_head;

            for ( j = 0; j < t->n_dep_args; j++ )
            {
               task = d->task;
               task->n_ready--;

               // Place newly ready tasks on waiting queue.
               if ( task->n_ready == 0 )
               {
                  FLASH_Queue_wait_enqueue( task, arg );
               }

               // Go to the next dep.
               d = d->next_dep;
            }

            // Free the task.
            FLASH_Task_free( t );
         }
      }

      n_stages++;

      // NOTE(review): the per-stage table is printed when verbose is zero,
      // i.e. when !verbose holds; whether that is the intended polarity
      // cannot be told from here -- confirm against FLASH_Verbose.
      if ( !verbose )
         printf( "%7d", n_stages );

      // Phase 2: move ready tasks from the waiting queue to execution queue.
      for ( i = 0; i < n_threads; i++ )
      {
         // Determine to which queue this thread belongs.
         // NOTE(review): assumes n_threads / n_queues is nonzero -- confirm.
         queue = i / ( n_threads / n_queues );

         // Determine to which cache this thread belongs.
         cache = i / n_cores;

         // Dequeue a task.
         t = FLASH_Queue_wait_dequeue( queue, cache, arg );

         // Save the task for execution.
         exec_array[i] = t;

         if ( t != NULL )
         {
            // Save the thread and cache that executes the task.
            t->thread = i;
            t->cache  = cache;

            // Increment program counter.
            args->pc++;
         }
      }

      // Phase 3: execute independent tasks. Both calls below are also made
      // for slots that received no task (t == NULL).
      for ( i = 0; i < n_threads; i++ )
      {
         t = exec_array[i];
         FLASH_Queue_update_cache( t, arg );
         FLASH_Queue_exec_task( t );

         if ( !verbose )
            printf( "%7s", ( t == NULL ? " " : t->name ) );

         // Free the task if this is the last stage. Phase 1 will not run
         // again once args->pc equals n_tasks, so the task is freed here.
         if ( args->pc == n_tasks && t != NULL )
            FLASH_Task_free( t );
      }

      if ( !verbose )
         printf( "\n" );
   }

   if ( !verbose )
      printf( "\n" );

   // Only the Windows build allocated exec_array on the heap.
#ifdef FLA_ENABLE_WINDOWS_BUILD
   FLA_free( exec_array );
#endif

   return;
}
void FLASH_Queue_exec_task | ( | FLASH_Task * | t | ) |
References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{ // Define local function pointer types. // LAPACK-level typedef FLA_Error(*flash_lu_piv_macro_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl ); typedef FLA_Error(*flash_apply_pivots_macro_p)(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl); typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl); typedef FLA_Error(*flash_lu_piv_copy_p)(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl); typedef FLA_Error(*flash_trsm_piv_p)(FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl); typedef FLA_Error(*flash_sa_lu_p)(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl); typedef FLA_Error(*flash_sa_fs_p)(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl); typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl); typedef FLA_Error(*flash_trinv_p)(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl); typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl); typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl); typedef FLA_Error(*flash_sylv_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl); typedef FLA_Error(*flash_lyap_p)(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl); typedef FLA_Error(*flash_qrut_macro_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl); typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl); typedef FLA_Error(*flash_qrutc_p)(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl); typedef FLA_Error(*flash_qr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl); typedef FLA_Error(*flash_lqut_macro_p)(FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl); typedef FLA_Error(*flash_caqr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl); typedef FLA_Error(*flash_uddateut_p)(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl); typedef FLA_Error(*flash_apqut_p)(FLA_Side side, FLA_Trans trans, 
FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl); typedef FLA_Error(*flash_apq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl); typedef FLA_Error(*flash_apcaq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl); typedef FLA_Error(*flash_apqudut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl); typedef FLA_Error(*flash_eig_gest_p)(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl); // Level-3 BLAS typedef FLA_Error(*flash_gemm_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl); typedef FLA_Error(*flash_hemm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl); typedef FLA_Error(*flash_herk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl); typedef FLA_Error(*flash_her2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl); typedef FLA_Error(*flash_symm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl); typedef FLA_Error(*flash_syrk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl); typedef FLA_Error(*flash_syr2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl); typedef FLA_Error(*flash_trmm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl); typedef 
FLA_Error(*flash_trsm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl); // Level-2 BLAS typedef FLA_Error(*flash_gemv_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl); typedef FLA_Error(*flash_trsv_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl); // Level-1 BLAS typedef FLA_Error(*flash_axpy_p)(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl); typedef FLA_Error(*flash_axpyt_p)(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl); typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl); typedef FLA_Error(*flash_copyt_p)(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl); typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl); typedef FLA_Error(*flash_scal_p)(FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl); typedef FLA_Error(*flash_scalr_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl); // Base typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl); typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl); // Only execute task if it is not NULL. if ( t == NULL ) return; // Now "switch" between the various possible task functions. 
// FLA_LU_piv_macro if ( t->func == (void *) FLA_LU_piv_macro_task ) { flash_lu_piv_macro_p func; func = (flash_lu_piv_macro_p) t->func; func( t->output_arg[0], t->output_arg[1], ( fla_lu_t* ) t->cntl ); } // FLA_Apply_pivots_macro else if ( t->func == (void *) FLA_Apply_pivots_macro_task ) { flash_apply_pivots_macro_p func; func = (flash_apply_pivots_macro_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->input_arg[0], t->output_arg[0], ( fla_appiv_t* ) t->cntl ); } // FLA_LU_piv else if ( t->func == (void *) FLA_LU_piv_task ) { flash_lu_piv_p func; func = (flash_lu_piv_p) t->func; func( t->output_arg[0], t->fla_arg[0], ( fla_lu_t* ) t->cntl ); } // FLA_LU_piv_copy else if ( t->func == (void *) FLA_LU_piv_copy_task ) { flash_lu_piv_copy_p func; func = (flash_lu_piv_copy_p) t->func; func( t->output_arg[0], t->fla_arg[0], t->output_arg[1], ( fla_lu_t* ) t->cntl ); } // FLA_Trsm_piv else if ( t->func == (void *) FLA_Trsm_piv_task ) { flash_trsm_piv_p func; func = (flash_trsm_piv_p) t->func; func( t->input_arg[0], t->output_arg[0], t->fla_arg[0], ( fla_trsm_t* ) t->cntl ); } // FLA_SA_LU else if ( t->func == (void *) FLA_SA_LU_task ) { flash_sa_lu_p func; func = (flash_sa_lu_p) t->func; func( t->output_arg[1], t->output_arg[0], t->fla_arg[0], t->fla_arg[1], t->int_arg[0], ( fla_lu_t* ) t->cntl ); } // FLA_SA_FS else if ( t->func == (void *) FLA_SA_FS_task ) { flash_sa_fs_p func; func = (flash_sa_fs_p) t->func; func( t->fla_arg[0], t->input_arg[0], t->fla_arg[1], t->output_arg[1], t->output_arg[0], t->int_arg[0], ( fla_gemm_t* ) t->cntl ); } // FLA_LU_nopiv else if ( t->func == (void *) FLA_LU_nopiv_task ) { flash_lu_nopiv_p func; func = (flash_lu_nopiv_p) t->func; func( t->output_arg[0], ( fla_lu_t* ) t->cntl ); } // FLA_Trinv else if ( t->func == (void *) FLA_Trinv_task ) { flash_trinv_p func; func = (flash_trinv_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Diag ) t->int_arg[1], t->output_arg[0], ( fla_trinv_t* ) t->cntl ); } // 
FLA_Ttmm else if ( t->func == (void *) FLA_Ttmm_task ) { flash_ttmm_p func; func = (flash_ttmm_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], t->output_arg[0], ( fla_ttmm_t* ) t->cntl ); } // FLA_Chol else if ( t->func == (void *) FLA_Chol_task ) { flash_chol_p func; func = (flash_chol_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], t->output_arg[0], ( fla_chol_t* ) t->cntl ); } // FLA_Sylv else if ( t->func == (void *) FLA_Sylv_task ) { flash_sylv_p func; func = (flash_sylv_p) t->func; func( ( FLA_Trans ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->output_arg[0], t->fla_arg[1], ( fla_sylv_t* ) t->cntl ); } // FLA_Lyap else if ( t->func == (void *) FLA_Lyap_task ) { flash_lyap_p func; func = (flash_lyap_p) t->func; func( ( FLA_Trans ) t->int_arg[0], t->fla_arg[0], t->input_arg[0], t->output_arg[0], t->fla_arg[1], ( fla_lyap_t* ) t->cntl ); } // FLA_QR_UT_macro else if ( t->func == (void *) FLA_QR_UT_macro_task ) { flash_qrut_macro_p func; func = (flash_qrut_macro_p) t->func; func( t->output_arg[0], t->output_arg[1], ( fla_qrut_t* ) t->cntl ); } // FLA_QR_UT else if ( t->func == (void *) FLA_QR_UT_task ) { flash_qrut_p func; func = (flash_qrut_p) t->func; func( t->output_arg[0], t->fla_arg[0], ( fla_qrut_t* ) t->cntl ); } // FLA_QR_UT_copy else if ( t->func == (void *) FLA_QR_UT_copy_task ) { flash_qrutc_p func; func = (flash_qrutc_p) t->func; func( t->output_arg[0], t->fla_arg[0], t->output_arg[1], ( fla_qrut_t* ) t->cntl ); } // FLA_QR2_UT else if ( t->func == (void *) FLA_QR2_UT_task ) { flash_qr2ut_p func; func = (flash_qr2ut_p) t->func; func( t->output_arg[1], t->output_arg[0], t->fla_arg[0], ( fla_qr2ut_t* ) t->cntl ); } // FLA_LQ_UT_macro else if ( t->func == (void *) FLA_LQ_UT_macro_task ) { flash_lqut_macro_p func; func = (flash_lqut_macro_p) t->func; func( t->output_arg[0], t->output_arg[1], ( fla_lqut_t* ) t->cntl ); } // FLA_CAQR2_UT else if ( t->func == (void *) FLA_CAQR2_UT_task ) { flash_caqr2ut_p func; 
func = (flash_caqr2ut_p) t->func; func( t->output_arg[1], t->output_arg[0], t->fla_arg[0], ( fla_caqr2ut_t* ) t->cntl ); } // FLA_UDdate_UT else if ( t->func == (void *) FLA_UDdate_UT_task ) { flash_uddateut_p func; func = (flash_uddateut_p) t->func; func( t->output_arg[0], t->output_arg[1], t->output_arg[2], t->output_arg[3], ( fla_uddateut_t* ) t->cntl ); } // FLA_Apply_Q_UT else if ( t->func == (void *) FLA_Apply_Q_UT_task ) { flash_apqut_p func; func = (flash_apqut_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3], t->input_arg[0], t->fla_arg[0], t->output_arg[1], t->output_arg[0], ( fla_apqut_t* ) t->cntl ); } // FLA_Apply_Q2_UT else if ( t->func == (void *) FLA_Apply_Q2_UT_task ) { flash_apq2ut_p func; func = (flash_apq2ut_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3], t->input_arg[0], t->fla_arg[0], t->output_arg[2], t->output_arg[1], t->output_arg[0], ( fla_apq2ut_t* ) t->cntl ); } // FLA_Apply_CAQ2_UT else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task ) { flash_apcaq2ut_p func; func = (flash_apcaq2ut_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3], t->input_arg[0], t->fla_arg[0], t->output_arg[2], t->output_arg[1], t->output_arg[0], ( fla_apcaq2ut_t* ) t->cntl ); } // FLA_Apply_QUD_UT else if ( t->func == (void *) FLA_Apply_QUD_UT_task ) { flash_apqudut_p func; func = (flash_apqudut_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], ( FLA_Direct ) t->int_arg[2], ( FLA_Store ) t->int_arg[3], t->input_arg[0], t->output_arg[0], t->output_arg[1], t->input_arg[1], t->output_arg[2], t->input_arg[2], t->output_arg[3], ( fla_apqudut_t* ) t->cntl ); } // FLA_Eig_gest else if ( t->func == (void *) FLA_Eig_gest_task ) { flash_eig_gest_p func; func = (flash_eig_gest_p) t->func; func( ( 
FLA_Inv ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1], t->output_arg[1], t->output_arg[0], t->input_arg[0], ( fla_eig_gest_t* ) t->cntl ); } // FLA_Gemm else if ( t->func == (void *) FLA_Gemm_task ) { flash_gemm_p func; func = (flash_gemm_p) t->func; func( ( FLA_Trans ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_gemm_t* ) t->cntl ); } // FLA_Hemm else if ( t->func == (void *) FLA_Hemm_task ) { flash_hemm_p func; func = (flash_hemm_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_hemm_t* ) t->cntl ); } // FLA_Herk else if ( t->func == (void *) FLA_Herk_task ) { flash_herk_p func; func = (flash_herk_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->fla_arg[1], t->output_arg[0], ( fla_herk_t* ) t->cntl ); } // FLA_Her2k else if ( t->func == (void *) FLA_Her2k_task ) { flash_her2k_p func; func = (flash_her2k_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_her2k_t* ) t->cntl ); } // FLA_Symm else if ( t->func == (void *) FLA_Symm_task ) { flash_symm_p func; func = (flash_symm_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_symm_t* ) t->cntl ); } // FLA_Syrk else if ( t->func == (void *) FLA_Syrk_task ) { flash_syrk_p func; func = (flash_syrk_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], t->fla_arg[0], t->input_arg[0], t->fla_arg[1], t->output_arg[0], ( fla_syrk_t* ) t->cntl ); } // FLA_Syr2k else if ( t->func == (void *) FLA_Syr2k_task ) { flash_syr2k_p func; func = (flash_syr2k_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], 
t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_syr2k_t* ) t->cntl ); } // FLA_Trmm else if ( t->func == (void *) FLA_Trmm_task ) { flash_trmm_p func; func = (flash_trmm_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1], ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3], t->fla_arg[0], t->input_arg[0], t->output_arg[0], ( fla_trmm_t* ) t->cntl ); } // FLA_Trsm else if ( t->func == (void *) FLA_Trsm_task ) { flash_trsm_p func; func = (flash_trsm_p) t->func; func( ( FLA_Side ) t->int_arg[0], ( FLA_Uplo ) t->int_arg[1], ( FLA_Trans ) t->int_arg[2], ( FLA_Diag ) t->int_arg[3], t->fla_arg[0], t->input_arg[0], t->output_arg[0], ( fla_trsm_t* ) t->cntl ); } // FLA_Gemv else if ( t->func == (void *) FLA_Gemv_task ) { flash_gemv_p func; func = (flash_gemv_p) t->func; func( ( FLA_Trans ) t->int_arg[0], t->fla_arg[0], t->input_arg[0], t->input_arg[1], t->fla_arg[1], t->output_arg[0], ( fla_gemv_t* ) t->cntl ); } // FLA_Trsv else if ( t->func == (void *) FLA_Trsv_task ) { flash_trsv_p func; func = (flash_trsv_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], ( FLA_Trans ) t->int_arg[1], ( FLA_Diag ) t->int_arg[2], t->input_arg[0], t->output_arg[0], ( fla_trsv_t* ) t->cntl ); } // FLA_Axpy else if ( t->func == (void *) FLA_Axpy_task ) { flash_axpy_p func; func = (flash_axpy_p) t->func; func( t->fla_arg[0], t->input_arg[0], t->output_arg[0], ( fla_axpy_t* ) t->cntl ); } // FLA_Axpyt else if ( t->func == (void *) FLA_Axpyt_task ) { flash_axpyt_p func; func = (flash_axpyt_p) t->func; func( ( FLA_Trans ) t->int_arg[0], t->fla_arg[0], t->input_arg[0], t->output_arg[0], ( fla_axpyt_t* ) t->cntl ); } // FLA_Copy else if ( t->func == (void *) FLA_Copy_task ) { flash_copy_p func; func = (flash_copy_p) t->func; func( t->input_arg[0], t->output_arg[0], ( fla_copy_t* ) t->cntl ); } // FLA_Copyt else if ( t->func == (void *) FLA_Copyt_task ) { flash_copyt_p func; func = (flash_copyt_p) t->func; func( ( FLA_Trans ) 
t->int_arg[0], t->input_arg[0], t->output_arg[0], ( fla_copyt_t* ) t->cntl ); } // FLA_Copyr else if ( t->func == (void *) FLA_Copyr_task ) { flash_copyr_p func; func = (flash_copyr_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], t->input_arg[0], t->output_arg[0], ( fla_copyr_t* ) t->cntl ); } // FLA_Scal else if ( t->func == (void *) FLA_Scal_task ) { flash_scal_p func; func = (flash_scal_p) t->func; func( t->fla_arg[0], t->output_arg[0], ( fla_scal_t* ) t->cntl ); } // FLA_Scalr else if ( t->func == (void *) FLA_Scalr_task ) { flash_scalr_p func; func = (flash_scalr_p) t->func; func( ( FLA_Uplo ) t->int_arg[0], t->fla_arg[0], t->output_arg[0], ( fla_scalr_t* ) t->cntl ); } // FLA_Obj_create_buffer else if ( t->func == (void *) FLA_Obj_create_buffer_task ) { flash_obj_create_buffer_p func; func = (flash_obj_create_buffer_p) t->func; func( ( dim_t ) t->int_arg[0], ( dim_t ) t->int_arg[1], t->output_arg[0], t->cntl ); } // FLA_Obj_free_buffer else if ( t->func == (void *) FLA_Obj_free_buffer_task ) { flash_obj_free_buffer_p func; func = (flash_obj_free_buffer_p) t->func; func( t->output_arg[0], t->cntl ); } else { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } return; }
void FLASH_Queue_finalize | ( | void | ) |
References FLASH_Queue_finalize_gpu().
Referenced by FLA_Finalize().
{
  // Tear down only when the queue is currently flagged as initialized;
  // otherwise this call does nothing.
  if ( flash_queue_initialized != FALSE )
  {
    // Drop the initialized flag before invoking the GPU finalizer.
    flash_queue_initialized = FALSE;

#ifdef FLA_ENABLE_GPU
    FLASH_Queue_finalize_gpu();
#endif
  }

  return;
}
void FLASH_Queue_flush_block_gpu | ( | FLA_Obj | obj, |
int | thread, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu().
{
  // Flush a single block from this thread's GPU table back through
  // FLASH_Queue_read_gpu() when the table holds it and it is marked dirty.
  FLASH_Queue_vars* vars     = ( FLASH_Queue_vars* ) arg;
  dim_t             n_slots  = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first    = thread * n_slots; // Start of this thread's slots.
  FLA_Bool          is_dirty = FALSE;
  FLA_Obj_gpu       snapshot;
  int               slot;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

  // Search this thread's slots for the entry that refers to obj.
  slot = 0;
  while ( slot < n_slots && obj.base != vars->gpu[first + slot].obj.base )
    slot++;

  if ( slot < n_slots )
  {
    // Copy the entry so the transfer below can run without the lock held.
    snapshot = vars->gpu[first + slot];

    // Only a non-empty, dirty entry needs to be flushed.
    is_dirty = ( snapshot.obj.base != NULL && !snapshot.clean );
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

  if ( is_dirty )
  {
    // Perform the transfer outside the critical section.
    FLASH_Queue_read_gpu( snapshot.obj, snapshot.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

    // The table may have changed while unlocked, so look the block up again.
    slot = 0;
    while ( slot < n_slots && obj.base != vars->gpu[first + slot].obj.base )
      slot++;

    if ( slot < n_slots )
    {
      // Record that the block has been flushed.
      vars->gpu[first + slot].clean   = TRUE;
      vars->gpu[first + slot].request = FALSE;
    }

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif
  }

  return;
}
void FLASH_Queue_flush_gpu | ( | int | thread, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_parallel_function().
{
  // Flush every block in this thread's GPU table that is both dirty and
  // requested, using gpu_log as scratch space for the pending transfers.
  FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
  dim_t             n_slots   = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first     = thread * n_slots; // Start of this thread's slots.
  int               n_pending = 0;
  FLA_Obj_gpu       entry;
  int               idx, slot;

  // Nothing to do when the GPU is not in use.
  if ( !FLASH_Queue_get_enabled_gpu() )
    return;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

  // Phase 1: under the lock, collect the entries that must be flushed.
  for ( slot = 0; slot < n_slots; slot++ )
  {
    entry = vars->gpu[first + slot];

    // Skip empty, clean, or unrequested entries.
    if ( entry.obj.base == NULL || entry.clean || !entry.request )
      continue;

    // Log the entry so it can be transferred outside the critical section.
    vars->gpu_log[first + n_pending] = entry;
    n_pending++;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

  // Exit early if a flush is not required.
  if ( n_pending == 0 )
    return;

  // Phase 2: transfer each logged block without holding the lock.
  for ( idx = 0; idx < n_pending; idx++ )
  {
    entry = vars->gpu_log[first + idx];

    FLASH_Queue_read_gpu( entry.obj, entry.buffer_gpu );
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

  // Phase 3: mark each flushed block that is still present in the table.
  for ( idx = 0; idx < n_pending; idx++ )
  {
    // Find the current position of the logged block.
    slot = 0;
    while ( slot < n_slots &&
            vars->gpu_log[first + idx].obj.base != vars->gpu[first + slot].obj.base )
      slot++;

    if ( slot < n_slots )
    {
      // The block is now clean.
      vars->gpu[first + slot].clean   = TRUE;
      vars->gpu[first + slot].request = FALSE;
    }
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

  return;
}
dim_t FLASH_Queue_get_block_size | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of flash_queue_block_size, a variable declared
// outside this function.
return flash_queue_block_size;
}
dim_t FLASH_Queue_get_cache_line_size | ( | void | ) |
Referenced by FLASH_Queue_prefetch_block().
{
// Report the current value of flash_queue_cache_line_size, a variable
// declared outside this function.
return flash_queue_cache_line_size;
}
dim_t FLASH_Queue_get_cache_size | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of flash_queue_cache_size, a variable declared
// outside this function.
return flash_queue_cache_size;
}
FLA_Bool FLASH_Queue_get_caching | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_wait_dequeue(), and FLASH_Task_update_dependencies().
{
// Report the current value of the flash_queue_caching flag, a variable
// declared outside this function.
return flash_queue_caching;
}
int FLASH_Queue_get_cores_per_cache | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
// Report the current value of flash_queue_cores_per_cache, a variable
// declared outside this function.
return flash_queue_cores_per_cache;
}
int FLASH_Queue_get_cores_per_queue | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of flash_queue_cores_per_queue, a variable
// declared outside this function.
return flash_queue_cores_per_queue;
}
Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
{
// Report the current value of flash_queue_data_affinity, a variable declared
// outside this function.
return flash_queue_data_affinity;
}
FLA_Bool FLASH_Queue_get_enabled | ( | void | ) |
Referenced by FLA_Apply_CAQ2_UT_internal(), FLA_Apply_pivots_internal(), FLA_Apply_Q2_UT_internal(), FLA_Apply_Q_UT_internal(), FLA_Apply_QUD_UT_internal(), FLA_Axpy_internal(), FLA_Axpyt_internal(), FLA_CAQR2_UT_internal(), FLA_Chol_internal(), FLA_Copy_internal(), FLA_Copyr_internal(), FLA_Copyt_internal(), FLA_Eig_gest_internal(), FLA_Gemm_internal(), FLA_Gemv_internal(), FLA_Hemm_internal(), FLA_Her2k_internal(), FLA_Herk_internal(), FLA_LQ_UT_internal(), FLA_LU_nopiv_internal(), FLA_LU_piv_internal(), FLA_Lyap_internal(), FLA_QR2_UT_internal(), FLA_QR_UT_copy_internal(), FLA_QR_UT_internal(), FLA_Scal_internal(), FLA_Scalr_internal(), FLA_Sylv_internal(), FLA_Symm_internal(), FLA_Syr2k_internal(), FLA_Syrk_internal(), FLA_Trinv_internal(), FLA_Trmm_internal(), FLA_Trsm_internal(), FLA_Trsv_internal(), FLA_Ttmm_internal(), FLA_UDdate_UT_internal(), FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_LU_incpiv_var1(), FLASH_LU_incpiv_var2(), FLASH_Queue_enable_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_SA_FS(), FLASH_SA_LU(), FLASH_Scal(), FLASH_Scalr(), FLASH_Trsm_piv(), and FLASH_Trsv().
{
  // Without SuperMatrix support compiled in, the queue is never reported as
  // enabled; otherwise the stored flag is returned.
#ifndef FLA_ENABLE_SUPERMATRIX
  return FALSE;
#else
  return flash_queue_enabled;
#endif
}
FLASH_Task* FLASH_Queue_get_head_task | ( | void | ) |
References FLASH_Queue_s::head.
Referenced by FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
unsigned int FLASH_Queue_get_num_tasks | ( | void | ) |
References FLASH_Queue_s::n_tasks.
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
unsigned int FLASH_Queue_get_num_threads | ( | void | ) |
Referenced by FLASH_Queue_check_gpu(), FLASH_Queue_exec(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_update_gpu(), FLASH_Queue_verbose_output(), FLASH_Task_free_parallel(), and FLASH_Task_update_dependencies().
{
// Report the current value of flash_queue_n_threads, a variable declared
// outside this function.
return flash_queue_n_threads;
}
double FLASH_Queue_get_parallel_time | ( | void | ) |
{
  // The recorded parallel time is reported only while the queue stack depth
  // is zero (i.e. outside a parallel region); any other depth yields 0.0.
  return ( flash_queue_stack == 0 ) ? flash_queue_parallel_time : 0.0;
}
FLA_Bool FLASH_Queue_get_sorting | ( | void | ) |
Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().
{
// Report the current value of the flash_queue_sorting flag, a variable
// declared outside this function.
return flash_queue_sorting;
}
FLASH_Task* FLASH_Queue_get_tail_task | ( | void | ) |
double FLASH_Queue_get_total_time | ( | void | ) |
{
  // The recorded total time is reported only while the queue stack depth is
  // zero (i.e. outside a parallel region); any other depth yields 0.0.
  return ( flash_queue_stack == 0 ) ? flash_queue_total_time : 0.0;
}
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().
{
// Report the current value of flash_queue_verbose, a variable declared
// outside this function.
return flash_queue_verbose;
}
FLA_Bool FLASH_Queue_get_work_stealing | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().
{
// Report the current value of the flash_queue_work_stealing flag, a variable
// declared outside this function.
return flash_queue_work_stealing;
}
void FLASH_Queue_init | ( | void | ) |
References FLASH_Queue_init_gpu(), and FLASH_Queue_reset().
Referenced by FLA_Init().
{
  // Initialize at most once: everything is skipped when the flag is
  // already TRUE.
  if ( flash_queue_initialized != TRUE )
  {
    // Restore all queue settings to their initial values.
    FLASH_Queue_reset();

    // Record that initialization has taken place.
    flash_queue_initialized = TRUE;

#ifdef FLA_ENABLE_GPU
    FLASH_Queue_init_gpu();
#endif
  }

  return;
}
void FLASH_Queue_init_tasks | ( | void * | arg | ) |
References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.
Referenced by FLASH_Queue_exec().
{
  // Walk the task list from tail to head to fill task_queue, compute each
  // task's height and ready count, then (owner only) enqueue the tasks that
  // are already ready.
  FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
  int               n_tasks   = FLASH_Queue_get_num_tasks();
  int               n_pending = 0; // Tasks found with a ready count of zero.
  int               pos, dep_idx;
  int               level;
  FLASH_Task*       cur;
  FLASH_Dep*        dep;

  // Start from the tail so dependent tasks are visited before the task
  // they depend on.
  cur = FLASH_Queue_get_tail_task();
  pos = n_tasks - 1;

  while ( pos >= 0 )
  {
    // Save the task pointer at its position in the queue.
    vars->task_queue[pos] = cur;

    // Only a single queue implementation is used.
    cur->queue = 0;

    // Height is one more than the tallest task on this task's dependence list.
    level = 0;
    for ( dep_idx = 0, dep = cur->dep_arg_head;
          dep_idx < cur->n_dep_args;
          dep_idx++, dep = dep->next_dep )
      level = max( level, dep->task->height );
    level = level + 1;

    // Since freeing a task is always a leaf, force it to execute earlier by
    // giving it a greater height in order to reclaim memory.
    if ( cur->func == (void *) FLA_Obj_free_buffer_task )
      level += n_tasks;

    cur->height = level;

    // Fold the operand counts into the ready counter; zero means ready.
    cur->n_ready += cur->n_input_args + cur->n_output_args +
                    cur->n_macro_args + cur->n_war_args;

    if ( cur->n_ready == 0 )
      n_pending++;

    // Only the owner records the ready values.
    if ( FLA_is_owner() )
      vars->n_ready[pos] = cur->n_ready;

    // Step to the previous task.
    cur = cur->prev_task;
    pos--;
  }

  // Only the owner enqueues the initial ready tasks.
  if ( !FLA_is_owner() )
    return;

  // Scan forward from the head until every ready task has been enqueued.
  cur = FLASH_Queue_get_head_task();
  pos = 0;

  while ( pos < n_tasks && n_pending > 0 )
  {
    if ( cur->n_ready == 0 )
    {
      RCCE_acquire_lock( 0 );

      // Enqueue the ready and available task.
      FLASH_Queue_wait_enqueue( cur, arg );

      RCCE_release_lock( 0 );

      // One fewer ready task left to enqueue.
      n_pending--;
    }

    // Step to the next task.
    cur = cur->next_task;
    pos++;
  }

  return;
}
void FLASH_Queue_invalidate_block_gpu | ( | FLA_Obj | obj, |
int | thread, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().
{
  // Remove obj from this thread's GPU table, if present, and move the
  // emptied entry to the last slot of the table.
  FLASH_Queue_vars* vars    = ( FLASH_Queue_vars* ) arg;
  dim_t             n_slots = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first   = thread * n_slots; // Start of this thread's slots.
  FLA_Obj_gpu       evicted;
  int               pos, slot;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(vars->gpu_lock[thread]) ); // G ***
#endif

  // Search this thread's slots for the entry that refers to obj.
  slot = 0;
  while ( slot < n_slots && obj.base != vars->gpu[first + slot].obj.base )
    slot++;

  if ( slot < n_slots )
  {
    // Take a copy of the entry and reset it to the empty, clean state.
    evicted          = vars->gpu[first + slot];
    evicted.obj.base = NULL;
    evicted.clean    = TRUE;
    evicted.request  = FALSE;

    // Close the gap by shifting every later entry up one position.
    pos = slot;
    while ( pos < n_slots - 1 )
    {
      vars->gpu[first + pos] = vars->gpu[first + pos + 1];
      pos++;
    }

    // Park the emptied entry in the last (LRU) slot.
    vars->gpu[first + n_slots - 1] = evicted;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(vars->gpu_lock[thread]) ); // G ***
#endif

  return;
}
void FLASH_Queue_mark_gpu | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  // Mark every distinct output block of task t as dirty in the GPU table of
  // the thread recorded in t->thread.
  FLASH_Queue_vars* vars    = ( FLASH_Queue_vars* ) arg;
  int               owner   = t->thread;
  dim_t             n_slots = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first   = owner * n_slots; // Start of the owner's slots.
  FLA_Obj           block;
  int               out, prev, slot;

  // Visit the output operands from last to first.
  for ( out = t->n_output_args - 1; out >= 0; out-- )
  {
    block = t->output_arg[out];

    // An operand that also appears at a lower index is handled when that
    // lower index is reached, so skip it here.
    prev = 0;
    while ( prev < out && block.base != t->output_arg[prev].base )
      prev++;

    if ( prev < out )
      continue;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(vars->gpu_lock[owner]) ); // G ***
#endif

    // Search the owner's slots for the entry that refers to this block.
    slot = 0;
    while ( slot < n_slots && block.base != vars->gpu[first + slot].obj.base )
      slot++;

    if ( slot < n_slots )
    {
      // Change the bits for the new dirty block.
      vars->gpu[first + slot].clean   = FALSE;
      vars->gpu[first + slot].request = FALSE;
    }

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(vars->gpu_lock[owner]) ); // G ***
#endif
  }

  return;
}
void FLASH_Queue_prefetch | ( | int | cache, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
  // Prefetch every valid block listed in args->prefetch and record it in the
  // slice of args->cache that belongs to the given cache index.
  FLASH_Queue_vars* vars    = ( FLASH_Queue_vars* ) arg;
  int               n_slots = vars->size;
  int               slot    = n_slots;
  FLA_Obj           block;

  // Walk the list from last to first to maintain LRU order.
  while ( --slot >= 0 )
  {
    block = vars->prefetch[slot];

    // Entries without a base object are not valid blocks.
    if ( block.base == NULL )
      continue;

    // Prefetch the block.
    FLASH_Queue_prefetch_block( block );

    // Record the prefetched block in the cache.
    vars->cache[cache * n_slots + slot] = block;
  }

  return;
}
void FLASH_Queue_prefetch_block | ( | FLA_Obj | obj | ) |
References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), scomplex::real, and dcomplex::real.
Referenced by FLASH_Queue_prefetch().
{
  // Touch one element per cache line of the block so that its data is pulled
  // into cache.  The walk covers length * width elements starting at the
  // object's typed buffer pointer.
  // NOTE(review): the column stride is not consulted, so this presumably
  // expects the block's elements to be stored contiguously -- confirm.
  int          i, inc, n_elem;
  int          line_size = FLASH_Queue_get_cache_line_size();
  int          elem_size = FLA_Obj_elem_size( obj );
  int          length    = FLA_Obj_length( obj );
  int          width     = FLA_Obj_width( obj );
  FLA_Datatype datatype  = FLA_Obj_datatype( obj );

  // Determine the stride (in elements) used to step through the block.
  // Guard against a zero element size, and clamp the stride to at least one
  // element: if the cache line is smaller than an element the quotient is 0,
  // which would otherwise make the loops below spin forever.
  inc = ( elem_size > 0 ) ? line_size / elem_size : 1;
  if ( inc < 1 )
    inc = 1;

  // Number of elements to walk over; hoisted out of the loop conditions.
  n_elem = length * width;

  // NOTE(review): the trailing "+= 1.0" updates are kept from the original as
  // an attempt to keep the loads alive; an optimizing compiler may still
  // remove them since the result is never observed -- confirm.

  // Switch between the different datatypes.
  switch ( datatype )
  {
    case FLA_FLOAT:
    {
      float* buffer = ( float * ) FLA_FLOAT_PTR( obj );
      float  access = 0.0F; // Initialized so an empty block reads no garbage.

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1.0;

      break;
    }
    case FLA_DOUBLE:
    {
      double* buffer = ( double * ) FLA_DOUBLE_PTR( obj );
      double  access = 0.0; // Initialized so an empty block reads no garbage.

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1.0;

      break;
    }
    case FLA_COMPLEX:
    {
      scomplex* buffer = ( scomplex * ) FLA_COMPLEX_PTR( obj );
      scomplex  access;

      // Give the field updated below a defined value for empty blocks.
      access.real = 0;

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access.real += 1.0;

      break;
    }
    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex* buffer = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( obj );
      dcomplex  access;

      // Give the field updated below a defined value for empty blocks.
      access.real = 0;

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access.real += 1.0;

      break;
    }
    case FLA_INT:
    {
      int* buffer = ( int * ) FLA_INT_PTR( obj );
      int  access = 0; // Initialized so an empty block reads no garbage.

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        access = buffer[i];

      // Prevent dead code elimination.
      access += 1.0;

      break;
    }
    default:
      // This default case should never execute.
      FLA_Check_error_code( FLA_INVALID_DATATYPE );
  }

  return;
}
void FLASH_Queue_push | ( | void * | func, |
void * | cntl, | ||
char * | name, | ||
FLA_Bool | enabled_gpu, | ||
int | n_int_args, | ||
int | n_fla_args, | ||
int | n_input_args, | ||
int | n_output_args, | ||
... | |||
) |
References FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Queue_push_output(), FLASH_Task_alloc(), FLASH_Queue_s::head, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::queue, FLASH_Queue_s::tail, and FLA_Obj_struct::write_task.
{
  // Build a task from the variadic operands, run dependence analysis on its
  // input and output operands, and append it to the tail of the task queue.
  int         idx;
  va_list     ap;
  FLASH_Task* new_task;
  FLA_Obj     operand;

  // Allocate a new task and populate its fields with the fixed arguments.
  new_task = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
                               n_int_args, n_fla_args,
                               n_input_args, n_output_args );

  // Begin reading the variable arguments; va_start() takes the last named
  // parameter, i.e. the one immediately preceding the "..." in the signature.
  va_start( ap, n_output_args );

  // The variadic arguments arrive in this order: integers, plain FLA_Obj
  // arguments, input operands, output operands.
  for ( idx = 0; idx < n_int_args; idx++ )
    new_task->int_arg[idx] = va_arg( ap, int );

  for ( idx = 0; idx < n_fla_args; idx++ )
    new_task->fla_arg[idx] = va_arg( ap, FLA_Obj );

  // Input operands.
  for ( idx = 0; idx < n_input_args; idx++ )
  {
    operand = va_arg( ap, FLA_Obj );

    new_task->input_arg[idx] = operand;

    if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
    {
      // Regular block: dependence analysis for the operand itself.
      FLASH_Queue_push_input( operand, new_task );
    }
    else
    {
      // Macroblock: dependence analysis for each block it contains.
      dim_t    row, col;
      dim_t    n_rows = FLA_Obj_length( operand );
      dim_t    n_cols = FLA_Obj_width( operand );
      dim_t    stride = FLA_Obj_col_stride( operand );
      FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );

      for ( col = 0; col < n_cols; col++ )
        for ( row = 0; row < n_rows; row++ )
          FLASH_Queue_push_input( blocks[col * stride + row], new_task );

      // Count the blocks of the macroblock minus one, since the operand is
      // already counted once through n_input_args.
      new_task->n_macro_args += n_rows * n_cols - 1;
    }
  }

  // Output operands.
  for ( idx = 0; idx < n_output_args; idx++ )
  {
    operand = va_arg( ap, FLA_Obj );

    new_task->output_arg[idx] = operand;

    // Data affinity is assigned from the first output operand only, and
    // before that operand goes through output dependence analysis.
    if ( idx == 0 )
    {
      FLA_Obj lead = operand;

      // For a macroblock, use its top left block.
      if ( FLA_Obj_elemtype( operand ) == FLA_MATRIX )
        lead = *FLASH_OBJ_PTR_AT( operand );

      if ( lead.base->write_task != NULL )
        new_task->queue = lead.base->write_task->queue;
      else
        new_task->queue = flash_queue_n_write_blocks;
    }

    if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
    {
      // Regular block: dependence analysis for the operand itself.
      FLASH_Queue_push_output( operand, new_task );
    }
    else
    {
      // Macroblock: dependence analysis for each block it contains.
      dim_t    row, col;
      dim_t    n_rows = FLA_Obj_length( operand );
      dim_t    n_cols = FLA_Obj_width( operand );
      dim_t    stride = FLA_Obj_col_stride( operand );
      FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );

      for ( col = 0; col < n_cols; col++ )
        for ( row = 0; row < n_rows; row++ )
          FLASH_Queue_push_output( blocks[col * stride + row], new_task );

      // Count the blocks of the macroblock minus one, since the operand is
      // already counted once through n_output_args.
      new_task->n_macro_args += n_rows * n_cols - 1;
    }
  }

  // Finished with the variable arguments.
  va_end( ap );

  // Append the task to the queue.
  if ( _tq.n_tasks != 0 )
  {
    // Link after the current tail; the task's order is one past the tail's.
    new_task->prev_task = _tq.tail;
    new_task->order     = _tq.tail->order + 1;
    _tq.tail->next_task = new_task;
    _tq.tail            = new_task;
  }
  else
  {
    // Empty queue: the task is both head and tail.
    _tq.head = new_task;
    _tq.tail = new_task;
  }

  // Increment the number of tasks.
  _tq.n_tasks++;

  return;
}
void FLASH_Queue_push_input | ( | FLA_Obj | obj, |
FLASH_Task * | t | ||
) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
  // Registers task t as a reader of block obj: t is linked behind the task
  // that last wrote the block (if one is recorded) and is appended to the
  // block's list of reading tasks.
  FLASH_Task* writer = obj.base->write_task;
  FLASH_Dep*  dep;

  if ( writer != NULL )
  {
    // Flow dependence: record t in the dependence list of the last writer.
    dep = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

    dep->task     = t;
    dep->next_dep = NULL;

    // Append to the tail; an empty list also needs its head set.
    if ( writer->n_dep_args == 0 )
      writer->dep_arg_head = dep;
    else
      writer->dep_arg_tail->next_dep = dep;

    writer->dep_arg_tail = dep;
    writer->n_dep_args++;
  }
  else
  {
    // No writer is recorded for this block, so no flow dependence is added;
    // only the n_ready count of t is adjusted.
    t->n_ready--;

    // A block that has been neither written nor read so far receives the
    // next read-block id (the id is used later when freeing).
    if ( obj.base->n_read_tasks == 0 )
    {
      obj.base->n_read_blocks = flash_queue_n_read_blocks;
      flash_queue_n_read_blocks++;
    }
  }

  // Append t to the block's reader list unless t is already the most recent
  // reader. These entries may later become anti-dependencies.
  if ( obj.base->n_read_tasks == 0 || obj.base->read_task_tail->task != t )
  {
    dep = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

    dep->task     = t;
    dep->next_dep = NULL;

    if ( obj.base->n_read_tasks == 0 )
      obj.base->read_task_head = dep;
    else
      obj.base->read_task_tail->next_dep = dep;

    obj.base->read_task_tail = dep;
    obj.base->n_read_tasks++;
  }
}
void FLASH_Queue_push_output | ( | FLA_Obj | obj, |
FLASH_Task * | t | ||
) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_free(), FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
  // Registers task t as the writer of block obj: adds a dependence on the
  // previous writer (if it is a different task), turns every recorded reader
  // into an anti-dependence on t, and finally records t as the last writer.
  FLASH_Task* prior;
  FLASH_Dep*  dep;
  FLASH_Dep*  following;
  int         i;

  if ( obj.base->write_task == NULL )
  {
    // First write to this block.
    t->n_ready--;

    // Remember the index at which this output block was first encountered,
    // then advance the count of written blocks.
    obj.base->n_write_blocks = flash_queue_n_write_blocks;
    flash_queue_n_write_blocks++;

    // A block never read before also receives a read-block id for freeing.
    if ( obj.base->n_read_tasks == 0 )
    {
      obj.base->n_read_blocks = flash_queue_n_read_blocks;
      flash_queue_n_read_blocks++;
    }
  }
  else if ( obj.base->write_task == t )
  {
    // t already wrote this block earlier in its own argument list; there is
    // no need to notify the task twice.
    t->n_ready--;
  }
  else
  {
    // A different task wrote the block last: t depends on it.
    prior = obj.base->write_task;
    dep   = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

    dep->task     = t;
    dep->next_dep = NULL;

    if ( prior->n_dep_args == 0 )
      prior->dep_arg_head = dep;
    else
      prior->dep_arg_tail->next_dep = dep;

    prior->dep_arg_tail = dep;
    prior->n_dep_args++;
  }

  // Walk the recorded readers. Each list node is either moved onto the
  // reader's dependence list (anti-dependence on t) or released.
  dep = obj.base->read_task_head;

  for ( i = 0; i < obj.base->n_read_tasks; i++ )
  {
    prior     = dep->task;
    following = dep->next_dep;

    if ( prior == t )
    {
      // t read the block itself; no dependence is needed.
      FLA_free( dep );
    }
    else
    {
      // Reuse the node as the dependence entry "prior -> t".
      dep->task     = t;
      dep->next_dep = NULL;

      if ( prior->n_dep_args == 0 )
        prior->dep_arg_head = dep;
      else
        prior->dep_arg_tail->next_dep = dep;

      prior->dep_arg_tail = dep;
      prior->n_dep_args++;

      t->n_war_args++;
    }

    dep = following;
  }

  // The reader list is now empty, ready for the next set of reads.
  obj.base->n_read_tasks   = 0;
  obj.base->read_task_head = NULL;
  obj.base->read_task_tail = NULL;

  // Record this task as the last to write to this block.
  obj.base->write_task = t;
}
void FLASH_Queue_reset | ( | void | ) |
References FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_init().
void FLASH_Queue_set_block_size | ( | dim_t | size | ) |
Referenced by FLASH_Obj_create_hierarchy().
{
  // The stored block size only ever grows: a request that is not strictly
  // larger than the current value is ignored.
  if ( size > flash_queue_block_size )
  {
    flash_queue_block_size = size;
  }
}
void FLASH_Queue_set_cache_line_size | ( | dim_t | size | ) |
{
  // Overwrite the stored cache line size with the caller's value.
  flash_queue_cache_line_size = size;
}
void FLASH_Queue_set_cache_size | ( | dim_t | size | ) |
{
  // Overwrite the stored cache size with the caller's value.
  flash_queue_cache_size = size;
}
void FLASH_Queue_set_caching | ( | FLA_Bool | caching | ) |
Referenced by FLASH_Queue_exec().
{
  // Overwrite the stored caching flag with the caller's value.
  flash_queue_caching = caching;
}
void FLASH_Queue_set_cores_per_cache | ( | int | cores | ) |
{
  // Overwrite the stored cores-per-cache count with the caller's value.
  flash_queue_cores_per_cache = cores;
}
void FLASH_Queue_set_cores_per_queue | ( | int | cores | ) |
{
  // Overwrite the stored cores-per-queue count with the caller's value.
  flash_queue_cores_per_queue = cores;
}
void FLASH_Queue_set_data_affinity | ( | FLASH_Data_aff | data_affinity | ) |
Referenced by FLASH_Queue_exec().
{
  // Overwrite the stored data affinity mode with the caller's value.
  flash_queue_data_affinity = data_affinity;
}
void FLASH_Queue_set_num_threads | ( | unsigned int | n_threads | ) |
References FLA_Check_num_threads().
{
  // Validate the requested thread count and report any error through the
  // library's error-code check before storing it.
  FLA_Error e_val = FLA_Check_num_threads( n_threads );

  FLA_Check_error_code( e_val );

  // Keep track of the number of threads internally.
  flash_queue_n_threads = n_threads;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP

  // Nothing more to do for OpenMP: the thread count is applied at the
  // parallel for loop through a num_threads() clause. The global OpenMP
  // thread count (OMP_NUM_THREADS or omp_set_num_threads()) therefore stays
  // independent of the number of SuperMatrix threads.

#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS

  // Nothing more to do for pthreads: the number of threads is handled
  // entirely on our end.

#endif
}
void FLASH_Queue_set_parallel_time | ( | double | dtime | ) |
Referenced by FLASH_Queue_exec().
{
  // Overwrite the stored parallel execution time with the caller's value.
  flash_queue_parallel_time = dtime;
}
void FLASH_Queue_set_sorting | ( | FLA_Bool | sorting | ) |
{
  // Overwrite the stored sorting flag with the caller's value.
  flash_queue_sorting = sorting;
}
void FLASH_Queue_set_verbose_output | ( | FLASH_Verbose | verbose | ) |
{
  // Overwrite the stored verbose output mode with the caller's value.
  flash_queue_verbose = verbose;
}
void FLASH_Queue_set_work_stealing | ( | FLA_Bool | work_stealing | ) |
Referenced by FLASH_Queue_exec().
{
  // Overwrite the stored work-stealing flag with the caller's value.
  flash_queue_work_stealing = work_stealing;
}
unsigned int FLASH_Queue_stack_depth | ( | void | ) |
Referenced by FLASH_Eig_gest(), FLASH_LU_incpiv(), FLASH_QR_UT_inc(), FLASH_Queue_disable_gpu(), and FLASH_Queue_enable_gpu().
{
  // Report the current value of the stack counter that
  // FLASH_Queue_begin() increments.
  unsigned int depth = flash_queue_stack;

  return depth;
}
void FLASH_Queue_update_block_gpu | ( | FLA_Obj | obj, |
void ** | buffer_gpu, | ||
int | thread, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_update_gpu().
{
  // Makes block obj the most recently used entry of this thread's GPU block
  // table and returns its GPU buffer through buffer_gpu[0]. On a miss the
  // last table entry is replaced; a dirty replaced block is read back from
  // the GPU and the new block is written to it, both outside the lock.
  FLASH_Queue_vars* args         = ( FLASH_Queue_vars* ) arg;
  dim_t             gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
  FLA_Bool          transfer     = FALSE;
  FLA_Bool          evict        = FALSE;
  FLA_Obj_gpu*      table;
  FLA_Obj_gpu       evict_obj;
  FLA_Obj_gpu       gpu_obj;
  int               j, k;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // First entry of the table section that belongs to this thread.
  table = &args->gpu[thread * gpu_n_blocks];

  // Locate the position of the block on GPU. The search stops at the last
  // entry, which therefore serves as the replacement slot on a miss.
  for ( k = 0; k < gpu_n_blocks - 1; k++ )
  {
    if ( obj.base == table[k].obj.base )
      break;
  }

  // Save the pointer to the data on the GPU.
  *buffer_gpu = table[k].buffer_gpu;

  // Save the current occupant of the slot (the potential victim).
  evict_obj = table[k];

  // The block is not already in the GPU.
  if ( obj.base != table[k].obj.base )
  {
    // The data transfer itself happens outside of the critical section.
    transfer = TRUE;

    // A non-empty, dirty occupant must be flushed, also outside of the
    // critical section; publish it as the victim meanwhile.
    if ( evict_obj.obj.base != NULL && !evict_obj.clean )
    {
      evict = TRUE;
      args->victim[thread] = evict_obj;
    }

    // Install the new block, marked clean and not requested.
    table[k].obj     = obj;
    table[k].clean   = TRUE;
    table[k].request = FALSE;
  }

  // Entry that is either the hit or the freshly installed block.
  gpu_obj = table[k];

  // Shift the entries in front of it down by one (LRU order) ...
  for ( j = k; j > 0; j-- )
    table[j] = table[j - 1];

  // ... and place the block at the front as the most recently used.
  table[0] = gpu_obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Evict and flush the LRU dirty block.
  if ( evict )
  {
    FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

    // The flush is done; clear the published victim.
    args->victim[thread].obj.base = NULL;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
  }

  // Move the block to the GPU.
  if ( transfer )
  {
    FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
  }
}
void FLASH_Queue_update_cache | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
  // Updates the cache model for every distinct block touched by task t:
  // input blocks first (flagged as not output), then output blocks (flagged
  // as output). A NULL task is ignored.
  FLA_Obj  obj;
  FLA_Bool duplicate;
  int      i, j;

  if ( t == NULL )
    return;

  // Updating the input blocks, last argument first.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    // An input is a duplicate if it matches any output argument ...
    duplicate = FALSE;

    for ( j = 0; j < t->n_output_args; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
      {
        duplicate = TRUE;
        break;
      }
    }

    // ... or any input argument with a smaller index.
    if ( !duplicate )
    {
      for ( j = 0; j < i; j++ )
      {
        if ( t->input_arg[i].base == t->input_arg[j].base )
        {
          duplicate = TRUE;
          break;
        }
      }
    }

    if ( duplicate )
      continue;

    obj = t->input_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block.
      FLASH_Queue_update_cache_block( obj, t->cache, FALSE, arg );
    }
    else
    {
      // Macroblock: visit every block it contains, column by column.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
        for ( kk = 0; kk < m; kk++ )
          FLASH_Queue_update_cache_block( buf[ jj * cs + kk ],
                                          t->cache, FALSE, arg );
    }
  }

  // Updating the output blocks, last argument first.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    // An output is a duplicate if it matches an output with a smaller index.
    duplicate = FALSE;

    for ( j = 0; j < i; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
      {
        duplicate = TRUE;
        break;
      }
    }

    if ( duplicate )
      continue;

    obj = t->output_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block.
      FLASH_Queue_update_cache_block( obj, t->cache, TRUE, arg );
    }
    else
    {
      // Macroblock: visit every block it contains, column by column.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
        for ( kk = 0; kk < m; kk++ )
          FLASH_Queue_update_cache_block( buf[ jj * cs + kk ],
                                          t->cache, TRUE, arg );
    }
  }
}
void FLASH_Queue_update_cache_block | ( | FLA_Obj | obj, |
int | cache, | ||
FLA_Bool | output, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_update_cache().
{
  // Moves block obj to the front (most recently used position) of the given
  // cache's block list. When the block is an output, it is additionally
  // removed from the lists of all other caches (write invalidate).
  FLASH_Queue_vars* args     = ( FLASH_Queue_vars* ) arg;
  int               n_caches = args->n_caches;
  int               size     = args->size;
  FLA_Obj*          line;
  int               i, j, k;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif

  // First entry of the list that belongs to this cache.
  line = &args->cache[cache * size];

  // Locate the position of the block in the cache. The search stops at the
  // last entry, so on a miss the last entry is the one that gets dropped.
  for ( k = 0; k < size - 1; k++ )
  {
    if ( obj.base == line[k].base )
      break;
  }

  // Shift the entries in front of that position for LRU replacement.
  for ( j = k; j > 0; j-- )
    line[j] = line[j - 1];

  // Place the block on the cache as the most recently used.
  line[0] = obj;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

  // Only output blocks trigger the write invalidate below.
  if ( !output )
    return;

  for ( i = 0; i < n_caches; i++ )
  {
    if ( i == cache )
      continue;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(args->cac_lock[i]) ); // C ***
#endif

    line = &args->cache[i * size];

    // Locate the position of the block in the other cache.
    for ( k = 0; k < size; k++ )
    {
      if ( obj.base == line[k].base )
        break;
    }

    // The block is held by the other cache.
    if ( k < size )
    {
      // Close the gap left by the invalidated block ...
      for ( j = k; j < size - 1; j++ )
        line[j] = line[j + 1];

      // ... and mark the freed last entry as empty.
      line[size - 1].base = NULL;
    }

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(args->cac_lock[i]) ); // C ***
#endif
  }
}
void FLASH_Queue_update_gpu | ( | FLASH_Task * | t, |
void ** | input_arg, | ||
void ** | output_arg, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  // Brings every distinct input and output block of task t onto the GPU of
  // the task's thread and fills input_arg[] / output_arg[] with the GPU
  // buffer of each argument. Duplicate arguments share the buffer of the
  // argument they duplicate.
  //
  // None of the arguments can be macroblocks yet. The complicating factor
  // is copying a macroblock to contiguous memory on the GPU.
  int      thread    = t->thread;
  int      n_threads = FLASH_Queue_get_num_threads();
  FLA_Bool duplicate;
  int      i, j, k;

  // Bring the input arguments to the GPU.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    // An input is a duplicate if it matches any output argument ...
    duplicate = FALSE;

    for ( j = 0; j < t->n_output_args; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
      {
        duplicate = TRUE;
        break;
      }
    }

    // ... or any input argument with a smaller index.
    if ( !duplicate )
    {
      for ( j = 0; j < i; j++ )
      {
        if ( t->input_arg[i].base == t->input_arg[j].base )
        {
          duplicate = TRUE;
          break;
        }
      }
    }

    if ( duplicate )
    {
      // Resolved in the fix-up pass below.
      input_arg[i] = NULL;
    }
    else
    {
      FLASH_Queue_update_block_gpu( t->input_arg[i], &input_arg[i],
                                    thread, arg );
    }
  }

  // Bring the output arguments to the GPU.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    // An output is a duplicate if it matches an output with a smaller index.
    duplicate = FALSE;

    for ( j = 0; j < i; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
      {
        duplicate = TRUE;
        break;
      }
    }

    if ( duplicate )
    {
      // Resolved in the fix-up pass below.
      output_arg[i] = NULL;
    }
    else
    {
      FLASH_Queue_update_block_gpu( t->output_arg[i], &output_arg[i],
                                    thread, arg );

      // Invalidate output blocks on all other GPUs.
      for ( k = 0; k < n_threads; k++ )
      {
        if ( k != thread )
          FLASH_Queue_invalidate_block_gpu( t->output_arg[i], k, arg );
      }
    }
  }

  // Fix-up pass for inputs: an entry still NULL takes the buffer of a
  // matching output, or else of a matching input with a smaller index.
  for ( i = t->n_input_args - 1; i >= 0; i-- )
  {
    for ( j = 0; j < t->n_output_args && input_arg[i] == NULL; j++ )
    {
      if ( t->input_arg[i].base == t->output_arg[j].base )
        input_arg[i] = output_arg[j];
    }

    for ( j = 0; j < i && input_arg[i] == NULL; j++ )
    {
      if ( t->input_arg[i].base == t->input_arg[j].base )
        input_arg[i] = input_arg[j];
    }
  }

  // Fix-up pass for outputs: an entry still NULL takes the buffer of a
  // matching output with a smaller index.
  for ( i = t->n_output_args - 1; i >= 0; i-- )
  {
    for ( j = 0; j < i && output_arg[i] == NULL; j++ )
    {
      if ( t->output_arg[i].base == t->output_arg[j].base )
        output_arg[i] = output_arg[j];
    }
  }
}
void FLASH_Queue_verbose_output | ( | void | ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec().
{
  // Prints the task queue to stdout. In the readable mode each task is
  // listed with its operands; in any other mode a "digraph SuperMatrix"
  // description of the tasks and their dependencies is printed.
  int           n_threads = FLASH_Queue_get_num_threads();
  int           n_tasks   = FLASH_Queue_get_num_tasks();
  FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
  FLASH_Task*   t;
  FLASH_Dep*    d;
  int           i, j, k;

  // Grab the head of the task queue.
  t = FLASH_Queue_get_head_task();

  if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
  {
    // One line per task: order, name, outputs, ":=", outputs, inputs.
    for ( i = 0; i < n_tasks; i++, t = t->next_task )
    {
      printf( "%d\t%s\t", t->order, t->name );

      for ( j = 0; j < t->n_output_args; j++ )
        printf( "%lu[%u,%u] ",
                t->output_arg[j].base->id,
                t->output_arg[j].base->m_index,
                t->output_arg[j].base->n_index );

      printf( ":= " );

      // The outputs are listed again on the right-hand side.
      for ( j = 0; j < t->n_output_args; j++ )
        printf( "%lu[%u,%u] ",
                t->output_arg[j].base->id,
                t->output_arg[j].base->m_index,
                t->output_arg[j].base->n_index );

      for ( j = 0; j < t->n_input_args; j++ )
        printf( "%lu[%u,%u] ",
                t->input_arg[j].base->id,
                t->input_arg[j].base->m_index,
                t->input_arg[j].base->n_index );

      printf( "\n" );
    }

    printf( "\n" );

    return;
  }

  printf( "digraph SuperMatrix {\n" );

  if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
  {
    // Without data affinity: node label and outgoing edges on one line.
    for ( i = 0; i < n_tasks; i++, t = t->next_task )
    {
      printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);

      for ( j = 0, d = t->dep_arg_head; j < t->n_dep_args;
            j++, d = d->next_dep )
        printf( "%d;", d->task->order );

      printf( "};\n" );
    }
  }
  else
  {
    // With data affinity: one cluster per thread holding the tasks whose
    // queue index equals that thread.
    for ( k = 0; k < n_threads; k++ )
    {
      printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );

      for ( i = 0; i < n_tasks; i++, t = t->next_task )
      {
        if ( t->queue == k )
          printf( "%d [label=\"%s\"];\n", t->order, t->name );
      }

      printf( "}\n" );

      // Rewind to the head of the task queue for the next pass.
      t = FLASH_Queue_get_head_task();
    }

    // Outgoing edges of every task.
    for ( i = 0; i < n_tasks; i++, t = t->next_task )
    {
      printf( "%d -> {", t->order );

      for ( j = 0, d = t->dep_arg_head; j < t->n_dep_args;
            j++, d = d->next_dep )
        printf( "%d;", d->task->order );

      printf( "};\n" );
    }
  }

  printf( "}\n\n" );
}
FLASH_Task * FLASH_Queue_wait_dequeue | ( | int | queue, |
int | cache, | ||
void * | arg | ||
) |
References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().
{
  // Removes and returns the task at the front of the waiting queue, or
  // returns NULL when the queue is empty. The queue and cache parameters
  // are not used by this body.
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
  FLASH_Task*       t;

  // Nothing is waiting.
  if ( !( args->n_wait[0] > 0 ) )
    return NULL;

  // The program counter indexes the front slot of the waiting queue; that
  // slot holds the task's index into task_queue.
  t = args->task_queue[ args->wait_queue[ args->pc[0] ] ];

  // One task fewer on the waiting queue.
  args->n_wait[0]--;

  // Advance the program counter past the removed slot.
  args->pc[0]++;

  return t;
}
FLASH_Task* FLASH_Queue_wait_dequeue_block | ( | int | queue, |
int | cache, | ||
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_wait_dequeue().
{
  // Searches the given waiting queue for the first task that has an output
  // block present in the given cache (or, when GPUs are enabled, in the GPU
  // block table). Such a task is flagged as a hit and returned; otherwise
  // the head of the waiting queue is returned.
  FLASH_Queue_vars* args    = ( FLASH_Queue_vars* ) arg;
  int               size    = args->size;
  int               n_tasks = args->wait_queue[queue].n_tasks;
  FLA_Bool          enabled = FALSE;
  FLASH_Task*       t;
  FLA_Obj           obj;
  FLA_Obj           mem;
  int               i, j, k;

#ifdef FLA_ENABLE_GPU
  enabled = FLASH_Queue_get_enabled_gpu();

  // If using GPUs, then only check GPU and not the cache.
  if ( enabled )
    size = FLASH_Queue_get_gpu_num_blocks();
#endif

  // Walk the waiting queue from its head.
  for ( i = 0, t = args->wait_queue[queue].head; i < n_tasks;
        i++, t = t->next_wait )
  {
    for ( j = 0; j < size; j++ )
    {
      // Initialize the memory just in case.
      mem.base = NULL;

      // Pick the j-th resident block from the GPU table or from the cache.
      if ( !enabled )
      {
        mem = args->cache[cache * size + j];
      }
      else
      {
#ifdef FLA_ENABLE_GPU
        mem = args->gpu[cache * size + j].obj;
#endif
      }

      for ( k = 0; k < t->n_output_args; k++ )
      {
        obj = t->output_arg[k];

        // A macroblock is represented by the block FLASH_OBJ_PTR_AT yields.
        if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
          obj = *FLASH_OBJ_PTR_AT( obj );

        // Return the task if its output block is in cache.
        if ( mem.base == obj.base )
        {
          t->hit = TRUE;

          return t;
        }
      }
    }
  }

  // No task has a resident output block.
  return args->wait_queue[queue].head;
}
void FLASH_Queue_wait_enqueue | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().
{
  // Adds task t (by its order index) to the waiting queue. With sorting
  // enabled the task is inserted so that tasks of greater height stay in
  // front of it; otherwise it is appended at the end.
  FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;

  // One past the last occupied slot: occupied slots start at pc[0] and
  // there are n_wait[0] of them.
  int slot = args->n_wait[0] + args->pc[0];

  // Insertion sort of tasks in waiting queue.
  if ( FLASH_Queue_get_sorting() )
  {
    // Move entries whose height does not exceed that of t one slot back.
    while ( slot > args->pc[0] &&
            !( args->task_queue[args->wait_queue[slot-1]]->height >
               args->task_queue[t->order]->height ) )
    {
      args->wait_queue[slot] = args->wait_queue[slot-1];
      slot--;
    }
  }

  args->wait_queue[slot] = t->order;

  // Increment number of tasks on waiting queue.
  args->n_wait[0]++;
}
FLASH_Task* FLASH_Queue_work_stealing | ( | int | queue, |
void * | arg | ||
) |
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function().
{
  // Tries to steal one task for the given queue: a different queue is
  // chosen at random and, if it holds any tasks, its tail task is removed
  // and rebound to the calling queue. Returns the stolen task, or NULL when
  // nothing was stolen.
  FLASH_Queue_vars* args     = ( FLASH_Queue_vars* ) arg;
  int               n_queues = args->n_queues;
  int               q;
  FLASH_Task*       t        = NULL;

  // Do not perform work stealing unless there is another queue to steal
  // from. This also keeps the modulo below away from a zero divisor.
  if ( n_queues <= 1 )
    return t;

  // Find a random queue not equal to the current queue.
  do
  {
#ifdef FLA_ENABLE_WINDOWS_BUILD
    // rand_s() writes a full-range unsigned int. Reduce it in unsigned
    // arithmetic so that the resulting index can never be negative.
    unsigned int r = 0;

    rand_s( &r );
    q = ( int ) ( r % ( unsigned int ) n_queues );
#else
#ifdef FLA_ENABLE_TIDSP
    q = rand() % n_queues;
#else
    q = lrand48() % n_queues;
#endif
#endif
  } while ( q == queue );

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
#endif

  // If there are tasks that this thread can steal.
  if ( args->wait_queue[q].n_tasks > 0 )
  {
    // Dequeue the last task.
    t = args->wait_queue[q].tail;

    if ( args->wait_queue[q].n_tasks == 1 )
    {
      // Clear the queue of its only task.
      args->wait_queue[q].head = NULL;
      args->wait_queue[q].tail = NULL;
    }
    else
    {
      // Adjust pointers in waiting queue.
      args->wait_queue[q].tail = t->prev_wait;
      args->wait_queue[q].tail->next_wait = NULL;
    }

    // Reset waiting queue data about the stolen task.
    t->queue = queue;
    t->prev_wait = NULL;
    t->next_wait = NULL;

    args->wait_queue[q].n_tasks--;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->run_lock[q]) ); // R ***
#endif

  return t;
}
FLASH_Task* FLASH_Task_alloc | ( | void * | func, |
void * | cntl, | ||
char * | name, | ||
FLA_Bool | enabled_gpu, | ||
int | n_int_args, | ||
int | n_fla_args, | ||
int | n_input_args, | ||
int | n_output_args | ||
) |
References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_push().
{
  // Allocates a task structure together with its four argument arrays and
  // initializes every field. Returns a pointer to the new task.
  FLASH_Task* t = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );

  // Argument arrays, sized from the counts supplied by the caller.
  t->int_arg    = (int *)     FLA_malloc( n_int_args    * sizeof(int) );
  t->fla_arg    = (FLA_Obj *) FLA_malloc( n_fla_args    * sizeof(FLA_Obj) );
  t->input_arg  = (FLA_Obj *) FLA_malloc( n_input_args  * sizeof(FLA_Obj) );
  t->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );

  // Values supplied by the caller.
  t->func          = func;
  t->cntl          = cntl;
  t->name          = name;
  t->enabled_gpu   = enabled_gpu;
  t->n_int_args    = n_int_args;
  t->n_fla_args    = n_fla_args;
  t->n_input_args  = n_input_args;
  t->n_output_args = n_output_args;

  // Scheduling state starts out zeroed.
  t->n_ready = 0;
  t->order   = 0;
  t->queue   = 0;
  t->height  = 0;
  t->thread  = 0;
  t->cache   = 0;
  t->hit     = FALSE;

  // Counters filled in while the task is pushed onto the queue.
  t->n_macro_args = 0;
  t->n_war_args   = 0;
  t->n_dep_args   = 0;

  // The dependence list and all queue links start out empty.
  t->dep_arg_head = NULL;
  t->dep_arg_tail = NULL;
  t->prev_task    = NULL;
  t->next_task    = NULL;
  t->prev_wait    = NULL;
  t->next_wait    = NULL;

  // Return a pointer to the initialized structure.
  return t;
}
void FLASH_Task_free | ( | FLASH_Task * | t | ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().
{
  // Releases task t: clears the last-writer mark on its output blocks,
  // frees the reader lists of its input blocks, frees its own dependence
  // list and argument arrays, and finally frees the task structure.
  FLASH_Dep* d;
  FLASH_Dep* next_dep;
  FLA_Obj    obj;
  int        i, k;

  // Clearing the last write task in each output block.
  for ( i = 0; i < t->n_output_args; i++ )
  {
    obj = t->output_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block.
      obj.base->write_task = NULL;
    }
    else
    {
      // Macroblock: clear every block it contains.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
        for ( kk = 0; kk < m; kk++ )
          buf[ jj * cs + kk ].base->write_task = NULL;
    }
  }

  // Cleaning the last read tasks in each input block.
  for ( i = 0; i < t->n_input_args; i++ )
  {
    obj = t->input_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block: detach its reader list, then free the nodes.
      k = obj.base->n_read_tasks;
      d = obj.base->read_task_head;

      obj.base->n_read_tasks   = 0;
      obj.base->read_task_head = NULL;
      obj.base->read_task_tail = NULL;

      while ( k-- > 0 )
      {
        next_dep = d->next_dep;
        FLA_free( d );
        d = next_dep;
      }
    }
    else
    {
      // Macroblock: do the same for every block it contains.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
      {
        for ( kk = 0; kk < m; kk++ )
        {
          obj = buf[ jj * cs + kk ];

          k = obj.base->n_read_tasks;
          d = obj.base->read_task_head;

          obj.base->n_read_tasks   = 0;
          obj.base->read_task_head = NULL;
          obj.base->read_task_tail = NULL;

          while ( k-- > 0 )
          {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
          }
        }
      }
    }
  }

  // Free the dep_arg field of t.
  d = t->dep_arg_head;

  for ( i = 0; i < t->n_dep_args; i++ )
  {
    next_dep = d->next_dep;
    FLA_free( d );
    d = next_dep;
  }

  // Free the four argument arrays of t.
  FLA_free( t->int_arg );
  FLA_free( t->fla_arg );
  FLA_free( t->input_arg );
  FLA_free( t->output_arg );

  // Finally, free the struct itself.
  FLA_free( t );
}
void FLASH_Task_free_parallel | ( | FLASH_Task * | t, |
void * | arg | ||
) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec_parallel_function().
{
  // Releases task t like FLASH_Task_free() does, except that each input
  // block's reader list is detached while holding one of the war_lock
  // locks; the detached nodes are freed after the lock is released.
  FLASH_Queue_vars* args      = ( FLASH_Queue_vars* ) arg;
  int               n_threads = FLASH_Queue_get_num_threads();
  int               thread;
  FLASH_Dep*        d;
  FLASH_Dep*        next_dep;
  FLA_Obj           obj;
  int               i, k;

  // Clearing the last write task in each output block.
  for ( i = 0; i < t->n_output_args; i++ )
  {
    obj = t->output_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block.
      obj.base->write_task = NULL;
    }
    else
    {
      // Macroblock: clear every block it contains.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
        for ( kk = 0; kk < m; kk++ )
          buf[ jj * cs + kk ].base->write_task = NULL;
    }
  }

  // Cleaning the last read tasks in each input block.
  for ( i = 0; i < t->n_input_args; i++ )
  {
    obj = t->input_arg[i];

    if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
    {
      // Regular block. The lock is selected from the block's read-block id.
      thread = obj.base->n_read_blocks % n_threads;

      FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

      k = obj.base->n_read_tasks;
      d = obj.base->read_task_head;

      obj.base->n_read_tasks   = 0;
      obj.base->read_task_head = NULL;
      obj.base->read_task_tail = NULL;

      FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

      // The detached nodes are freed outside of the lock.
      while ( k-- > 0 )
      {
        next_dep = d->next_dep;
        FLA_free( d );
        d = next_dep;
      }
    }
    else
    {
      // Macroblock: do the same for every block it contains.
      dim_t    m   = FLA_Obj_length( obj );
      dim_t    n   = FLA_Obj_width( obj );
      dim_t    cs  = FLA_Obj_col_stride( obj );
      FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
      dim_t    jj, kk;

      for ( jj = 0; jj < n; jj++ )
      {
        for ( kk = 0; kk < m; kk++ )
        {
          obj = buf[ jj * cs + kk ];

          thread = obj.base->n_read_blocks % n_threads;

          FLA_Lock_acquire( &(args->war_lock[thread]) ); // W ***

          k = obj.base->n_read_tasks;
          d = obj.base->read_task_head;

          obj.base->n_read_tasks   = 0;
          obj.base->read_task_head = NULL;
          obj.base->read_task_tail = NULL;

          FLA_Lock_release( &(args->war_lock[thread]) ); // W ***

          while ( k-- > 0 )
          {
            next_dep = d->next_dep;
            FLA_free( d );
            d = next_dep;
          }
        }
      }
    }
  }

  // Free the dep_arg field of t.
  d = t->dep_arg_head;

  for ( i = 0; i < t->n_dep_args; i++ )
  {
    next_dep = d->next_dep;
    FLA_free( d );
    d = next_dep;
  }

  // Free the four argument arrays of t.
  FLA_free( t->int_arg );
  FLA_free( t->fla_arg );
  FLA_free( t->input_arg );
  FLA_free( t->output_arg );

  // Finally, free the struct itself.
  FLA_free( t );
}
FLASH_Task* FLASH_Task_update_binding( FLASH_Task* t, FLASH_Task* r, void* arg )
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.
Referenced by FLASH_Task_update_dependencies().
{
   // Chooses which of the currently bound task r and the newly ready task t
   // stays bound; the other one is placed on the waiting queue under the
   // run lock of its own queue. Returns the task that ends up bound.
   FLASH_Queue_vars* vars = ( FLASH_Queue_vars* ) arg;
   int               q;

   // Nothing is bound yet, so the incoming task is bound directly.
   if ( r == NULL )
   {
      t->hit = TRUE;
      return t;
   }

   // The bound task is kept only while it is still flagged as a hit and,
   // if sorting is enabled, its height is not below the new task's height.
   if ( r->hit && !( FLASH_Queue_get_sorting() && r->height < t->height ) )
   {
      q = t->queue;

      FLA_Lock_acquire( &(vars->run_lock[q]) ); // R ***

      // The new ready task goes onto the waiting queue instead.
      FLASH_Queue_wait_enqueue( t, arg );

      FLA_Lock_release( &(vars->run_lock[q]) ); // R ***

      return r;
   }

   // Otherwise the bound task is displaced by the new ready task.
   q      = r->queue;
   r->hit = FALSE;

   FLA_Lock_acquire( &(vars->run_lock[q]) ); // R ***

   // Return the displaced task to the waiting queue.
   FLASH_Queue_wait_enqueue( r, arg );

   FLA_Lock_release( &(vars->run_lock[q]) ); // R ***

   // Bind the new ready task in its place.
   t->hit = TRUE;
   return t;
}
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Walks the dependent tasks recorded on t, decrements the n_ready
   // counter of each one, and enqueues every task whose counter reaches
   // zero onto the waiting queue. This variant never binds a task, so the
   // returned pointer is always NULL.
   FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
   int               n_workers = FLASH_Queue_get_num_threads();
   FLASH_Task*       bound     = NULL;
   FLASH_Task*       successor;
   FLASH_Dep*        dep;
   FLA_Bool          is_ready;
   int               lock_id;
   int               idx;

   // Visit each dependent task in list order.
   for ( idx = 0, dep = t->dep_arg_head;
         idx < t->n_dep_args;
         idx++, dep = dep->next_dep )
   {
      successor = dep->task;

      // Lock 0 is kept for the waiting queue; the counters are spread
      // over the remaining locks whenever more than one thread exists.
      if ( n_workers > 1 )
         lock_id = successor->order % ( n_workers - 1 ) + 1;
      else
         lock_id = 0;

      RCCE_acquire_lock( lock_id );

      // Decrement the counter and test it while the lock is still held.
      args_n_ready_update:
      vars->n_ready[successor->order]--;
      is_ready = ( vars->n_ready[successor->order] == 0 );

      RCCE_release_lock( lock_id );

      // A task whose counter reached zero moves to the waiting queue.
      if ( is_ready )
      {
         RCCE_acquire_lock( 0 );

         FLASH_Queue_wait_enqueue( successor, arg );

         RCCE_release_lock( 0 );
      }
   }

   return bound;
}