What is the proper way to calculate execution times within a single core for a multi-core application using OpenMP?
In a single-core application, I use the Timestamp_get32() function to count cycles between lines of code, but it doesn't seem to return the correct value for code inside the #pragma omp parallel private(nthreads, tid) block in my code.
I ran the multiplication for a Hanning window both inside and outside the OMP pragma. Outside the pragma I got roughly the results I was seeing in my single-core application, but time benchmarks within the pragma are around 6-8 times what they are outside it. Check out the simple C code below. There isn't anything fancy going on; it is built around the HelloWorld template for OpenMP. The time difference is the same whether I configure this application for 1 core, 4 cores, or 8 cores.
Can Timestamp_get32() be trusted within the pragma statement?
/******************************************************************************
* FILE: omp_hello.c
* DESCRIPTION:
* OpenMP Example - Hello World - C/C++ Version
* In this simple example, the master thread forks a parallel region.
* All threads in the team obtain their unique thread number and print it.
* The master thread only prints the total number of threads. Two OpenMP
* library routines are used to obtain the number of threads and each
* thread's number.
* AUTHOR: Blaise Barney 5/99
* LAST REVISED: 04/06/05
******************************************************************************/
#include<ti/omp/omp.h>
#include<string.h>
#include<assert.h>
#include<stdio.h>
#include<time.h>
#include<stdint.h>
#include<xdc/std.h>
#include<xdc/runtime/System.h>
#include<ti/sysbios/BIOS.h>
#include<xdc/runtime/Log.h>
#include<xdc/runtime/Timestamp.h>
#include<math.h>
/* Number of OpenMP threads requested through omp_set_num_threads(). */
#define NTHREADS 1
/* Number of samples in the window; also the length of the lookup table. */
#define FFT_MAX_L 2048
#define PI 3.14159265358979323846
/* Hanning window coefficients, one per sample. Sized from FFT_MAX_L so the
 * table and the loops that index it cannot drift apart. */
float multiplier[ FFT_MAX_L ];
/*
 * Fill the global multiplier[] table with Hanning window coefficients:
 *
 *     multiplier[i] = 0.5 * (1 - cos(2 * PI * i / (FFT_MAX_L - 1)))
 *
 * for every i in [0, FFT_MAX_L). Takes no arguments and returns nothing;
 * its only effect is overwriting multiplier[].
 */
void generateHanningLookup( void )
{
    int32_t i;

    for ( i = 0; i < FFT_MAX_L; i++ )
    {
        /* The expression is evaluated in double; narrow to float explicitly
         * when storing into the table. */
        multiplier[ i ] = (float)( 0.5 * ( 1 - cos( 2 * PI * i / ( FFT_MAX_L - 1 ) ) ) );
    }
}
/*
 * Apply the Hanning window once and time it.
 *
 * Writes k * multiplier[k] into output[k] for k in [0, FFT_MAX_L) and returns
 * the difference between the Timestamp_get32() readings taken immediately
 * before and after the loop.
 *
 * The loop index and the start timestamp are locals of this function, so each
 * thread that calls it from a parallel region works on its own copies.
 *
 * output: destination buffer with room for at least FFT_MAX_L elements.
 */
static uint32_t timeHanningWindow( int16_t *output )
{
    uint32_t begin = Timestamp_get32();
    uint16_t k;

    for( k = 0; k < FFT_MAX_L; k++ )
    {
        output[ k ] = (int16_t)( k * multiplier[ k ] );
    }
    return Timestamp_get32() - begin;
}

/*
 * Benchmark driver: times the Hanning window multiplication before the
 * parallel region, inside it (once per thread), and after it, printing each
 * measurement in Timestamp_get32() ticks.
 */
void main()
{
    int16_t windowOutputData[ FFT_MAX_L ];
    uint32_t totalTime;

    omp_set_num_threads(NTHREADS);
    generateHanningLookup();

    /* Baseline measurement outside any parallel region. */
    printf( "HANNING#1 = [ %u ] cycles \n", (unsigned int)timeHanningWindow( windowOutputData ) );

    totalTime = Timestamp_get32();
    /* Fork a team of threads. Everything a thread modifies (tid, the loop
     * index, the start timestamp) is declared inside the region or inside
     * timeHanningWindow(), so no private() clause is needed and no thread
     * shares its loop counter or timestamp with another. */
    #pragma omp parallel
    {
        /* Obtain thread number */
        int tid = omp_get_thread_num();
        printf("Hello World from thread = %d\n", tid);

        /* NOTE(review): every thread stores the same values into the shared
         * windowOutputData buffer; the contents are never read back. Give
         * each thread its own buffer if the stored results ever matter. */
        if (tid == 0)
        {
            /* Only master thread does this */
            printf("Number of threads = %d\n", omp_get_num_threads());
            printf( "HANNING#OMP = [ %u ] cycles \n", (unsigned int)timeHanningWindow( windowOutputData ) );
        }
        else
        {
            printf( "HANNING#%u = [ %u ] cycles \n", (unsigned int)tid, (unsigned int)timeHanningWindow( windowOutputData ) );
        }
    } /* All threads join master thread and disband */
    printf( "OMP TIME = [ %u ] cycles \n", (unsigned int)( Timestamp_get32() - totalTime ) );

    /* Repeat the baseline measurement after the parallel region. */
    printf( "HANNING#2 = [ %u ] cycles \n", (unsigned int)timeHanningWindow( windowOutputData ) );
}