// ivs_demo_saxpy.cpp : This file contains the 'main' function. Program execution begins and ends there.
//

#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <omp.h>

// Single-precision "a*x plus y": updates pC in place so that
// pC[i] = a * pB[i] + pC[i] for every i in [0, size).
//
// pB   - input array with at least `size` elements; only read.
// pC   - input/output array with at least `size` elements; overwritten.
// a    - scalar multiplier applied to each element of pB.
// size - number of elements to process; 0 leaves pC untouched.
//
// The loop is annotated for OpenMP: iterations are split statically across
// the threads of a parallel region and each thread's share is vectorized.
// Without OpenMP enabled the pragma is ignored and the loop runs serially.
void Saxpy(const float *pB, float *pC, float a, size_t size)
{
	#pragma omp parallel for simd schedule(static)
	for(size_t i = 0; i < size; ++i)
	{
		pC[i] = a * pB[i] + pC[i];
	}
}

// Returns the dot product of two float arrays: the sum of pB[i] * pC[i]
// for every i in [0, size). Neither array is modified.
//
// pB, pC - input arrays with at least `size` elements each; only read.
// size   - number of elements to process; 0 yields 0.0f.
//
// The sum is accumulated in single precision. Under OpenMP the loop is a
// `+` reduction: each thread/SIMD lane keeps its own partial sum and the
// partials are combined at the end, so the rounding of the result can differ
// from a strictly left-to-right serial summation.
float DotProduct(const float *pB, const float *pC, size_t size)
{
	float dot = 0.0f;

	#pragma omp parallel for simd schedule(static) reduction(+:dot)
	for(size_t i = 0; i < size; ++i)
	{
		dot += pB[i] * pC[i];
	}

	return dot;
}

// Fused SAXPY + dot product in a single pass over the data.
//
// For every i in [0, size) this first updates pC[i] = a * pB[i] + pC[i] and
// then adds pB[i] * pC[i] - using the *updated* pC[i] - to the running sum.
// The result therefore equals calling Saxpy followed by DotProduct, but each
// array is traversed only once.
//
// pB   - input array with at least `size` elements; only read.
// pC   - input/output array with at least `size` elements; overwritten.
// a    - scalar multiplier applied to each element of pB.
// size - number of elements to process; 0 yields 0.0f and leaves pC untouched.
//
// The sum is a single-precision OpenMP `+` reduction, so partial sums are
// combined per thread/lane and rounding may differ from a serial summation.
float SaxpyDotMerged(const float *pB, float *pC, float a, size_t size)
{
	float dot = 0.0f;

	#pragma omp parallel for simd schedule(static) reduction(+:dot)
	for(size_t i = 0; i < size; ++i)
	{
		// Name the new element so it is explicit that the dot product below
		// is taken against the value just stored, not the old pC[i].
		const float updated = a * pB[i] + pC[i];
		pC[i] = updated;
		dot += pB[i] * updated;
	}

	return dot;
}

// Converts an arbitrary std::chrono duration to whole milliseconds.
// The conversion is done with std::chrono::duration_cast, so any
// sub-millisecond remainder is dropped (truncation toward zero).
template<typename T>
std::chrono::milliseconds PerfClockDurationMs(const T &dur) {
	using std::chrono::duration_cast;
	using std::chrono::milliseconds;

	const milliseconds wholeMs = duration_cast<milliseconds>(dur);
	return wholeMs;
}

// Benchmark driver.
//
// Allocates two large float arrays, fills them in parallel, and then times
// `repeats` back-to-back calls of each kernel (Saxpy, DotProduct,
// SaxpyDotMerged) with a steady clock. For every kernel it prints a label,
// the elapsed wall-clock time in milliseconds and, for the two kernels that
// return a value, the dot product from the last repetition.
//
// Note that the kernels run on the same buffers in sequence, so each phase
// sees pC as left behind by the previous one.
int main()
{
	using PerfClock_t = std::chrono::steady_clock;

	// Two arrays of 256*(2^20) floats (1 GiB each). The product is computed
	// in size_t so it cannot overflow int if the constant is enlarged.
	const size_t size = static_cast<size_t>(256) * 1024 * 1024;
	const unsigned repeats = 10;
	const float a = 5.0f;

	// The buffers are owned by unique_ptr so they are released on every exit
	// path. Plain new[] is used on purpose: std::make_unique<float[]> would
	// value-initialize (zero-fill) both arrays on this thread first, whereas
	// new[] leaves them uninitialized so the parallel loop below is the first
	// writer of every element.
	const std::unique_ptr<float[]> bufferB(new float[size]);
	const std::unique_ptr<float[]> bufferC(new float[size]);
	float *const pB = bufferB.get();
	float *const pC = bufferC.get();

	// Fill arrays with some data
	#pragma omp parallel for simd schedule(static)
	for(size_t i = 0; i < size; ++i)
	{
		pB[i] = float(i);
		pC[i] = 1.0f / float(i + 1);
	}

	// Each label has no trailing newline, so it is flushed explicitly to make
	// it visible while the (long-running) phase is still executing.
	std::cout << "Running SAXPY: " << std::flush;
	auto startTime = PerfClock_t::now();

	for(unsigned i = 0; i < repeats; ++i)
		Saxpy(pB, pC, a, size);

	// Stop the clock before any output formatting happens.
	auto elapsedMs = PerfClockDurationMs(PerfClock_t::now() - startTime).count();
	std::cout << elapsedMs << " ms" << std::endl;


	std::cout << "Running DotProduct: " << std::flush;
	startTime = PerfClock_t::now();

	float dot = 0.0f;
	for(unsigned i = 0; i < repeats; ++i)
		dot = DotProduct(pB, pC, size);

	elapsedMs = PerfClockDurationMs(PerfClock_t::now() - startTime).count();
	std::cout << elapsedMs << " ms (" << dot << ")" << std::endl;

	std::cout << "Running SaxpyDotMerged: " << std::flush;
	startTime = PerfClock_t::now();

	// Reset so a stale value from the previous phase is never reported
	// (only matters if `repeats` is ever set to 0).
	dot = 0.0f;
	for(unsigned i = 0; i < repeats; ++i)
		dot = SaxpyDotMerged(pB, pC, a, size);

	elapsedMs = PerfClockDurationMs(PerfClock_t::now() - startTime).count();
	std::cout << elapsedMs << " ms (" << dot << ")" << std::endl;

	return 0;
}