#include <stdlib.h>

#include <chrono>
#include <iostream>
#include <vector>

#include <omp.h>

// Converts a std::chrono duration of any representation/period into whole
// milliseconds.
//
// dur: the duration to convert (e.g. the difference of two time points).
// Returns the value produced by std::chrono::duration_cast, i.e. any
// sub-millisecond remainder is truncated toward zero.
template<typename T>
std::chrono::milliseconds PerfClockDurationMs(const T &dur) {
	using Millis = std::chrono::milliseconds;
	const Millis asMillis = std::chrono::duration_cast<Millis>(dur);
	return asMillis;
}

// Increments elements of a float array in a strided, wrapping pattern.
//
// The logical index i runs over 0, 16, 32, ... while i < size, and on each
// step the element at (i % blockSize) is incremented by 1.0f. That makes
// roughly size / 16 accesses, all confined to the first "blockSize" elements
// of pD.
//
// Only every 16th float is touched (16 * sizeof(float) = 64 bytes, i.e. one
// element per cache line assuming 4-byte floats and 64-byte lines) so that
// each access lands on a different line of the memory subsystem.
//
// pD:        array to modify; must hold at least "blockSize" elements.
// size:      upper bound of the logical index (not the number of accesses).
// blockSize: number of leading elements of pD the accesses wrap around in.
//
// The call is a no-op when pD is null or blockSize is 0 (the latter would
// otherwise be a division by zero).
void TestMemory(float *pD, size_t size, size_t blockSize)
{
    if(pD == nullptr || blockSize == 0)
        return;

    const size_t stride = 16;

    // "slot" always equals i % blockSize. It is maintained incrementally so
    // the hot loop does not pay for an integer division on every access; the
    // modulo only runs when the index actually passes the end of the block.
    size_t slot = 0;
    for(size_t i = 0; i < size; i += stride)
    {
        pD[slot] += 1.0f;

        slot += stride;
        if(slot >= blockSize)
            slot %= blockSize;
    }
}

// Runs one timed measurement and prints a single result line.
//
// Calls TestMemory(pD, size, blockSize) "repeats" times, timing the whole
// batch with std::chrono::steady_clock. The printed line has the form
//   Testing memory: <elapsed> ms (blockSize: <n> <unitName>)
// where <n> is the block size in bytes divided by "unitBytes".
static void RunTimedTest(float *pD, size_t size, size_t blockSize,
                         unsigned repeats, size_t unitBytes,
                         const char *unitName)
{
    typedef std::chrono::steady_clock PerfClock_t;

    std::cout << "Testing memory: ";
    const auto startTime = PerfClock_t::now();

    for(unsigned i = 0; i < repeats; ++i)
        TestMemory(pD, size, blockSize);

    const auto elapsed = PerfClock_t::now() - startTime;

    // std::endl is intentional: each result is flushed as soon as it is known.
    std::cout << PerfClockDurationMs(elapsed).count() << " ms (blockSize: "
              << sizeof(float) * blockSize / unitBytes << ' ' << unitName << ")"
              << std::endl;
}

// Benchmark driver: times the same strided access pattern over working sets
// of increasing size. The sizes are chosen with typical cache capacities in
// mind (the level each one is expected to hit depends on the actual CPU).
//
// Command-line arguments are not used. Returns 0 on completion.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t kKiB = 1024;
    const size_t kMiB = 1024 * 1024;

    // Logical index range walked by each TestMemory call: 64*(2^20).
    const size_t size = 64 * 1024 * 1024;
    const unsigned repeats = 100;

    // 64*(2^20) floats (256 MB with 4-byte floats). std::vector zero-fills
    // the storage, so the "+=" in TestMemory never reads an indeterminate
    // value and the whole buffer has been written once before any timing
    // starts. The memory is released automatically on return.
    std::vector<float> buffer(size);
    float *pD = buffer.data();

    // 8*(2^10) floats (32 KB): intended to fit in L1 cache.
    RunTimedTest(pD, size, 8 * 1024, repeats, kKiB, "KB");

    // 9*(2^10) floats (36 KB): intended to spill out of L1 into L2.
    RunTimedTest(pD, size, 9 * 1024, repeats, kKiB, "KB");

    // 96*(2^10) floats (384 KB): intended to spill out of L2 into L3.
    RunTimedTest(pD, size, 96 * 1024, repeats, kKiB, "KB");

    // 6*(2^20) floats (24 MB): intended to exceed the caches and hit DRAM.
    RunTimedTest(pD, size, 6 * 1024 * 1024, repeats, kMiB, "MB");

    return 0;
}