#include <bits/stdc++.h>
 #include <chrono>
 #include <ctime>
 #include <thread>
 
 using namespace std;

// Element-wise integer vector sum: c[k] = a[k] + b[k] for 0 <= k < n.
//
// Only the x components of the block/thread indices are used, so each
// thread derives a single flat index and handles at most one element.
// Threads whose flat index is >= n exit without touching memory.
//
//   a, b : input arrays, read at index k
//   c    : output array, written at index k
//   n    : number of elements to process
__global__ void addition(int *a, int *b, int *c, int n){
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  // Surplus threads past the end of the data have nothing to do.
  if(idx >= n){
    return;
  }

  c[idx] = b[idx] + a[idx];
}


 int main(){
  // Demo driver: fills two host arrays with random values in [0, 999],
  // adds them on the GPU with the `addition` kernel, prints both inputs and
  // the result, and reports the wall-clock time of launch + sync + copy-back.

  // Report a failed CUDA runtime call and abort. Without this a failed
  // allocation, copy, or launch goes unnoticed and garbage is printed.
  auto check = [](cudaError_t status, const char *what){
    if(status != cudaSuccess){
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  srand(time(0));
  const int n = 10;
  // Single source of truth for the buffer size (element type is int).
  const size_t bytes = n*sizeof(int);

  // Host buffers; std::vector releases them automatically.
  std::vector<int> a(n), b(n), c(n);
  for(int i = 0; i < n; i++){
    a[i] = rand()%1000;
    b[i] = rand()%1000;
  }

  int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

  check(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
  check(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
  check(cudaMalloc((void**)&d_c, bytes), "cudaMalloc d_c");

  check(cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice), "copy a to device");
  check(cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice), "copy b to device");

  // Round the block count up so every element is covered even when n is not
  // a multiple of the block size; the kernel's bounds check idles the rest.
  const int threadsPerBlock = 256;
  const int blocks = (n + threadsPerBlock - 1)/threadsPerBlock;

  // steady_clock measures elapsed wall time (clock() counts CPU time only).
  const auto parallel_start = std::chrono::steady_clock::now();

  addition<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, n);
  // A launch returns no status itself: fetch launch-configuration errors
  // here, and execution errors from the synchronize that follows.
  check(cudaGetLastError(), "addition launch");
  check(cudaDeviceSynchronize(), "addition execution");

  check(cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

  const auto parallel_end = std::chrono::steady_clock::now();

  // Expressed in milliseconds so the value matches the "ms" label below.
  const double time_taken =
      std::chrono::duration<double, std::milli>(parallel_end - parallel_start).count();

  for(int i = 0; i < n; i++)      printf("%d ", a[i]);
  printf("\n");
  for(int i = 0; i < n; i++)      printf("%d ", b[i]);
  printf("\n");
  for(int i = 0; i < n; i++)      printf("%d ", c[i]);
  printf("\n");

  printf("Time taken : %2.7f ms\n", time_taken);

  check(cudaFree(d_a), "cudaFree d_a");
  check(cudaFree(d_b), "cudaFree d_b");
  check(cudaFree(d_c), "cudaFree d_c");

  return 0;
 }