/* Matrix-vector multiplication; the matrix is stored in row-major order.
   Each thread block computes one row of the result. A thread handles more than
   one column when Dim is larger than Blocksize (500): for example, thread 0
   multiplies columns 0, 500, 1000, ... of its row by the matching vector entries.
*/

#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#include <cuda.h>
#define Dim 5000
#define Blocksize 500

/*
 * Computes one entry of the matrix-vector product per thread block.
 *
 * dmat  : Dim x Dim matrix in row-major order (device memory)
 * dvect : input vector of Dim entries (device memory)
 * db    : output vector; block `blockIdx.x` writes db[blockIdx.x]
 *
 * Launch layout: one block per matrix row and exactly Blocksize threads per
 * block in x -- the final summation reads all Blocksize shared-memory slots,
 * so every slot must have been written by some thread.
 * Shared memory: Blocksize ints, statically allocated.
 */
__global__ void MatvectMult(int *dmat,int *dvect,int *db)
{
	__shared__ int partial[Blocksize];

	const int tid = threadIdx.x;
	const int rowIdx = blockIdx.x;
	const int *rowPtr = dmat + rowIdx*Dim;

	/* Each thread accumulates the columns tid, tid+Blocksize, tid+2*Blocksize, ... */
	int acc = 0;
	int c = tid;
	while(c < Dim)
	 {
		acc += rowPtr[c]*dvect[c];
		c += Blocksize;
	 }

	partial[tid] = acc;
	/* Every partial sum must be visible before thread 0 reads them. */
	__syncthreads();

	/* No barrier follows, so the other threads can leave here. */
	if(tid != 0)
	 return;

	/* Thread 0 adds up the per-thread partial sums serially. */
	int total = 0;
	for(int k = 0;k < Blocksize;++k)
	 total += partial[k];
	db[rowIdx] = total;
}
		

/* Returns the time elapsed from *start to *end in microseconds. */
static long elapsedMicros(const struct timeval *start,const struct timeval *end)
{
	return (long)(end->tv_sec - start->tv_sec)*1000000L + (long)(end->tv_usec - start->tv_usec);
}

/* Reports a failed CUDA runtime call on stderr; returns nonzero when `status` is an error. */
static int cudaFailed(cudaError_t status,const char *what)
{
	if(status == cudaSuccess)
	 return 0;
	fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
	return 1;
}

/*
 * Benchmark driver.
 *
 * Fills a Dim x Dim matrix and a Dim-entry vector with random values in 0..8,
 * computes the product on the CPU as a reference, then repeats the computation
 * on the GPU with MatvectMult and compares the two results.  Input transfer,
 * kernel execution and output transfer are timed separately (microseconds).
 *
 * Returns 0 when the run completed, -1 when a host allocation or any CUDA
 * call failed.  All host and device buffers are released on every path.
 */
int main()
{
	/* All initialised declarations live here so the `goto cleanup` jumps below
	   never bypass an initialisation. */
	int *mat = NULL,*vect = NULL,*b = NULL,*pb = NULL;
	int *dmat = NULL,*dvect = NULL,*db = NULL;
	int i,j;
	int mismatches = 0;
	int status = -1;
	long int itrnsftime,comptime,otrnsftime;
	struct timeval start,end;
	const size_t matBytes = (size_t)Dim*Dim*sizeof(int);
	const size_t vecBytes = (size_t)Dim*sizeof(int);
	dim3 dimblk(Blocksize,1);	/* threads per block: must equal Blocksize for the kernel */
	dim3 dimgrid(Dim,1);		/* one block per matrix row */

	mat = (int *)malloc(matBytes);
	vect = (int *)malloc(vecBytes);
	b = (int *)malloc(vecBytes);	/* CPU reference result */
	pb = (int *)malloc(vecBytes);	/* GPU result copied back to the host */
	if(mat == NULL || vect == NULL || b == NULL || pb == NULL)
	 {
		printf("Unable to allocate memory!!\n");
		goto cleanup;
	 }

	/* Same fill order as before (matrix first, then vector) so the rand() sequence is unchanged. */
	for(i=0;i<Dim;i++)
	 {
		for(j=0;j<Dim;j++)
		 mat[i*Dim + j] = rand()%9;
	 }

	for(i=0;i<Dim;i++)
	 {
		vect[i] = rand()%9;
		b[i] = 0;
		pb[i] = 0;
	 }

	/* CPU reference computation. */
	gettimeofday(&start,NULL);
	for(i=0;i<Dim;i++)
	 {
		for(j=0;j<Dim;j++)
		 b[i] += mat[i*Dim + j]*vect[j];
	 }
	gettimeofday(&end,NULL);
	printf("Time taken for calculation is %ld microseconds \n",elapsedMicros(&start,&end));

	/* Device allocations are deliberately kept outside the timed sections. */
	if(cudaFailed(cudaMalloc((void **)&dmat,matBytes),"cudaMalloc(dmat)"))
	 goto cleanup;
	if(cudaFailed(cudaMalloc((void **)&dvect,vecBytes),"cudaMalloc(dvect)"))
	 goto cleanup;
	if(cudaFailed(cudaMalloc((void **)&db,vecBytes),"cudaMalloc(db)"))
	 goto cleanup;

	/* Host -> device transfer. */
	gettimeofday(&start,NULL);
	if(cudaFailed(cudaMemcpy(dmat,mat,matBytes,cudaMemcpyHostToDevice),"copy of matrix to device"))
	 goto cleanup;
	if(cudaFailed(cudaMemcpy(dvect,vect,vecBytes,cudaMemcpyHostToDevice),"copy of vector to device"))
	 goto cleanup;
	gettimeofday(&end,NULL);
	itrnsftime = elapsedMicros(&start,&end);

	/* Kernel execution; the launch is asynchronous, so synchronise before stopping the clock. */
	gettimeofday(&start,NULL);
	MatvectMult<<< dimgrid, dimblk >>>(dmat,dvect,db);
	if(cudaFailed(cudaGetLastError(),"kernel launch"))	/* bad launch configuration etc. */
	 goto cleanup;
	if(cudaFailed(cudaDeviceSynchronize(),"kernel execution"))	/* faults raised while running */
	 goto cleanup;
	gettimeofday(&end,NULL);
	comptime = elapsedMicros(&start,&end);

	/* Device -> host transfer. */
	gettimeofday(&start,NULL);
	if(cudaFailed(cudaMemcpy(pb,db,vecBytes,cudaMemcpyDeviceToHost),"copy of result to host"))
	 goto cleanup;
	gettimeofday(&end,NULL);
	otrnsftime = elapsedMicros(&start,&end);

	/* Compare against the CPU reference; report once instead of once per bad entry. */
	for(i=0;i<Dim;i++)
	 {
		if(pb[i] != b[i])
		 mismatches++;
	 }
	if(mismatches != 0)
	 printf("wrong answers!! (%d of %d entries differ)\n",mismatches,Dim);

	printf("The time for input transfer is  %ld microseconds\n",itrnsftime);
	printf("The time for computation is     %ld microseconds\n",comptime);
	printf("The time for output transfer is %ld microseconds\n",otrnsftime);
	printf("Total Time is %ld microseconds\n",itrnsftime+comptime+otrnsftime);

	status = 0;

cleanup:
	/* free(NULL) and cudaFree(NULL) are no-ops, so this is safe on every path. */
	free(mat);
	free(vect);
	free(b);
	free(pb);
	cudaFree(dmat);
	cudaFree(dvect);
	cudaFree(db);

	return status;
}
