// (stray page metadata from copy/paste — original text: "videní: 1.5K", i.e. "views: 1.5K")
#include "cuda_runtime.h"
#include "cuda.h"
#include "device_launch_parameters.h"
#include "device_functions.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <chrono>
#include <iostream>
#define THREADS_PER_BLOCK 256
#define BLOCKS_COUNT 2
#define ARRAY_SIZE 2000
#ifndef ARRAY_SIZE
#define ARRAY_SIZE 2000  // fallback so the function is self-contained; matches the file-level macro
#endif

// Fills `values` (length ARRAY_SIZE) with pseudo-random integers in
// [0, ARRAY_SIZE * 10). Seeds the C PRNG from the wall clock, so the
// sequence differs between runs.
void generateRandomValues(int* values) {
	srand((unsigned)time(NULL));
	for (int i = 0; i < ARRAY_SIZE; i++) {
		// bug fix: the original assigned to the pointer `values` itself,
		// leaving the array contents untouched
		values[i] = rand() % (ARRAY_SIZE * 10);
	}
}
// TODO: declare the function is_prime with return type int so that it is callable from a kernel
#ifndef __CUDACC__
// allow host-only compilation (unit tests) — qualifiers become no-ops
#define __host__
#define __device__
#endif

// Primality test callable from both host code and device kernels.
// Returns 1 if num is prime, 0 otherwise (0, 1 and negatives are not prime).
// Trial division by odd candidates up to sqrt(num): O(sqrt n) instead of
// the original's O(n) scan up to num/2.
__host__ __device__ int is_prime(int num) {
	if (num <= 1) return 0;       // 0, 1, negatives
	if (num == 2) return 1;       // the only even prime
	if (num % 2 == 0) return 0;   // other even numbers
	// i*i in 64 bits: avoids overflow for num near INT_MAX
	for (long long i = 3; i * i <= num; i += 2) {
		if (num % i == 0) {
			return 0;
		}
	}
	return 1;
}
// kernel - searching for prime numbers in parallel on the GPU
// TODO: declare gpu_parallel_prime_numbers as a kernel; do not forget the return type
// Kernel: each thread tests a contiguous chunk of `values` for primality.
// Expects a 1D launch of BLOCKS_COUNT blocks x THREADS_PER_BLOCK threads.
// On return, result[i] == values[i] if values[i] is prime, otherwise 0.
__global__ void gpu_parallel_prime_numbers(int *values, int *result) {
	int index = blockDim.x * blockIdx.x + threadIdx.x;
	// +1 acts as a ceiling division so every element is covered even when
	// the grid size does not divide ARRAY_SIZE evenly
	int numbers_per_thread = ARRAY_SIZE / (BLOCKS_COUNT * THREADS_PER_BLOCK) + 1;
	int start = index * numbers_per_thread;
	for (int i = start; i < start + numbers_per_thread; i++) {
		if (i < ARRAY_SIZE) {  // tail guard for the last thread's chunk
			// bug fix: the original assigned/tested the raw pointers
			// (`result = 0`, `is_prime(values)`), not the elements
			result[i] = (is_prime(values[i]) == 1) ? values[i] : 0;
		}
	}
}
#ifndef ARRAY_SIZE
#define ARRAY_SIZE 2000  // fallback so the function is self-contained; matches the file-level macro
#endif

// Counts the non-zero entries of `result` (length ARRAY_SIZE) — i.e. the
// number of primes the kernel found.
// Bug fix: the original tested the pointer `result` instead of `result[i]`,
// so it always returned ARRAY_SIZE.
int get_count(int *result) {
	int count = 0;
	for (int i = 0; i < ARRAY_SIZE; i++) {
		if (result[i] != 0) {
			count++;
		}
	}
	return count;
}
// Driver: generates random numbers on the host, finds the primes on the GPU,
// then reports the kernel time and the number of primes found.
int main() {
	// host allocations
	// bug fix: h_values was a single int passed where int* is expected
	int *h_values = (int*)malloc(sizeof(int) * ARRAY_SIZE);
	int *h_result = (int*)malloc(sizeof(int) * ARRAY_SIZE);
	if (h_values == NULL || h_result == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return 1;
	}

	// generate random input values
	generateRandomValues(h_values);

	// device allocations
	int *d_values;
	int *d_result;
	// TODO: allocate GPU (device) memory for the integer arrays d_values and d_result of length ARRAY_SIZE
	cudaMalloc((void **)&d_values, sizeof(int) * ARRAY_SIZE);
	cudaMalloc((void **)&d_result, sizeof(int) * ARRAY_SIZE);

	// copy the input from host to device
	cudaMemcpy(d_values, h_values, sizeof(int) * ARRAY_SIZE, cudaMemcpyHostToDevice);

	// time the kernel with CUDA events
	cudaEvent_t gpu_start, gpu_end;
	float gpu_elapsed;
	cudaEventCreate(&gpu_start);
	cudaEventCreate(&gpu_end);
	cudaEventRecord(gpu_start, 0);
	// parallel processing with BLOCKS_COUNT blocks and THREADS_PER_BLOCK threads
	// TODO: launch the kernel with blocks = BLOCKS_COUNT, threads = THREADS_PER_BLOCK; do not forget the arguments
	gpu_parallel_prime_numbers<<<BLOCKS_COUNT, THREADS_PER_BLOCK>>>(d_values, d_result);
	cudaEventRecord(gpu_end, 0);
	cudaEventSynchronize(gpu_end);  // also ensures the kernel has finished before the D2H copy
	cudaEventElapsedTime(&gpu_elapsed, gpu_start, gpu_end);
	cudaEventDestroy(gpu_start);
	cudaEventDestroy(gpu_end);

	// bug fix: copy the RESULT array back — the original copied d_values into
	// h_values, so get_count(h_result) read uninitialized memory
	cudaMemcpy(h_result, d_result, sizeof(int) * ARRAY_SIZE, cudaMemcpyDeviceToHost);

	// gpu_elapsed is in milliseconds; divide by 1000 for seconds
	printf("Parallel GPU took: %f sec\n", gpu_elapsed / 1000);
	printf("Number of prime numbers: %d\n", get_count(h_result));

	// release host memory (bug fix: h_values was never freed; unused h_count removed)
	free(h_values);
	free(h_result);
	// TODO: release the GPU (device) memory — d_values, d_result
	cudaFree(d_values);
	cudaFree(d_result);
	return 0;
}