- /* Plugin for NVPTX execution.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
- Contributed by Mentor Embedded.
- This file is part of the GNU Offloading and Multi Processing Library
- (libgomp).
- Libgomp is free software; you can redistribute it and/or modify it
- under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3, or (at your option)
- any later version.
- Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
- Under Section 7 of GPL version 3, you are granted additional
- permissions described in the GCC Runtime Library Exception, version
- 3.1, as published by the Free Software Foundation.
- You should have received a copy of the GNU General Public License and
- a copy of the GCC Runtime Library Exception along with this program;
- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- <http://www.gnu.org/licenses/>. */
- /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
- library appears to hold some implicit state, but the documentation
- is not clear as to what that state might be, or how one might
- propagate it from one thread to another. */
- #include "openacc.h"
- #include "config.h"
- #include "libgomp-plugin.h"
- #include "oacc-ptx.h"
- #include "oacc-plugin.h"
- #include <pthread.h>
- #include <cuda.h>
- #include <stdbool.h>
- #include <stdint.h>
- #include <string.h>
- #include <stdio.h>
- #include <dlfcn.h>
- #include <unistd.h>
- #include <assert.h>
- #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
- static struct
- {
- CUresult r;
- char *m;
- } cuda_errlist[] =
- {
- { CUDA_ERROR_INVALID_VALUE, "invalid value" },
- { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
- { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
- { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
- { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
- { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
- { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
- { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
- { CUDA_ERROR_NO_DEVICE, "no device" },
- { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
- { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
- { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
- { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
- { CUDA_ERROR_MAP_FAILED, "map error" },
- { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
- { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
- { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
- { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
- { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
- { CUDA_ERROR_NOT_MAPPED, "not mapped" },
- { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
- { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
- { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
- { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
- { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
- { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
- { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
- { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
- { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- "shared object symbol not found" },
- { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
- { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
- { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
- { CUDA_ERROR_NOT_FOUND, "not found" },
- { CUDA_ERROR_NOT_READY, "not ready" },
- { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
- { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
- { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
- { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- "launch incompatibe texturing" },
- { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
- { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
- { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
- { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
- { CUDA_ERROR_ASSERT, "assert" },
- { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
- { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- "host memory already registered" },
- { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
- { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
- { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
- { CUDA_ERROR_UNKNOWN, "unknown" }
- };
- static char errmsg[128];
- static char *
- cuda_error (CUresult r)
- {
- int i;
- for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
- {
- if (cuda_errlist[i].r == r)
- return &cuda_errlist[i].m[0];
- }
- sprintf (&errmsg[0], "unknown result code: %5d", r);
- return &errmsg[0];
- }
- struct targ_fn_descriptor
- {
- CUfunction fn;
- const char *name;
- };
- static unsigned int instantiated_devices = 0;
- static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
- struct ptx_stream
- {
- CUstream stream;
- pthread_t host_thread;
- bool multithreaded;
- CUdeviceptr d;
- void *h;
- void *h_begin;
- void *h_end;
- void *h_next;
- void *h_prev;
- void *h_tail;
- struct ptx_stream *next;
- };
- /* Thread-specific data for PTX. */
- struct nvptx_thread
- {
- struct ptx_stream *current_stream;
- struct ptx_device *ptx_dev;
- };
- struct map
- {
- int async;
- size_t size;
- char mappings[0];
- };
- static void
- map_init (struct ptx_stream *s)
- {
- CUresult r;
- int size = getpagesize ();
- assert (s);
- assert (!s->d);
- assert (!s->h);
- r = cuMemAllocHost (&s->h, size);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
- r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
- assert (s->h);
- s->h_begin = s->h;
- s->h_end = s->h_begin + size;
- s->h_next = s->h_prev = s->h_tail = s->h_begin;
- assert (s->h_next);
- assert (s->h_end);
- }
- static void
- map_fini (struct ptx_stream *s)
- {
- CUresult r;
- r = cuMemFreeHost (s->h);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
- }
- static void
- map_pop (struct ptx_stream *s)
- {
- struct map *m;
- assert (s != NULL);
- assert (s->h_next);
- assert (s->h_prev);
- assert (s->h_tail);
- m = s->h_tail;
- s->h_tail += m->size;
- if (s->h_tail >= s->h_end)
- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
- if (s->h_next == s->h_tail)
- s->h_prev = s->h_next;
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
- }
- static void
- map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
- {
- int left;
- int offset;
- struct map *m;
- assert (s != NULL);
- left = s->h_end - s->h_next;
- size += sizeof (struct map);
- assert (s->h_prev);
- assert (s->h_next);
- if (size >= left)
- {
- m = s->h_prev;
- m->size += left;
- s->h_next = s->h_begin;
- if (s->h_next + size > s->h_end)
- GOMP_PLUGIN_fatal ("unable to push map");
- }
- assert (s->h_next);
- m = s->h_next;
- m->async = async;
- m->size = size;
- offset = (void *)&m->mappings[0] - s->h;
- *d = (void *)(s->d + offset);
- *h = (void *)(s->h + offset);
- s->h_prev = s->h_next;
- s->h_next += size;
- assert (s->h_prev);
- assert (s->h_next);
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
- return;
- }
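
/* A standalone sketch of the ring-buffer arithmetic implemented by
   map_push/map_pop above, using a plain malloc'd buffer instead of the
   page-locked, device-mapped page those routines operate on.  The names
   (ring, chunk, ring_*) are illustrative only and do not appear in the
   plugin itself.  */

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

struct chunk
{
  size_t size;        /* Total size of the chunk, header included.  */
  char payload[];     /* User data follows the header.  */
};

struct ring
{
  char *begin, *end;  /* Bounds of the underlying buffer.  */
  char *next;         /* Next free byte.  */
  char *prev;         /* Most recently pushed chunk.  */
  char *tail;         /* Oldest chunk still live.  */
};

static void
ring_init (struct ring *r, size_t bytes)
{
  r->begin = malloc (bytes);
  r->end = r->begin + bytes;
  r->next = r->prev = r->tail = r->begin;
}

static void *
ring_push (struct ring *r, size_t payload_bytes)
{
  size_t size = payload_bytes + sizeof (struct chunk);
  size_t left = r->end - r->next;

  if (size >= left)
    {
      /* Not enough room before the end of the buffer: pad the previous
         chunk out to the end and wrap around to the beginning.  */
      ((struct chunk *) r->prev)->size += left;
      r->next = r->begin;
      assert (r->next + size <= r->end);
    }

  struct chunk *c = (struct chunk *) r->next;
  c->size = size;
  r->prev = r->next;
  r->next += size;
  return c->payload;
}

static void
ring_pop (struct ring *r)
{
  struct chunk *c = (struct chunk *) r->tail;

  /* Retire the oldest chunk and wrap the tail if it ran past the end.  */
  r->tail += c->size;
  if (r->tail >= r->end)
    r->tail = r->begin + (r->tail - r->end);
}
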
- struct ptx_device
- {
- CUcontext ctx;
- bool ctx_shared;
- CUdevice dev;
- struct ptx_stream *null_stream;
- /* All non-null streams associated with this device (actually context),
- either created implicitly or passed in from the user (via
- acc_set_cuda_stream). */
- struct ptx_stream *active_streams;
- struct {
- struct ptx_stream **arr;
- int size;
- } async_streams;
- /* A lock for use when manipulating the above stream list and array. */
- pthread_mutex_t stream_lock;
- int ord;
- bool overlap;
- bool map;
- bool concur;
- int mode;
- bool mkern;
- struct ptx_device *next;
- };
- enum ptx_event_type
- {
- PTX_EVT_MEM,
- PTX_EVT_KNL,
- PTX_EVT_SYNC,
- PTX_EVT_ASYNC_CLEANUP
- };
- struct ptx_event
- {
- CUevent *evt;
- int type;
- void *addr;
- int ord;
- struct ptx_event *next;
- };
- struct ptx_image_data
- {
- void *target_data;
- CUmodule module;
- struct ptx_image_data *next;
- };
- static pthread_mutex_t ptx_event_lock;
- static struct ptx_event *ptx_events;
- static struct ptx_device **ptx_devices;
- static struct ptx_image_data *ptx_images = NULL;
- static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
- #define _XSTR(s) _STR(s)
- #define _STR(s) #s
- static struct _synames
- {
- char *n;
- } cuda_symnames[] =
- {
- { _XSTR (cuCtxCreate) },
- { _XSTR (cuCtxDestroy) },
- { _XSTR (cuCtxGetCurrent) },
- { _XSTR (cuCtxPushCurrent) },
- { _XSTR (cuCtxSynchronize) },
- { _XSTR (cuDeviceGet) },
- { _XSTR (cuDeviceGetAttribute) },
- { _XSTR (cuDeviceGetCount) },
- { _XSTR (cuEventCreate) },
- { _XSTR (cuEventDestroy) },
- { _XSTR (cuEventQuery) },
- { _XSTR (cuEventRecord) },
- { _XSTR (cuInit) },
- { _XSTR (cuLaunchKernel) },
- { _XSTR (cuLinkAddData) },
- { _XSTR (cuLinkComplete) },
- { _XSTR (cuLinkCreate) },
- { _XSTR (cuMemAlloc) },
- { _XSTR (cuMemAllocHost) },
- { _XSTR (cuMemcpy) },
- { _XSTR (cuMemcpyDtoH) },
- { _XSTR (cuMemcpyDtoHAsync) },
- { _XSTR (cuMemcpyHtoD) },
- { _XSTR (cuMemcpyHtoDAsync) },
- { _XSTR (cuMemFree) },
- { _XSTR (cuMemFreeHost) },
- { _XSTR (cuMemGetAddressRange) },
- { _XSTR (cuMemHostGetDevicePointer) },
- { _XSTR (cuMemHostRegister) },
- { _XSTR (cuMemHostUnregister) },
- { _XSTR (cuModuleGetFunction) },
- { _XSTR (cuModuleLoadData) },
- { _XSTR (cuStreamDestroy) },
- { _XSTR (cuStreamQuery) },
- { _XSTR (cuStreamSynchronize) },
- { _XSTR (cuStreamWaitEvent) }
- };
- static int
- verify_device_library (void)
- {
- int i;
- void *dh, *ds;
- dh = dlopen ("libcuda.so", RTLD_LAZY);
- if (!dh)
- return -1;
- for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
- {
- ds = dlsym (dh, cuda_symnames[i].n);
- if (!ds)
- {
- dlclose (dh);
- return -1;
- }
- }
- dlclose (dh);
- return 0;
- }
- static inline struct nvptx_thread *
- nvptx_thread (void)
- {
- return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
- }
- static void
- init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
- {
- int i;
- struct ptx_stream *null_stream
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
- null_stream->stream = NULL;
- null_stream->host_thread = pthread_self ();
- null_stream->multithreaded = true;
- null_stream->d = (CUdeviceptr) NULL;
- null_stream->h = NULL;
- map_init (null_stream);
- ptx_dev->null_stream = null_stream;
- ptx_dev->active_streams = NULL;
- pthread_mutex_init (&ptx_dev->stream_lock, NULL);
- if (concurrency < 1)
- concurrency = 1;
- /* This is just a guess -- make space for as many async streams as the
- current device is capable of concurrently executing. This can grow
- later as necessary. No streams are created yet. */
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
- ptx_dev->async_streams.size = concurrency;
- for (i = 0; i < concurrency; i++)
- ptx_dev->async_streams.arr[i] = NULL;
- }
- static void
- fini_streams_for_device (struct ptx_device *ptx_dev)
- {
- free (ptx_dev->async_streams.arr);
- while (ptx_dev->active_streams != NULL)
- {
- struct ptx_stream *s = ptx_dev->active_streams;
- ptx_dev->active_streams = ptx_dev->active_streams->next;
- map_fini (s);
- cuStreamDestroy (s->stream);
- free (s);
- }
- map_fini (ptx_dev->null_stream);
- free (ptx_dev->null_stream);
- }
- /* Select a stream for the (OpenACC-semantics) ASYNC argument for the current
- thread THREAD (and also the current device/context). If CREATE is true,
- create the stream if it does not exist (or use EXISTING if it is non-NULL),
- and associate the stream with THREAD. Returns the stream to use. */
- static struct ptx_stream *
- select_stream_for_async (int async, pthread_t thread, bool create,
- CUstream existing)
- {
- struct nvptx_thread *nvthd = nvptx_thread ();
- /* Local copy of TLS variable. */
- struct ptx_device *ptx_dev = nvthd->ptx_dev;
- struct ptx_stream *stream = NULL;
- int orig_async = async;
- /* The special value acc_async_noval (-1) maps (for now) to an
- implicitly-created stream, which is then handled the same as any other
- numbered async stream. Other options are available, e.g. using the null
- stream for anonymous async operations, or choosing an idle stream from an
- active set. But, stick with this for now. */
- if (async > acc_async_sync)
- async++;
- if (create)
- pthread_mutex_lock (&ptx_dev->stream_lock);
- /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
- null stream, and in fact better performance may be obtainable if it doesn't
- (because the null stream enforces overly-strict synchronisation with
- respect to other streams for legacy reasons, and that's probably not
- needed with OpenACC). Maybe investigate later. */
- if (async == acc_async_sync)
- stream = ptx_dev->null_stream;
- else if (async >= 0 && async < ptx_dev->async_streams.size
- && ptx_dev->async_streams.arr[async] && !(create && existing))
- stream = ptx_dev->async_streams.arr[async];
- else if (async >= 0 && create)
- {
- if (async >= ptx_dev->async_streams.size)
- {
- int i, newsize = ptx_dev->async_streams.size * 2;
- if (async >= newsize)
- newsize = async + 1;
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
- newsize * sizeof (struct ptx_stream *));
- for (i = ptx_dev->async_streams.size; i < newsize; i++)
- ptx_dev->async_streams.arr[i] = NULL;
- ptx_dev->async_streams.size = newsize;
- }
- /* Create a new stream on-demand if there isn't one already, or if we're
- setting a particular async value to an existing (externally-provided)
- stream. */
- if (!ptx_dev->async_streams.arr[async] || existing)
- {
- CUresult r;
- struct ptx_stream *s
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
- if (existing)
- s->stream = existing;
- else
- {
- r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
- }
- /* If CREATE is true, we're going to be queueing some work on this
- stream. Associate it with the current host thread. */
- s->host_thread = thread;
- s->multithreaded = false;
- s->d = (CUdeviceptr) NULL;
- s->h = NULL;
- map_init (s);
- s->next = ptx_dev->active_streams;
- ptx_dev->active_streams = s;
- ptx_dev->async_streams.arr[async] = s;
- }
- stream = ptx_dev->async_streams.arr[async];
- }
- else if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
- if (create)
- {
- assert (stream != NULL);
- /* If we're trying to use the same stream from different threads
- simultaneously, set stream->multithreaded to true. This affects the
- behaviour of acc_async_test_all and acc_wait_all, which are supposed to
- only wait for asynchronous launches from the same host thread they are
- invoked on. If multiple threads use the same async value, we make note
- of that here and fall back to testing/waiting for all threads in those
- functions. */
- if (thread != stream->host_thread)
- stream->multithreaded = true;
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- }
- else if (stream && !stream->multithreaded
- && !pthread_equal (stream->host_thread, thread))
- GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
- return stream;
- }
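
/* A hypothetical helper, not used elsewhere, that makes explicit the slot
   layout implied by the "async++" adjustment in select_stream_for_async:
   which index of ptx_dev->async_streams.arr a given OpenACC async value
   lands in.  acc_async_noval is -1 (see the comment above); acc_async_sync
   is -2 in the OpenACC 2.0 headers libgomp uses.  */

static int
async_to_slot (int async)
{
  if (async == acc_async_sync)
    return -1;          /* Synchronous: the null stream, no array slot.  */
  if (async < acc_async_sync)
    return -2;          /* Invalid; select_stream_for_async reports it.  */
  return async + 1;     /* noval (-1) -> arr[0], 0 -> arr[1], N -> arr[N + 1].  */
}
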
- /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
- should be locked on entry and remains locked on exit. */
- static bool
- nvptx_init (void)
- {
- CUresult r;
- int rc;
- int ndevs;
- if (instantiated_devices != 0)
- return true;
- rc = verify_device_library ();
- if (rc < 0)
- return false;
- r = cuInit (0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
- ptx_events = NULL;
- pthread_mutex_init (&ptx_event_lock, NULL);
- r = cuDeviceGetCount (&ndevs);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
- ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
- * ndevs);
- return true;
- }
- /* Select the N'th PTX device for the current host thread. The device must
- have been opened before calling this function. */
- static void
- nvptx_attach_host_thread_to_device (int n)
- {
- CUdevice dev;
- CUresult r;
- struct ptx_device *ptx_dev;
- CUcontext thd_ctx;
- r = cuCtxGetDevice (&dev);
- if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
- GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
- if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
- return;
- else
- {
- CUcontext old_ctx;
- ptx_dev = ptx_devices[n];
- assert (ptx_dev);
- r = cuCtxGetCurrent (&thd_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
- /* We don't necessarily have a current context (e.g. if it has been
- destroyed). Pop it if we do, though. */
- if (thd_ctx != NULL)
- {
- r = cuCtxPopCurrent (&old_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
- }
- r = cuCtxPushCurrent (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
- }
- }
- static struct ptx_device *
- nvptx_open_device (int n)
- {
- struct ptx_device *ptx_dev;
- CUdevice dev, ctx_dev;
- CUresult r;
- int async_engines, pi;
- r = cuDeviceGet (&dev, n);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
- ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
- ptx_dev->ord = n;
- ptx_dev->dev = dev;
- ptx_dev->ctx_shared = false;
- r = cuCtxGetDevice (&ctx_dev);
- if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
- GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
-
- if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
- {
- /* The current host thread has an active context for a different device.
- Detach it. */
- CUcontext old_ctx;
-
- r = cuCtxPopCurrent (&old_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
- }
- r = cuCtxGetCurrent (&ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
- if (!ptx_dev->ctx)
- {
- r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
- }
- else
- ptx_dev->ctx_shared = true;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
- ptx_dev->overlap = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
- ptx_dev->map = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
- ptx_dev->concur = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
- ptx_dev->mode = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
- ptx_dev->mkern = pi;
- r = cuDeviceGetAttribute (&async_engines,
- CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
- if (r != CUDA_SUCCESS)
- async_engines = 1;
- init_streams_for_device (ptx_dev, async_engines);
- return ptx_dev;
- }
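
/* The context-adoption behaviour of nvptx_open_device above, seen from
   interoperating host code: if the application has already made a CUDA
   context current on this thread for the same device, the plugin reuses it
   (ctx_shared is set) rather than creating a second one.  A sketch only;
   the helper name and device number are illustrative.  */

static void
example_share_existing_context (int devno)
{
  CUcontext app_ctx;
  CUdevice dev;
  CUresult r = cuDeviceGet (&dev, devno);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  r = cuCtxCreate (&app_ctx, CU_CTX_SCHED_AUTO, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));

  /* A subsequent GOMP_OFFLOAD_init_device (devno) -> nvptx_open_device now
     finds APP_CTX current and adopts it instead of creating its own.  */
}
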
- static void
- nvptx_close_device (struct ptx_device *ptx_dev)
- {
- CUresult r;
- if (!ptx_dev)
- return;
- fini_streams_for_device (ptx_dev);
- if (!ptx_dev->ctx_shared)
- {
- r = cuCtxDestroy (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
- }
- free (ptx_dev);
- }
- static int
- nvptx_get_num_devices (void)
- {
- int n;
- CUresult r;
- /* PR libgomp/65099: Currently, we only support offloading in 64-bit
- configurations. */
- if (sizeof (void *) != 8)
- return 0;
- /* This function will be called before the plugin has been initialized in
- order to enumerate available devices, but CUDA API routines can't be used
- until cuInit has been called. Just call it now (but don't yet do any
- further initialization). */
- if (instantiated_devices == 0)
- cuInit (0);
- r = cuDeviceGetCount (&n);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
- return n;
- }
- static void
- link_ptx (CUmodule *module, char *ptx_code)
- {
- CUjit_option opts[7];
- void *optvals[7];
- float elapsed = 0.0;
- #define LOGSIZE 8192
- char elog[LOGSIZE];
- char ilog[LOGSIZE];
- unsigned long logsize = LOGSIZE;
- CUlinkState linkstate;
- CUresult r;
- void *linkout;
- size_t linkoutsize __attribute__ ((unused));
- GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);
- opts[0] = CU_JIT_WALL_TIME;
- optvals[0] = &elapsed;
- opts[1] = CU_JIT_INFO_LOG_BUFFER;
- optvals[1] = &ilog[0];
- opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
- optvals[2] = (void *) logsize;
- opts[3] = CU_JIT_ERROR_LOG_BUFFER;
- optvals[3] = &elog[0];
- opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
- optvals[4] = (void *) logsize;
- opts[5] = CU_JIT_LOG_VERBOSE;
- optvals[5] = (void *) 1;
- opts[6] = CU_JIT_TARGET;
- optvals[6] = (void *) CU_TARGET_COMPUTE_30;
- r = cuLinkCreate (7, opts, optvals, &linkstate);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
- char *abort_ptx = ABORT_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
- strlen (abort_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
- }
- char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
- strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
- cuda_error (r));
- }
- char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
- strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
- cuda_error (r));
- }
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
- strlen (ptx_code) + 1, 0, 0, 0, 0);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
- }
- r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
- GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
- GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
- r = cuModuleLoadData (module, linkout);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
- }
- static void
- event_gc (bool memmap_lockable)
- {
- struct ptx_event *ptx_event = ptx_events;
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_mutex_lock (&ptx_event_lock);
- while (ptx_event != NULL)
- {
- CUresult r;
- struct ptx_event *e = ptx_event;
- ptx_event = ptx_event->next;
- if (e->ord != nvthd->ptx_dev->ord)
- continue;
- r = cuEventQuery (*e->evt);
- if (r == CUDA_SUCCESS)
- {
- CUevent *te;
- te = e->evt;
- switch (e->type)
- {
- case PTX_EVT_MEM:
- case PTX_EVT_SYNC:
- break;
- case PTX_EVT_KNL:
- map_pop (e->addr);
- break;
- case PTX_EVT_ASYNC_CLEANUP:
- {
- /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
- memory-map splay tree lock for the current device, so we
- can't call it when one of our callers has already claimed
- the lock. In that case, just delay the GC for this event
- until later. */
- if (!memmap_lockable)
- continue;
- GOMP_PLUGIN_async_unmap_vars (e->addr);
- }
- break;
- }
- cuEventDestroy (*te);
- free ((void *)te);
- if (ptx_events == e)
- ptx_events = ptx_events->next;
- else
- {
- struct ptx_event *e_ = ptx_events;
- while (e_->next != e)
- e_ = e_->next;
- e_->next = e_->next->next;
- }
- free (e);
- }
- }
- pthread_mutex_unlock (&ptx_event_lock);
- }
- static void
- event_add (enum ptx_event_type type, CUevent *e, void *h)
- {
- struct ptx_event *ptx_event;
- struct nvptx_thread *nvthd = nvptx_thread ();
- assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
- || type == PTX_EVT_ASYNC_CLEANUP);
- ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
- ptx_event->type = type;
- ptx_event->evt = e;
- ptx_event->addr = h;
- ptx_event->ord = nvthd->ptx_dev->ord;
- pthread_mutex_lock (&ptx_event_lock);
- ptx_event->next = ptx_events;
- ptx_events = ptx_event;
- pthread_mutex_unlock (&ptx_event_lock);
- }
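
/* The recurring bookkeeping idiom used by the asynchronous paths below
   (nvptx_exec, nvptx_host2dev, nvptx_dev2host, nvptx_wait_async, ...),
   shown in isolation: after queueing work on a stream, record a fresh event
   behind it and register the event for deferred cleanup by event_gc.  A
   sketch only; the helper name is illustrative and the file inlines this
   sequence at each call site.  */

static void
note_async_completion (enum ptx_event_type type, CUstream stream, void *addr,
                       bool memmap_lockable)
{
  CUresult r;
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  /* Reap any already-completed events first; callers that may hold the
     memory-map lock pass MEMMAP_LOCKABLE as false.  */
  event_gc (memmap_lockable);

  /* The event completes once everything queued on STREAM so far has run.  */
  r = cuEventRecord (*e, stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (type, e, addr);
}
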
- void
- nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
- int vector_length, int async, void *targ_mem_desc)
- {
- struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
- CUfunction function;
- CUresult r;
- int i;
- struct ptx_stream *dev_str;
- void *kargs[1];
- void *hp, *dp;
- unsigned int nthreads_in_block;
- struct nvptx_thread *nvthd = nvptx_thread ();
- const char *maybe_abort_msg = "(perhaps abort was called)";
- function = targ_fn->fn;
- dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
- assert (dev_str == nvthd->current_stream);
- /* This reserves a chunk of a pre-allocated page of memory mapped on both
- the host and the device. HP is a host pointer to the new chunk, and DP is
- the corresponding device pointer. */
- map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
- /* Copy the array of arguments to the mapped page. */
- for (i = 0; i < mapnum; i++)
- ((void **) hp)[i] = devaddrs[i];
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
- fact have the same value on a unified-memory system). */
- r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
- GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
- // OpenACC         CUDA
- //
- // num_gangs       blocks
- // num_workers     warps (where a warp is equivalent to 32 threads)
- // vector_length   threads
- //
- /* The OpenACC vector_length clause 'determines the vector length to use for
- vector or SIMD operations'. The question is how to map this to CUDA.
- In CUDA, the warp size is the vector length of a CUDA device. However, the
- CUDA interface abstracts away from that, and only exposes the warp size
- indirectly, in the maximum number of threads per block, which is a product
- of the warp size and the number of hyperthreads of a multiprocessor.
- We choose to map OpenACC vector_length directly onto the number of threads
- in a block, in the x dimension. This is reflected in GCC code generation
- that uses threadIdx.x to access vector elements.
- Attempting to use an OpenACC vector_length greater than the maximum number
- of threads per block will result in a CUDA error. */
- nthreads_in_block = vector_length;
- kargs[0] = &dp;
- r = cuLaunchKernel (function,
- num_gangs, 1, 1,
- nthreads_in_block, 1, 1,
- 0, dev_str->stream, kargs, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
- #ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
- {
- r = cuStreamSynchronize (dev_str->stream);
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
- }
- else
- {
- CUevent *e;
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
- event_gc (true);
- r = cuEventRecord (*e, dev_str->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_KNL, e, (void *)dev_str);
- }
- #else
- r = cuCtxSynchronize ();
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
- #endif
- GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
- targ_fn->name);
- #ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
- #endif
- map_pop (dev_str);
- }
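
/* A worked example of the launch geometry nvptx_exec requests above.  In
   this version of the plugin only num_gangs and vector_length reach
   cuLaunchKernel: gangs become CUDA blocks in the x grid dimension and
   vector_length becomes threads per block in x; num_workers is not yet
   reflected in the launch configuration.  The values and the helper name
   below are illustrative only.  */

static void
example_launch_geometry (CUfunction function, CUstream stream, void *dp)
{
  int num_gangs = 32;        /* OpenACC gangs -> 32 CUDA blocks (grid x).  */
  int vector_length = 128;   /* OpenACC vector -> 128 threads per block
                                (block x), i.e. 4 warps of 32 threads.  */
  void *kargs[1];
  CUresult r;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,          /* Grid dimensions.  */
                      vector_length, 1, 1,      /* Block dimensions.  */
                      0, stream, kargs, 0);     /* No dynamic shared memory.  */
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
}
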
- void * openacc_get_current_cuda_context (void);
- static void *
- nvptx_alloc (size_t s)
- {
- CUdeviceptr d;
- CUresult r;
- r = cuMemAlloc (&d, s);
- if (r == CUDA_ERROR_OUT_OF_MEMORY)
- return 0;
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
- return (void *)d;
- }
- static void
- nvptx_free (void *p)
- {
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
- if ((CUdeviceptr)p != pb)
- GOMP_PLUGIN_fatal ("invalid device address");
- r = cuMemFree ((CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
- }
- static void *
- nvptx_host2dev (void *d, const void *h, size_t s)
- {
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
- if (!s)
- return 0;
- if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
- if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
- if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
- if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
- if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
- #ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e;
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
- event_gc (false);
- r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_MEM, e, (void *)h);
- }
- else
- #endif
- {
- r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
- }
- return 0;
- }
- static void *
- nvptx_dev2host (void *h, const void *d, size_t s)
- {
- CUresult r;
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
- if (!s)
- return 0;
- if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
- if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
- if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
- if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
- if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
- #ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e;
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
- event_gc (false);
- r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_MEM, e, (void *)h);
- }
- else
- #endif
- {
- r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
- }
- return 0;
- }
- static void
- nvptx_set_async (int async)
- {
- struct nvptx_thread *nvthd = nvptx_thread ();
- nvthd->current_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
- }
- static int
- nvptx_async_test (int async)
- {
- CUresult r;
- struct ptx_stream *s;
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- {
- /* The oacc-parallel.c:goacc_wait function calls this hook to determine
- whether all work has completed on this stream, and if so omits the call
- to the wait hook. If that happens, event_gc might not get called
- (which prevents variables from getting unmapped and their associated
- device storage freed), so call it here. */
- event_gc (true);
- return 1;
- }
- else if (r == CUDA_ERROR_NOT_READY)
- return 0;
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
- return 0;
- }
- static int
- nvptx_async_test_all (void)
- {
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if ((s->multithreaded || pthread_equal (s->host_thread, self))
- && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
- {
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- return 0;
- }
- }
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- event_gc (true);
- return 1;
- }
- static void
- nvptx_wait (int async)
- {
- CUresult r;
- struct ptx_stream *s;
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
- event_gc (true);
- }
- static void
- nvptx_wait_async (int async1, int async2)
- {
- CUresult r;
- CUevent *e;
- struct ptx_stream *s1, *s2;
- pthread_t self = pthread_self ();
- /* The stream that is waiting (rather than being waited for) doesn't
- necessarily have to exist already. */
- s2 = select_stream_for_async (async2, self, true, NULL);
- s1 = select_stream_for_async (async1, self, false, NULL);
- if (!s1)
- GOMP_PLUGIN_fatal ("invalid async 1\n");
- if (s1 == s2)
- GOMP_PLUGIN_fatal ("identical parameters");
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
- event_gc (true);
- r = cuEventRecord (*e, s1->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_SYNC, e, NULL);
- r = cuStreamWaitEvent (s2->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
- }
- static void
- nvptx_wait_all (void)
- {
- CUresult r;
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
- /* Wait for active streams initiated by this thread (or by multiple threads)
- to complete. */
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if (s->multithreaded || pthread_equal (s->host_thread, self))
- {
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- continue;
- else if (r != CUDA_ERROR_NOT_READY)
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
- }
- }
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- event_gc (true);
- }
- static void
- nvptx_wait_all_async (int async)
- {
- CUresult r;
- struct ptx_stream *waiting_stream, *other_stream;
- CUevent *e;
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_t self = pthread_self ();
- /* The stream doing the waiting. This could be the first mention of the
- stream, so create it if necessary. */
- waiting_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
- /* Launches on the null stream already block on other streams in the
- context. */
- if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
- return;
- event_gc (true);
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
- for (other_stream = nvthd->ptx_dev->active_streams;
- other_stream != NULL;
- other_stream = other_stream->next)
- {
- if (!other_stream->multithreaded
- && !pthread_equal (other_stream->host_thread, self))
- continue;
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
- /* Record an event on the waited-for stream. */
- r = cuEventRecord (*e, other_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_SYNC, e, NULL);
- r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
- }
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- }
- static void *
- nvptx_get_current_cuda_device (void)
- {
- struct nvptx_thread *nvthd = nvptx_thread ();
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
- return &nvthd->ptx_dev->dev;
- }
- static void *
- nvptx_get_current_cuda_context (void)
- {
- struct nvptx_thread *nvthd = nvptx_thread ();
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
- return nvthd->ptx_dev->ctx;
- }
- static void *
- nvptx_get_cuda_stream (int async)
- {
- struct ptx_stream *s;
- struct nvptx_thread *nvthd = nvptx_thread ();
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- return s ? s->stream : NULL;
- }
- static int
- nvptx_set_cuda_stream (int async, void *stream)
- {
- struct ptx_stream *oldstream;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
- if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
- /* We have a list of active streams and an array mapping async values to
- entries of that list. We need to take "ownership" of the passed-in stream,
- and add it to our list, removing the previous entry also (if there was one)
- in order to prevent resource leaks. Note the potential for surprise
- here: maybe we should keep track of passed-in streams and leave it up to
- the user to tidy those up, but that doesn't work for stream handles
- returned from acc_get_cuda_stream above... */
- oldstream = select_stream_for_async (async, self, false, NULL);
- if (oldstream)
- {
- if (nvthd->ptx_dev->active_streams == oldstream)
- nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
- else
- {
- struct ptx_stream *s = nvthd->ptx_dev->active_streams;
- while (s->next != oldstream)
- s = s->next;
- s->next = s->next->next;
- }
- cuStreamDestroy (oldstream->stream);
- map_fini (oldstream);
- free (oldstream);
- }
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- (void) select_stream_for_async (async, self, true, (CUstream) stream);
- return 1;
- }
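
/* A sketch of the user-level interop sequence nvptx_set_cuda_stream exists
   to support: application code creates its own CUstream and hands it to the
   OpenACC runtime for a particular async queue, after which work queued with
   that async value runs on the user's stream.  Assumes the acc_set_cuda_stream
   prototype from openacc.h; the helper name and the async value 5 are
   illustrative only.  */

static void
example_adopt_user_stream (void)
{
  CUstream user_stream;
  CUresult r = cuStreamCreate (&user_stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));

  /* Hand ownership of the stream to the OpenACC runtime for async queue 5;
     subsequent async(5) constructs are queued on USER_STREAM.  */
  if (!acc_set_cuda_stream (5, user_stream))
    GOMP_PLUGIN_fatal ("acc_set_cuda_stream failed");
}
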
- /* Plugin entry points. */
- const char *
- GOMP_OFFLOAD_get_name (void)
- {
- return "nvptx";
- }
- unsigned int
- GOMP_OFFLOAD_get_caps (void)
- {
- return GOMP_OFFLOAD_CAP_OPENACC_200;
- }
- int
- GOMP_OFFLOAD_get_type (void)
- {
- return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
- }
- int
- GOMP_OFFLOAD_get_num_devices (void)
- {
- return nvptx_get_num_devices ();
- }
- void
- GOMP_OFFLOAD_init_device (int n)
- {
- pthread_mutex_lock (&ptx_dev_lock);
- if (!nvptx_init () || ptx_devices[n] != NULL)
- {
- pthread_mutex_unlock (&ptx_dev_lock);
- return;
- }
- ptx_devices[n] = nvptx_open_device (n);
- instantiated_devices++;
- pthread_mutex_unlock (&ptx_dev_lock);
- }
- void
- GOMP_OFFLOAD_fini_device (int n)
- {
- pthread_mutex_lock (&ptx_dev_lock);
- if (ptx_devices[n] != NULL)
- {
- nvptx_attach_host_thread_to_device (n);
- nvptx_close_device (ptx_devices[n]);
- ptx_devices[n] = NULL;
- instantiated_devices--;
- }
- pthread_mutex_unlock (&ptx_dev_lock);
- }
- int
- GOMP_OFFLOAD_load_image (int ord, void *target_data,
- struct addr_pair **target_table)
- {
- CUmodule module;
- char **fn_names, **var_names;
- unsigned int fn_entries, var_entries, i, j;
- CUresult r;
- struct targ_fn_descriptor *targ_fns;
- void **img_header = (void **) target_data;
- struct ptx_image_data *new_image;
- GOMP_OFFLOAD_init_device (ord);
- nvptx_attach_host_thread_to_device (ord);
- link_ptx (&module, img_header[0]);
- pthread_mutex_lock (&ptx_image_lock);
- new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
- new_image->target_data = target_data;
- new_image->module = module;
- new_image->next = ptx_images;
- ptx_images = new_image;
- pthread_mutex_unlock (&ptx_image_lock);
- /* The mkoffload utility emits a table of pointers/integers at the start of
- each offload image:
- img_header[0] -> ptx code
- img_header[1] -> number of variables
- img_header[2] -> array of variable names (pointers to strings)
- img_header[3] -> number of kernels
- img_header[4] -> array of kernel names (pointers to strings)
- The array of kernel names and the array of function addresses are in
- one-to-one correspondence. */
- var_entries = (uintptr_t) img_header[1];
- var_names = (char **) img_header[2];
- fn_entries = (uintptr_t) img_header[3];
- fn_names = (char **) img_header[4];
- *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
- * (fn_entries + var_entries));
- targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
- * fn_entries);
- for (i = 0; i < fn_entries; i++)
- {
- CUfunction function;
- r = cuModuleGetFunction (&function, module, fn_names[i]);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
- targ_fns[i].fn = function;
- targ_fns[i].name = (const char *) fn_names[i];
- (*target_table)[i].start = (uintptr_t) &targ_fns[i];
- (*target_table)[i].end = (*target_table)[i].start + 1;
- }
- for (j = 0; j < var_entries; j++, i++)
- {
- CUdeviceptr var;
- size_t bytes;
- r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
- (*target_table)[i].start = (uintptr_t) var;
- (*target_table)[i].end = (*target_table)[i].start + bytes;
- }
- return i;
- }
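
/* The offload image layout described in the comment above, expressed as a
   struct purely for clarity.  mkoffload emits a raw table of pointers and
   integers, not this type; the struct and its name are illustrative of how
   img_header is indexed in GOMP_OFFLOAD_load_image.  */

struct nvptx_image_header_sketch
{
  const char *ptx_code;            /* img_header[0]: NUL-terminated PTX.  */
  uintptr_t var_entries;           /* img_header[1]: number of variables.  */
  const char *const *var_names;    /* img_header[2]: variable name strings.  */
  uintptr_t fn_entries;            /* img_header[3]: number of kernels.  */
  const char *const *fn_names;     /* img_header[4]: kernel name strings.  */
};
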
- void
- GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
- {
- void **img_header = (void **) target_data;
- struct targ_fn_descriptor *targ_fns
- = (struct targ_fn_descriptor *) img_header[0];
- struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
- free (targ_fns);
- pthread_mutex_lock (&ptx_image_lock);
- for (image = ptx_images; image != NULL;)
- {
- struct ptx_image_data *next = image->next;
- if (image->target_data == target_data)
- {
- cuModuleUnload (image->module);
- free (image);
- if (prev)
- prev->next = next;
- }
- else
- {
- prev = image;
- if (!newhd)
- newhd = image;
- }
- image = next;
- }
- ptx_images = newhd;
- pthread_mutex_unlock (&ptx_image_lock);
- }
- void *
- GOMP_OFFLOAD_alloc (int ord, size_t size)
- {
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_alloc (size);
- }
- void
- GOMP_OFFLOAD_free (int ord, void *ptr)
- {
- nvptx_attach_host_thread_to_device (ord);
- nvptx_free (ptr);
- }
- void *
- GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
- {
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_dev2host (dst, src, n);
- }
- void *
- GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
- {
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_host2dev (dst, src, n);
- }
- void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
- void
- GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
- void **hostaddrs, void **devaddrs, size_t *sizes,
- unsigned short *kinds, int num_gangs,
- int num_workers, int vector_length, int async,
- void *targ_mem_desc)
- {
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
- num_workers, vector_length, async, targ_mem_desc);
- }
- void
- GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
- {
- CUevent *e;
- CUresult r;
- struct nvptx_thread *nvthd = nvptx_thread ();
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
- }
- int
- GOMP_OFFLOAD_openacc_async_test (int async)
- {
- return nvptx_async_test (async);
- }
- int
- GOMP_OFFLOAD_openacc_async_test_all (void)
- {
- return nvptx_async_test_all ();
- }
- void
- GOMP_OFFLOAD_openacc_async_wait (int async)
- {
- nvptx_wait (async);
- }
- void
- GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
- {
- nvptx_wait_async (async1, async2);
- }
- void
- GOMP_OFFLOAD_openacc_async_wait_all (void)
- {
- nvptx_wait_all ();
- }
- void
- GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
- {
- nvptx_wait_all_async (async);
- }
- void
- GOMP_OFFLOAD_openacc_async_set_async (int async)
- {
- nvptx_set_async (async);
- }
- void *
- GOMP_OFFLOAD_openacc_create_thread_data (int ord)
- {
- struct ptx_device *ptx_dev;
- struct nvptx_thread *nvthd
- = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
- CUresult r;
- CUcontext thd_ctx;
- ptx_dev = ptx_devices[ord];
- assert (ptx_dev);
- r = cuCtxGetCurrent (&thd_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
- assert (ptx_dev->ctx);
- if (!thd_ctx)
- {
- r = cuCtxPushCurrent (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
- }
- nvthd->current_stream = ptx_dev->null_stream;
- nvthd->ptx_dev = ptx_dev;
- return (void *) nvthd;
- }
- void
- GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
- {
- free (data);
- }
- void *
- GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
- {
- return nvptx_get_current_cuda_device ();
- }
- void *
- GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
- {
- return nvptx_get_current_cuda_context ();
- }
- /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
- void *
- GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
- {
- return nvptx_get_cuda_stream (async);
- }
- /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
- int
- GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
- {
- return nvptx_set_cuda_stream (async, stream);
- }