12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840 |
- Scientific Programming and Computer Architecture -
- https://divakarvi.github.io/bk-spca/spca.html
- Parallel processing in C -
- https://berkeley-scf.github.io/tutorial-parallelization/parallel-C
- Parallel Programming for Multicore Machines using OpenMP and MPI -
- https://ocw.mit.edu/courses/12-950-parallel-programming-for-multicore-machines-using-
- openmp-and-mpi-january-iap-2010/pages/syllabus/
- The GNU C Library Reference Manual -
- https://www.gnu.org/software/libc/manual/html_node/index.html#SEC_Contents
- /* compile */
- gcc -ansi -pedantic -Wall -O
- /* libraries for scientific computing */
- * GMP for integer and rational arithmetic:
- http://gmplib.org
- * MPFR for correctly rounded real floating-point arithmetic:
- http://www.mpfr.org
- * MPC for correctly rounded complex floating-point arithmetic:
- http://mpc.multiprecision.org
- /* keywords */
- Keywords are the words whose meaning has already been explained to the C
- compiler. ANSI C (C89) defines 32 keywords (later standards add more). The keywords are also
- called 'Reserved words'.
- auto double int struct
- break else long switch
- case enum register typedef
- char extern return union
- const float short unsigned
- continue for signed void
- default goto sizeof volatile
- do if static while
- /* data types */
- Data Type Size (bytes) Range Format Specifier
- ---------------------------------------------------------------------------------------
- short int 2 -32,768 to 32,767 %hd
- unsigned short int 2 0 to 65,535 %hu
- unsigned int 4 0 to 4,294,967,295 %u
- int 4 -2,147,483,648 to 2,147,483,647 %d
- long int 4 -2,147,483,648 to 2,147,483,647 %ld
- unsigned long int 4 0 to 4,294,967,295 %lu
- long long int 8 -(2^63) to (2^63)-1 %lld
- unsigned long long int 8 0 to 18,446,744,073,709,551,615 %llu
- signed char 1 -128 to 127 %c
- unsigned char 1 0 to 255 %c
- float 4 1.2E-38 to 3.4E+38 %f
- double 8 1.7E-308 to 1.7E+308 %lf
- long double 16 3.4E-4932 to 1.1E+4932 %Lf
- /* mathematical functions */ <math.h>
- abs computes absolute value of an integer value
- labs computes absolute value of an integer value
- llabs computes absolute value of an integer value
- fabs computes absolute value of a floating-point value
- div computes the quotient and remainder of integer division
- ldiv computes the quotient and remainder of integer division
- lldiv computes the quotient and remainder of integer division
- fmod remainder of the floating-point division operation
- remainder signed remainder of the division operation
- remquo signed remainder as well as the three last bits of the division
- fma fused multiply-add operation
- fmax larger of two floating-point values
- fmin smaller of two floating-point values
- fdim positive difference of two floating-point values
- nan returns a NaN (not-a-number)
- nanf returns a NaN (not-a-number)
- nanl returns a NaN (not-a-number)
- exp returns e raised to the given power
- exp2 returns 2 raised to the given power
- expm1 returns e raised to the given power, minus one
- log computes natural logarithm (to base e)
- log2 computes binary logarithm (to base 2)
- log10 computes common logarithm (to base 10)
- log1p computes natural logarithm (to base e) of 1 plus the given number
- ilogb extracts exponent of the number
- logb extracts exponent of the number
- sqrt computes square root
- cbrt computes cubic root
- hypot computes square root of the sum of the squares of two given numbers
- pow raises a number to the given power
- sin computes sine
- cos computes cosine
- tan computes tangent
- asin computes arc sine
- acos computes arc cosine
- atan computes arc tangent
- atan2 computes arc tangent, using signs to determine quadrants
- sinh computes hyperbolic sine
- cosh computes hyperbolic cosine
- tanh computes hyperbolic tangent
- asinh computes hyperbolic arc sine
- acosh computes hyperbolic arc cosine
- atanh computes hyperbolic arc tangent
- erf computes error function
- erfc computes complementary error function
- lgamma computes natural logarithm of the absolute value of the gamma function
- tgamma computes gamma function
- ceil returns the nearest integer not less than the given value
- floor returns the nearest integer not greater than the given value
- trunc returns the nearest integer not greater in magnitude than given value
- round returns the nearest integer rounding away from zero in halfway cases
- lround returns the nearest integer rounding away from zero in halfway cases
- llround returns the nearest integer rounding away from zero in halfway cases
- nearbyint returns the nearest integer using current rounding mode
- rint returns the nearest integer using current rounding mode with exception
- lrint returns the nearest integer using current rounding mode with exception
- llrint returns the nearest integer using current rounding mode with exception
- frexp decomposes a number into significand and a power of 2
- ldexp multiplies a number by 2 raised to a power
- modf decomposes a number into integer and fractional parts
- scalbn multiplies a number by FLT_RADIX raised to a power
- scalbln multiplies a number by FLT_RADIX raised to a power
- nextafter returns next representable floating-point value towards the given value
- nexttoward returns next representable floating-point value towards the given value
- copysign copies the sign of a floating-point value
- fpclassify categorizes the given floating-point value
- isfinite checks if the argument has finite value
- isinf checks if the argument is infinite
- isnan checks if the argument is NaN
- isnormal checks if the argument is normal
- signbit checks if the sign of the argument is negative
- /* floating-point environment */ <fenv.h>
- feclearexcept clears exceptions
- fegetenv stores current floating-point environment
- fegetexceptflag stores current status flags
- fegetround retrieves current rounding direction
- feholdexcept saves current floating-point environment and clears all exceptions
- feraiseexcept raises a floating-point exception
- fesetenv sets current floating-point environment
- fesetexceptflag sets current status flags
- fesetround sets current rounding direction
- fetestexcept tests whether certain exceptions have been raised
- feupdateenv restores floating-point environment, but keep current exceptions
- /* complex numbers */ <complex.h>
- cabs computes absolute value
- carg computes argument of a complex number
- cimag computes imaginary part of a complex number
- creal computes real part of a complex number
- conj computes complex conjugate
- cproj computes complex projection into the Riemann sphere
- cexp computes complex exponential
- clog computes complex logarithm
- csqrt computes complex square root
- cpow computes complex power
- csin computes complex sine
- ccos computes complex cosine
- ctan computes complex tangent
- casin computes complex arc sine
- cacos computes complex arc cosine
- catan computes complex arc tangent
- csinh computes complex hyperbolic sine
- ccosh computes complex hyperbolic cosine
- ctanh computes complex hyperbolic tangent
- casinh computes complex hyperbolic arc sine
- cacosh computes complex hyperbolic arc cosine
- catanh computes complex hyperbolic arc tangent
- /* random-number generation */ <stdlib.h>
- rand generates a pseudo-random number between 0 and RAND_MAX, inclusive
- srand initializes a pseudo-random number generator
- arc4random generates a pseudo-random number between 0 and UINT32_MAX
- arc4random_uniform generates a pseudo-random number between 0 and a maximum value
- arc4random_buf fill a buffer with a pseudo-random bitstream
- arc4random_stir initializes a pseudo-random number generator
- Note: arc4random algorithm better than rand
- /* width and precision specification */
- %d print as decimal integer
- %6d print as decimal integer, at least 6 characters wide
- %f print as floating point
- %6f print as floating point, at least 6 characters wide
- %.2f print as floating point, 2 characters after decimal point
- %6.0f print as floating point, at least 6 wide and no decimal point
- %6.2f print as floating point, at least 6 wide and 2 after decimal point
- %o print as octal
- %x print as hexadecimal
- %c print as character
- %s print as character string
- %% print as % itself
- /* the length of an array A */
- (sizeof A)/(sizeof A[0])
- /* determine the size of an array */
- const char arr[] = "string";
- printf("Size of arr %lu\n", (unsigned long)sizeof(arr));
- /* static program analysis */
- gcc -Wall -pedantic program.c
- mpicc -Wall -pedantic program.c
- /* finding dynamic memory errors */
- The rules for dynamic memory use include these:
- * The number of allocation calls (calls to malloc) must exactly match the number
- of deallocation calls (calls to free).
- * Reads and writes to the allocated memory must occur within the memory, not
- outside its range.
- * The allocated memory cannot be used before it is allocated or after it is
- deallocated.
- /* initialize all the elements to 0 */
- int arr[5] = {0};
- /* use a designated initializer on the range (the [a ... b] form is a GNU extension) */
- int arr[9] = { [0 ... 8] = 10 };
- /* setting only array index arr[0], arr[8] as 0,
- while the others are designated initialized to 10 */
- int arr[9] = { 0, [1 ... 7] = 10, 0 };
- /* malloc */
- the malloc() function does not initialize the allocated memory to zero
- int *arr = (int *) malloc(10 * sizeof(*arr)); /* allocate 10 ints */
- double *arr = (double *) malloc(10 * sizeof(*arr)); /* allocate 10 doubles */
- assert(arr); /* verify that malloc succeeded */
- /* calloc */
- the calloc() function initializes the allocated memory to zero and
- it's the recommended way to allocate memory for arrays.
- int *arr = (int *) calloc(size_t nmemb, size_t size);
- int *arr = (int *) calloc(10, sizeof(*arr));
- * calloc accepts two arguments, whereas malloc accepts one. nmemb represents
- number of memory blocks, size represents size of each block. This is more
- suitable for allocating memory for arrays.
- * both malloc and calloc return a single contiguous block of memory; calloc
- computes the total size as nmemb * size and zero-fills the whole block.
- * Note: zero value doesn't just mean 0. If we are allocating an array of
- structs, calloc assigns NULL to strings, 0 to ints/floats etc.
- * free: This function deallocates the dynamic memory. Calling free(arr) once the
- memory is no longer needed prevents a memory leak. free MUST be called explicitly after
- the usage of dynamic memory, irrespective of which function is used to create
- it (malloc, calloc, etc.)
- Note:
- In C, you need not (and in fact, should not) cast the return value of malloc/calloc.
- int *arr = calloc(10, sizeof(*arr)); /* do this */
- int *arr = (int *) calloc(10, sizeof(*arr)); /* rather than */
- https://stackoverflow.com/questions/605845/should-i-cast-the-result-of-malloc
- /* calloc and memset */
- calloc() already 0-initializes the memory, so using memset() to do it again is
- pointless (unless you're dealing with a buggy calloc implementation)
- /* a function with an array parameter */
- #include <stdio.h>
- void swap_double(double a[static 2]) {
- double tmp = a[0];
- a[0] = a[1];
- a[1] = tmp;
- }
- int main(void) {
- double A[2] = {1.0, 2.0};
- swap_double(A);
- printf("A[0] = %g, A[1] = %g\n", A[0], A[1]);
- }
- /* format */ Ref: https://www.gnu.org/software/indent/manual/indent.html
- $ doas apt-get install indent
- $ indent -gnu program.c /* --gnu-style (default) */
- $ indent -kr program.c /* --k-and-r-style */
- $ indent -linux program.c /* --linux-style */
- /* splint - Secure Programming LINT */
- $ splint program.c
- $ splint +bounds program.c
- Message Format:
- The line-len and limit flags may be preceded by + or - with the same meaning;
- for the other flags, + turns on the describe printing and - turns it off.
- + show-column // show column number where error is found
- + show-func // show name of function (or macro) definition containing error
- - show-all-conjs // show all possible alternate types
- + hints // provide hints describing an error
- - force-hints // provide hints for all errors reported
- 80 line-len <num> // set length of maximum message line to <number> characters
- 3 indentspaces <num> // set number of spaces to indent sub-messages
- 3 locindentspaces <num> // set number of spaces to indent sub-messages
- - showdeephistory // show all available information about storage mentioned
- - showloadloc // show location information for load files
- - csv // produce comma-separated values (CSV) warnings output file
- - csvoverwrite // overwrite existing CSV output file
- - htmlfileformat // show file locations as links
- + streamoverwrite // warn and exit if a stream output file would overwrite
- /* astyle */
- /* binary, dynamic information-gathering tools */
- $ ldd ./program
- $ objdump --help
- $ objdump -R ./program
- $ strings --help
- $ strace ./program
- $ ltrace ./program
- $ checksec --file=./program
- $ patchelf
- $ one_gadget
- $ ropper
- /* common gcc options */
- binary code optimization:
- -Os Optimize the code to reduce the size of the binary.
- -O1 Turn on basic optimizations. The compiler tries to reduce code
- size and execution time, without performing any optimizations
- that take a great deal of compilation time.
- -O2 Optimize even more. GCC performs nearly all optimizations that
- do not involve a space-speed trade-off. As compared to -O1, this
- option increases both compilation time and the performance.
- -O3 Aggressive optimization. It tries to unroll loops constructs and
- inlines small functions. It can cause unexpected effects in the
- program. The output is usually larger than using -O2.
- -march=native Automatically determines the code generation options to
- optimally exploit your local CPU features. Code may not be
- executable on other machines.
- debugging:
- -g Include the debug symbols in the output. This is necessary for
- tools like gdb, ddd or valgrind.
- -pg Include the profiling information for the GNU profiler.
- Execution in gprof then produces the desired information.
- /* recommended compiler and linker flags for gcc */
- -D_FORTIFY_SOURCE=2 Detect runtime buffer overflows
- -fpie -Wl,-pie Needed to enable full ASLR for executables
- -fpic -shared Disable text relocations for shared libraries
- -g3 Generate abundant debugging information
- -O2 Optimize your code for speed/space efficiency
- -Wall Turn on recommended compiler warnings
- -Werror Turn warnings into errors
- -std=c17 Specify the language standard
- -pedantic Issue warnings demanded by strict conformance to the standard
- /* compiling with math library */
- gcc fdtd.c -lm -o fdtd
- /* compiling with optimization */
- gcc -Wall -pedantic -std=c99 -O2 files-to-compile
- /* compiling for profiling */
- gcc -g -pg -o fdtd fdtd.c -lm
- /* compiling with version 8 of the gcc compiler */
- CFLAGS=-g -O3 -fstrict-aliasing -ftree-vectorize -march=native -mtune=native \
- -fopt-info-vec-optimized
- /* compiling for vectorization */
- gcc fdtd.c -lm -g -O3 -fstrict-aliasing -ftree-vectorize -march=native
- -mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
- -mprefer-vector-width=512 -o fdtd
- gcc fdtd.c -lm -g -O3 -fopenmp -fstrict-aliasing -ftree-vectorize -march=native
- -mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
- -mprefer-vector-width=512 -o fdtd
- Note: same for loop optimizations replace vec with loop
- -fopenmp implies -fopenmp-simd
- -fopenmp-simd == enable handling of OpenMP SIMD directives while other OpenMP
- directives are ignored
- -fopt-info-vec-optimized == list info on vector optimized
- -fopt-info-vec-missed == list info on vector missed
- -fopt-info-vec-all == list info on vector including optimized and missed
- /* compiling with timer.c and timer.h */
- gcc program.c timer.c -o program
- /* profiling with timer.c and timer.h */
- gcc program.c timer.c -pg -o program
- lscpu | egrep --color "mmx|sse|avx"
- /* register names */
- * SSE: xmm0 to xmm15 (128 bits)
- * AVX2: ymm0 to ymm15 (256 bits)
- * AVX512: zmm0 to zmm31 (512 bits)
- In scalar mode, SSE registers are used
- /* floating point instruction names */
- <op><simd or not><raw type>
- where
- * <op> is something like vmul, vadd, vmov or vfmadd
- * <simd or not> is either 's' for scalar or 'p' for packed (i.e. vector)
- * raw type is either 's' for single precision or 'd' for double precision
- Typically:
- vmulss, vmovaps, vaddpd, vfmaddpd
- /* extract assembly code */
- * run objdump -d -C on your executable or library
- * search for your function name
- $ objdump -d -C program | less
- /* check for vectorization */
- * for avx2, look for ymm
- * for avx512, look for zmm
- * otherwise look for instructions with ps or pd at the end
- * but ignore mov operations
- * only concentrate on arithmetic ones
- There are three distinct OpenMP capabilities:
- 1. vectorization through SIMD directives
- 2. CPU threading from the original OpenMP model
- 3. offloading to an accelerator, generally a GPU, through the new target
- directives.
- /* OpenMP */
- Open Multi-Processing (OpenMP), a shared memory programming standard.
- Pragmas are preprocessor statements in C. A pragma indicates to the compiler
- where to initiate OpenMP threads.
- Relaxed memory model - The value of the variables in main memory or caches of
- all the processors are not updated immediately.
- Race condition - A situation where multiple outcomes are possible, and the
- result is dependent on the timing of the contributors.
- Private variable - In the context of OpenMP, a private variable is local and
- only visible to its thread.
- Shared variable - In the context of OpenMP, a shared variable is visible and
- modifiable by any thread.
- Each thread has a private memory in its stack and shares memory in the heap.
- Work sharing - To split the work across a number of threads or processes.
- First touch - The first touch of an array causes the memory to be allocated.
- The memory is allocated near the thread location where the touch occurs. Prior
- to the first touch, the memory only exists as an entry in a virtual memory
- table. The physical memory that corresponds to the virtual memory is created
- when it is first accessed.
- On some computing nodes, blocks of memory are closer to some processors than
- others. This situation is called Non-Uniform Memory Access (NUMA).
- Because OpenMP has a relaxed memory model, an OpenMP barrier or flush operation
- is required for the memory view of a thread to be communicated to other threads.
- A flush operation guarantees that a value moves between two threads, preventing
- race conditions. An OpenMP barrier flushes all the locally modified values and
- synchronizes the threads.
- OpenMP addresses a single node, not multiple nodes with distributed memory
- architectures. Thus, its memory scalability is limited to the memory on the
- node. For parallel applications that have larger memory requirements, OpenMP
- needs to be used in conjunction with a distributed-memory parallel technique.
- There are several ways to control how many threads you have in the parallel
- region. These are
- - Default -- The default is usually the maximum number of threads for the
- node, but it can be different, depending on the compiler and if MPI ranks
- exist.
- - Environment variable -- Set the size with the OMP_NUM_THREADS environment
- variable; for example
- export OMP_NUM_THREADS=16
- - Function call -- Call the OpenMP function omp_set_num_threads, for example
- omp_set_num_threads(16)
- - Pragma -- For example, #pragma omp parallel num_threads(16)
- To compile with GCC, gcc -fopenmp program.c -lm -o program
- where -fopenmp is the compiler flag to turn on OpenMP.
- [Note: master pragma is replaced by masked pragma in new compilers]
- Loop level OpenMP:
- Threads are allocated by cores, and thread binding is enabled using the
- following OpenMP environment variables to reduce the performance variation of
- runs:
- export OMP_PLACES=cores
- export OMP_PROC_BIND=true
- In the vector addition example (listing 7.7), you can see the interaction
- between the three components: OpenMP work-sharing directives, implied variable
- scope, and memory placement by the operating system. These three components are
- necessary for OpenMP program correctness and performance.
- OpenMP SIMD directives for better portability:
- #pragma omp simd // vectorizes the following loop or block of code
- #pragma omp for simd // threads and vectorizes the following loop
- Another important modifier is the collapse clause. It tells the compiler to
- combine nested loops into a single loop for the vectorized implementation. The
- argument to the clause indicates how many loops to collapse:
- #pragma omp simd collapse(2)
- for (int j=0; j<n; j++) {
- for (int i=0; i<n; i++) {
- a[j][i] = 0.0;
- }
- }
- The loops are required to be perfectly nested. Perfectly nested loops only have
- statements in the innermost loop, with no extraneous statements before or after
- each loop block.
- OpenMP SIMD functions:
- We can also vectorize an entire function so that it can be called from within a
- vectorized region of the code.
- #pragma omp declare simd
- double pythagorean(double a, double b) {
- return sqrt(a*a + b*b);
- }
- /* compiling MPI program */
- mpicc program.c -o program
- mpiexec -np 2 ./program
- /* using valgrind Memcheck to find memory issues */
- mpiexec -np 2 valgrind ./program
- /* sum all elements */
- MPI_Allreduce(starting address of send buffer, starting address of receive
- buffer, number of elements in send buffer, datatype of elements of send buffer,
- operation, communicator)
- MPI_Allreduce(sendbuf, recvbuf, count, datatype, operation, communicator)
- Example:
- MPI_Allreduce(wave, rwave, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)
- /* buffers in C */
- general principle: buffer argument is address in memory of the data
- - buffer is void pointer
- - write &x or (void *)&x for scalar
- - write x or (void *)x for array
- /* gprof */
- gprof preparation:
- step 1: Compile and link source code with option -pg:
- $ gcc program.c -pg -o program
- step 2: Run instrumented application
- $ ./program
- step 3a: The Flat Profile shows how much time is spent in each function and how
- often each function was called.
- $ gprof --flat-profile program
- step 3b: The Call Graph shows which functions called each other and how many
- times.
- $ gprof --graph program
- step 3c: Gprof can even annotate your source code. (add option -g at compile)
- $ gprof --annotated-source program
- /* likwid */
- /* gather node arch information */
- likwid-topology - print thread, cache, and memory topology
- $ likwid-topology
- $ likwid-topology -g
- $ likwid-topology -c
- $ likwid-topology -O
- $ likwid-topology -o file.json/txt/xml
- $ likwid-topology -g | less -S
- likwid-powermeter - read out RAPL Energy information and get info about
- Turbo Mode steps
- /* affinity control and data placement */
- likwid-pin - enforce process and thread affinity
- likwid-mpirun - pinning of hybrid MPI/OpenMP applications
- /* query and alter system settings */
- likwid-features - view and toggle feature register on Intel processors
- likwid-setFrequencies - control the CPU and Uncore clock frequencies
- /* performance profiling */
- likwid-perfctr - measure hardware performance counter data
- /* micro benchmarking */
- likwid-memsweeper - cleans up filled NUMA memory domains and evicts dirty
- cacheline from cache hierarchy
- likwid-bench - extensive set of threaded micro-benchmarking kernels
- and rapid prototyping environment for assembly
- benchmark kernels
- # install and configure ndiff [files: ndiff-2.00.zip / ndiff-2.00.tar.gz]
- download ndiff from:
- https://www.math.utah.edu/~beebe/software/ndiff/ndiff-2.00.tar.gz
- $ mkdir ~/.ndiff
- $ tar -xzvf ndiff-2.00.tar.gz -C ~/.ndiff
- $ cd ~/.ndiff
- $ ./configure
- $ make
- $ sudo make install
- /* fixing code up ex post facto */
- The indent program, an excellent GNU utility found on most Linux systems,
- formats source according to given rules. The default settings are for the GNU
- coding style, which is not too pretty. To get the utility to follow the Linux
- kernel style, do
- $ indent -kr -i8 -ts8 -sob -l80 -ss -bs -ps1 <file>
- This instructs the utility to format the code according to the kernel coding
- style.
- /* sparse - semantic parser of source files */
- /* coccinelle - semantic patching tool for C */
- /* plotting with matplotlib */
- ./fdtd > data /* save output to file data */
- ./fdtd.py data /* read data to plot with matplotlib */
- /* compute factorial */
- int factorial(int x) {
- int result = 1; /* accumulate separately so the loop bound x stays fixed */
- for(int i=2; i<=x; i++) {
- result *= i;
- }
- return result;
- }
- Advanced Scientific Computing - https://www3.nd.edu/~zxu2/ACMS40212-S16.html
- apt-get install cmake
- apt-get install numactl
- apt-get install libopenmpi-dev
- why optimized vectorpt.c runs slower than unoptimized vectorst.c ?
- /* mpi programming in google-colab */
- %%writefile program.c
- !ls -l
- %%shell
- mpicc program.c -o program
- mpiexec --allow-run-as-root --oversubscribe -np 8 ./program
- /* GNU C Library: Development Tools [libc-devtools] */
- This package contains development tools shipped by the GNU C Library.
- * memusage, memusagestat: profile a program's memory usage
- * mtrace: interpret the malloc trace log
- * sotruss: trace shared library calls
- * sprof: display shared object profiling data
- /* raylib */
- gcc -o file file.c -lraylib -lm -Iinclude -Llib
- /* branchless return statement */
- int fibonacci(int n) {
- if (n == 1 || n == 2) return 1;
- return fibonacci(n - 2) + fibonacci(n - 1);
- }
- int fibonacci(int n) {
- return (n == 1 || n == 2) ? 1 : (fibonacci(n - 2) + fibonacci(n - 1));
- }
- /* branchless if-else condition */
- // instead of writing
- if (a == 2 && b == 3 && c == 4) {
- doSomething();
- } else {
- doOtherThings();
- }
- // prefer to write like
- (a == 2 && b == 3 && c == 4) && doSomething() || doOtherThings();
- // and in cases, where not sure what the function will return
- (a == 2 && b == 3 && c == 4) && (doSomething() || 1) || doOtherThings();
- /* branchless programming inside for loop */
- int a[50];
- for (int i = 0; i < 50; i++) {
- a[i] = rand() % 100;
- }
- int sum = 0;
- // branching programming
- for (int i = 0; i < 50; i++) {
- if (a[i] < 50) {
- sum += a[i];
- }
- }
- int sum = 0;
- // branchless programming
- for (int i = 0; i < 50; i++) {
- sum += (a[i] < 50) * a[i];
- }
- /* ternary operator optimization */
- Reference: http://www.nynaeve.net/?p=178
- if (condition) {
- var = value1;
- } else {
- var = value2;
- }
- var = condition ? value1 : value2;
- /* absolute value */
- int absolute(int x) {
- return (x < 0) ? -x : x;
- }
- /* minimum value */
- int minimum(int a, int b) {
- return (a < b) ? a : b;
- }
- /* maximum value */
- int maximum(int a, int b) {
- return (a > b) ? a : b;
- }
- /* increment statements */
- x = x + 1; /* regular */
- ++x; /* pre-increment */
- x++; /* post-increment */
- x += 1; /* assignment operator */
- /* find executable files */
- find . -type f -executable
- find . -maxdepth 1 -type f -executable /* don't list all executables recursively */
- /* generate shared libraries */
- gcc -c -Wall -Werror -fpic program.c /* generate file program.o */
- gcc -shared -o program.so program.o /* generate file program.so */
- gcc -Wall -fpic -shared -o program.so program.c /* generate file program.so */
- gcc -Wall -fpic -shared -o program.so program.c -lm /* generate file program.so */
- mpicc -Wall -fpic -shared -o program.so program.c -lm /* generate file program.so */
- Note:
- * object files for the shared library need to be compiled with the -fpic flag
- * pic: position independent code
- * object files for the static library don't need this flag
- * the extension .so indicates that is a shared library
- (also called dynamic-link library or shared object)
- * the advantage of creating a shared library over a static library is that with
- the former the Python interpreter does not need to be recompiled
- /* project directory structure */
- bin/ binaries
- data/ data files
- lib/ libraries / third party libraries
- log/ program logs
- obj/ build object files
- src/ source files
- tmp/ temporary files
- /* structure */
- typedef is used to create an alias name for datatypes i.e. an alias of struct.
- without typedef:
- struct studentData {
- char *name;
- int rollNo;
- int age;
- };
- struct studentData student1;
- student1.name = "Isaac Newton";
- using typedef:
- typedef struct studentData {
- char *name;
- int rollNo;
- int age;
- } student;
- student student1;
- student1.name = "Isaac Newton";
- /* for loop multiple initialization */
- for (i = 1, j = 1; i < 10 && j < 10; i++, j++)
- * It is initializing two variables. Note: both are separated by comma (,).
- * It has two test conditions joined together using AND (&&) logical operator.
- Note: You cannot use multiple test conditions separated by comma, you must
- use logical operator such as && or || to join conditions.
- * It has two variables in increment part. Note: should be separated by comma.
- /* GNU MP */
- Reference: http://web.mit.edu/gnu/doc/html/gmp_4.html
- Initializing Integer objects:
- void mpz_init(MP_INT *integer)
- void mpz_clear(MP_INT *integer)
- void *_mpz_realloc(MP_INT *integer, mp_size new_alloc)
- void mpz_array_init(MP_INT integer_array[], size_t array_size, mp_size fixed_num_limbs)
- Integer assignment functions:
- void mpz_set(MP_INT *dest_integer, MP_INT *src_integer)
- void mpz_set_ui(MP_INT *integer, unsigned long int initial_value)
- void mpz_set_si(MP_INT *integer, signed long int initial_value)
- int mpz_set_str(MP_INT *integer, char *initial_value, int base)
- Combined initialization and assignment functions:
- void mpz_init_set(MP_INT *dest_integer, MP_INT *src_integer)
- void mpz_init_set_ui(MP_INT *dest_integer, unsigned long int src_ulong)
- void mpz_init_set_si(MP_INT *dest_integer, signed long int src_slong)
- int mpz_init_set_str(MP_INT *dest_integer, char *src_cstring, int base)
- Conversion functions:
|