dsaravanan
/
reference


			
Scientific Programming and Computer Architecture -
https://divakarvi.github.io/bk-spca/spca.html
Parallel processing in C -
https://berkeley-scf.github.io/tutorial-parallelization/parallel-C
Parallel Programming for Multicore Machines using OpenMP and MPI -
https://ocw.mit.edu/courses/12-950-parallel-programming-for-multicore-machines-using-
openmp-and-mpi-january-iap-2010/pages/syllabus/
The GNU C Library Reference Manual -
https://www.gnu.org/software/libc/manual/html_node/index.html#SEC_Contents

/* compile */
gcc -ansi -pedantic -Wall -O

/* libraries for scientific computing */
* GMP for integer and rational arithmetic:
    http://gmplib.org
* MPFR for correctly rounded real floating-point arithmetic:
    http://www.mpfr.org
* MPC for correctly rounded complex floating-point arithmetic:
    http://mpc.multiprecision.org

/* keywords */
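Keywords are words whose meaning is already defined by the C language, so they
cannot be used as identifiers. ANSI C (C89/C90) has the 32 keywords listed
below; later standards add more (C99: inline, restrict, _Bool, _Complex,
_Imaginary). Keywords are also called 'reserved words'.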

auto        double      int         struct
break       else        long        switch
case        enum        register    typedef
char        extern      return      union
const       float       short       unsigned
continue    for         signed      void
default     goto        sizeof      volatile
do          if          static      while


/* data types */
Data Type               Size (bytes)              Range               Format Specifier
---------------------------------------------------------------------------------------
short int                   2               -32,768 to 32,767                %hd
unsigned short int          2                   0 to 65,535                  %hu
unsigned int                4               0 to 4,294,967,295               %u
int                         4       -2,147,483,648 to 2,147,483,647          %d
long int                    4       -2,147,483,648 to 2,147,483,647          %ld
unsigned long int           4               0 to 4,294,967,295               %lu
long long int               8               -(2^63) to (2^63)-1              %lld
unsigned long long int      8       0 to 18,446,744,073,709,551,615          %llu
signed char                 1                   -128 to 127                  %c
unsigned char               1                      0 to 255                  %c
float                       4                1.2E-38 to 3.4E+38              %f
double                      8               1.7E-308 to 1.7E+308             %lf
long double                16              3.4E-4932 to 1.1E+4932            %Lf


/* mathematical functions */ <math.h> (abs, labs, llabs, div, ldiv, lldiv: <stdlib.h>)
abs             computes absolute value of an integer value
labs            computes absolute value of an integer value
llabs           computes absolute value of an integer value
fabs            computes absolute value of a floating-point value
div             computes the quotient and remainder of integer division
ldiv            computes the quotient and remainder of integer division
lldiv           computes the quotient and remainder of integer division
fmod            remainder of the floating-point division operation
remainder       signed remainder of the division operation
remquo          signed remainder as well as the three last bits of the division
fma             fused multiply-add operation
fmax            larger of two floating-point values
fmin            smaller of two floating-point values
fdim            positive difference of two floating-point values
nan             returns a NaN (not-a-number)
nanf            returns a NaN (not-a-number)
nanl            returns a NaN (not-a-number)
exp             returns e raised to the given power
exp2            returns 2 raised to the given power
expm1           returns e raised to the given power, minus one
log             computes natural logarithm (to base e)
log2            computes binary logarithm (to base 2)
log10           computes common logarithm (to base 10)
log1p           computes natural logarithm (to base e) of 1 plus the given number
ilogb           extracts exponent of the number
logb            extracts exponent of the number
sqrt            computes square root
cbrt            computes cubic root
hypot           computes square root of the sum of the squares of two given numbers
pow             raises a number to the given power
sin             computes sine
cos             computes cosine
tan             computes tangent
asin            computes arc sine
acos            computes arc cosine
atan            computes arc tangent
atan2           computes arc tangent, using signs to determine quadrants
sinh            computes hyperbolic sine
cosh            computes hyperbolic cosine
tanh            computes hyperbolic tangent
asinh           computes hyperbolic arc sine
acosh           computes hyperbolic arc cosine
atanh           computes hyperbolic arc tangent
erf             computes error function
erfc            computes complementary error function
lgamma          computes natural logarithm of the absolute value of the gamma function
tgamma          computes gamma function
ceil            returns the nearest integer not less than the given value
floor           returns the nearest integer not greater than the given value
trunc           returns the nearest integer not greater in magnitude than given value
round           returns the nearest integer rounding away from zero in halfway cases
lround          returns the nearest integer rounding away from zero in halfway cases
llround         returns the nearest integer rounding away from zero in halfway cases
nearbyint       returns the nearest integer using current rounding mode
rint            returns the nearest integer using current rounding mode with exception
lrint           returns the nearest integer using current rounding mode with exception
llrint          returns the nearest integer using current rounding mode with exception
frexp           decomposes a number into significand and a power of 2
ldexp           multiplies a number by 2 raised to a power
modf            decomposes a number into integer and fractional parts
scalbn          multiplies a number by FLT_RADIX raised to a power
scalbln         multiplies a number by FLT_RADIX raised to a power
nextafter       returns next representable floating-point value towards the given value
nexttoward      returns next representable floating-point value towards the given value
copysign        copies the sign of a floating-point value
fpclassify      categorizes the given floating-point value
isfinite        checks if the argument has finite value
isinf           checks if the argument is infinite
isnan           checks if the argument is NaN
isnormal        checks if the argument is normal
signbit         checks if the sign of the argument is negative

/* floating-point environment */ <fenv.h>
feclearexcept       clears exceptions
fegetenv            stores current floating-point environment
fegetexceptflag     stores current status flags
fegetround          retrieves current rounding direction
feholdexcept        saves current floating-point environment and clears all exceptions
feraiseexcept       raises a floating-point exception
fesetenv            sets current floating-point environment
fesetexceptflag     sets current status flags
fesetround          sets current rounding direction
fetestexcept        tests whether certain exceptions have been raised
feupdateenv         restores floating-point environment, but keep current exceptions

/* complex numbers */ <complex.h>
cabs            computes absolute value
carg            computes argument of a complex number
cimag           computes imaginary part of a complex number
creal           computes real part of a complex number
conj            computes complex conjugate
cproj           computes complex projection into the Riemann sphere
cexp            computes complex exponential
clog            computes complex logarithm
csqrt           computes complex square root
cpow            computes complex power
csin            computes complex sine
ccos            computes complex cosine
ctan            computes complex tangent
casin           computes complex arc sine
cacos           computes complex arc cosine
catan           computes complex arc tangent
csinh           computes complex hyperbolic sine
ccosh           computes complex hyperbolic cosine
ctanh           computes complex hyperbolic tangent
casinh          computes complex hyperbolic arc sine
cacosh          computes complex hyperbolic arc cosine
catanh          computes complex hyperbolic arc tangent

/* random-number generation */ <stdlib.h>
rand                generates a pseudo-random number between 0 and RAND_MAX, inclusive
srand               initializes a pseudo-random number generator
arc4random          generates a pseudo-random number between 0 and UINT32_MAX
arc4random_uniform  generates a pseudo-random number between 0 and a maximum value
arc4random_buf      fill a buffer with a pseudo-random bitstream
arc4random_stir     initializes a pseudo-random number generator
Note: the arc4random family produces better-quality random numbers than rand,
but it is not part of ISO C (it originates from BSD).

/* width and precision specification */
%d      print as decimal integer
%6d     print as decimal integer, at least 6 characters wide
%f      print as floating point
%6f     print as floating point, at least 6 characters wide
%.2f    print as floating point, 2 characters after decimal point
%6.0f   print as floating point, at least 6 wide and no decimal point
%6.2f   print as floating point, at least 6 wide and 2 after decimal point
%o      print as octal
%x      print as hexadecimal
%c      print as character
%s      print as character string
%%      print as % itself

/* the length of an array A */
(sizeof A)/(sizeof A[0])

/* determine the size of an array */
const char arr[] = "string";
printf("Size of arr %lu\n", (int)sizeof(arr));

/* static program analysis */
gcc -Wall -pedantic program.c
mpicc -Wall -pedantic program.c

/* finding dynamic memory errors */
The rules for dynamic memory use include these:
* The number of allocation calls (calls to malloc) must exactly match the number
  of deallocation calls (calls to free).
* Reads and writes to the allocated memory must occur within the memory, not
  outside its range.
* The allocated memory cannot be used before it is allocated or after it is
  deallocated.

/* initialize all the elements to 0 */
int arr[5] = {0};

/* use a designated initializer on the range
   (the [first ... last] form is a GNU C extension, not ISO C; -pedantic warns) */
int arr[9] = { [0 ... 8] = 10 };

/* setting only array index arr[0], arr[8] as 0,
while the others are designated initialized to 10 */
int arr[9] = { 0, [1 ... 7] = 10, 0 };

/* malloc */
the malloc() function does not initialize the allocated memory to zero
int *arr = (int *) malloc(10 * sizeof(*arr)); /* allocate 10 ints */
/* sizeof(*arr) is the element size; sizeof(arr) would be the size of the
   pointer itself and only matches sizeof(double) by coincidence */
double *arr = (double *) malloc(10 * sizeof(*arr)); /* allocate 10 doubles */
assert(arr); /* verify that malloc succeeded (no-op when NDEBUG is defined) */

/* calloc */
the calloc() function initializes the allocated memory to zero and
it's the recommended way to allocate memory for arrays.
int *arr = (int *) calloc(size_t nmemb, size_t size);
int *arr = (int *) calloc(10, sizeof(*arr));

* calloc accepts two arguments, whereas malloc accepts one. nmemb represents
  number of memory blocks, size represents size of each block. This is more
  suitable for allocating memory for arrays.

* both malloc and calloc return a single contiguous block of memory; calloc
  computes its size as nmemb * size bytes and zero-fills it.

* Note: calloc sets every byte of the block to zero. On common platforms this
  reads back as 0 for ints, 0.0 for floats and NULL for pointers, but the C
  standard only guarantees the value zero for integer types.

* free: This function deallocates the dynamic memory. Calling free(arr) just
  before return would prevent the error. free MUST be called explicitly after
  the usage of dynamic memory, irrespective of which function is used to create
  it (malloc, calloc, etc.)

Note:
In C, you need not (and in fact, should not) cast the return value of malloc/calloc.
int *arr = calloc(10, sizeof(*arr)); /* do this */
int *arr = (int *) calloc(10, sizeof(*arr)); /* rather than */
https://stackoverflow.com/questions/605845/should-i-cast-the-result-of-malloc

/* calloc and memset */
calloc() already 0-initializes the memory, so using memset() to do it again is
pointless (unless you're dealing with a buggy calloc implementation)

/* a function with an array parameter */
#include <stdio.h>

/* Exchange the two elements of a; a must point to at least 2 doubles. */
void swap_double(double a[static 2]) {
    const double first = a[0];
    const double second = a[1];

    a[0] = second;
    a[1] = first;
}

/* Demo driver: swap a two-element array in place and print the result. */
int main(void) {
    double A[] = {1.0, 2.0};

    swap_double(A);
    printf("A[0] = %g, A[1] = %g\n", A[0], A[1]);
    return 0;
}

/* format */ Ref: https://www.gnu.org/software/indent/manual/indent.html
$ doas apt-get install indent
$ indent -gnu program.c         /* --gnu-style (default) */
$ indent -kr program.c          /* --k-and-r-style */
$ indent -linux program.c       /* --linux-style */

/* splint - Secure Programming LINT */
$ splint program.c
$ splint +bounds program.c

Message Format:
The line-len and limit flags may be preceded by + or - with the same meaning;
for the other flags, + turns on the describe printing and - turns it off.
+ show-column    // show column number where error is found
+ show-func      // show name of function (or macro) definition containing error
- show-all-conjs // show all possible alternate types
+ hints          // provide hints describing an error
- force-hints    // provide hints for all errors reported
80 line-len <num>   // set length of maximum message line to <number> characters
3 indentspaces <num>  // set number of spaces to indent sub-messages
3 locindentspaces <num> // set number of spaces to indent sub-messages
- showdeephistory  // show all available information about storage mentioned
- showloadloc      // show location information for load files
- csv              // produce comma-separated values (CSV) warnings output file
- csvoverwrite     // overwrite existing CSV output file
- htmlfileformat   // show file locations as links
+ streamoverwrite  // warn and exit if a stream output file would overwrite

/* astyle */


/* binary, dynamic information-gathering tools */
$ ldd ./program
$ objdump --help
$ objdump -R ./program
$ strings --help
$ strace ./program
$ ltrace ./program
$ checksec --file=./program
$ patchelf
$ one_gadget
$ ropper


/* common gcc options */
binary code optimization:
-Os             Optimize the code to reduce the size of the binary.
-O1             Turn on basic optimizations. The compiler tries to reduce code
                size and execution time, without performing any optimizations
                that take a great deal of compilation time.
-O2             Optimize even more. GCC performs nearly all optimizations that
                do not involve a space-speed trade-off. As compared to -O1, this
                option increases both compilation time and the performance.
-O3             Aggressive optimization. It tries to unroll loop constructs and
                inlines small functions. It can cause unexpected effects in the
                program. The output is usually larger than using -O2.
-march=native   Automatically determines the code generation options to
                optimally exploit your local CPU features. Code may not be
                executable on other machines.

debugging:
-g              Include the debug symbols in the output. This is necessary for
                tools like gdb, ddd or valgrind.
-pg             Include the profiling information for the GNU profiler.
                Execution in gprof then produces the desired information.

/* recommended compiler and linker flags for gcc */
-D_FORTIFY_SOURCE=2     Detect runtime buffer overflows
-fpie -Wl,-pie          Needed to enable full ASLR for executables
-fpic -shared           Disable text relocations for shared libraries
-g3                     Generate abundant debugging information
-O2                     Optimize your code for speed/space efficiency
-Wall                   Turn on recommended compiler warnings
-Werror                 Turn warnings into errors
-std=c17                Specify the language standard
-pedantic               Issue warnings demanded by strict conformance to the standard

/* compiling with math library */
gcc fdtd.c -lm -o fdtd

/* compiling with optimization */
gcc -Wall -pedantic -std=c99 -O2 files-to-compile

/* compiling for profiling */
gcc -g -pg -o fdtd fdtd.c -lm

/* compiling with version 8 of the gcc compiler */
CFLAGS=-g -O3 -fstrict-aliasing -ftree-vectorize -march=native -mtune=native \
       -fopt-info-vec-optimized


/* compiling for vectorization */
gcc fdtd.c -lm -g -O3 -fstrict-aliasing -ftree-vectorize -march=native
-mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
-mprefer-vector-width=512 -o fdtd

gcc fdtd.c -lm -g -O3 -fopenmp -fstrict-aliasing -ftree-vectorize -march=native
-mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
-mprefer-vector-width=512 -o fdtd

Note: the same options exist for loop optimizations; replace vec with loop
(e.g. -fopt-info-loop-optimized)

-fopenmp implies -fopenmp-simd
-fopenmp-simd == enable handling of OpenMP SIMD directives while other OpenMP
                 directives are ignored

-fopt-info-vec-optimized == list info on vector optimized
-fopt-info-vec-missed    == list info on vector missed
-fopt-info-vec-all       == list info on vector including optimized and missed


/* compiling with timer.c and timer.h */
gcc program.c timer.c -o program

/* profiling with timer.c and timer.h */
gcc program.c timer.c -pg -o program


lscpu | egrep --color "mmx|sse|avx"


/* register names */
* SSE: xmm0 to xmm15 (128 bits)
* AVX2: ymm0 to ymm15 (256 bits)
* AVX512: zmm0 to zmm31 (512 bits)
In scalar mode, SSE registers are used

/* floating point instruction names */
<op><simd or not><raw type>
where
    * <op> is something like vmul, vadd, vmov or vfmadd
    * <simd or not> is either 's' for scalar or 'p' for packed (i.e. vector)
    * raw type is either 's' for single precision or 'd' for double precision
Typically:
    vmulss, vmovaps, vaddpd, vfmaddpd


/* extract assembly code */
* run objdump -d -C on your executable or library
* search for your function name
$ objdump -d -C program | less

/* check for vectorization */
* for avx2, look for ymm
* for avx512, look for zmm
* otherwise look for instructions with ps or pd at the end
    * but ignore mov operations
    * only concentrate on arithmetic ones


There are three distinct OpenMP capabilities:
1. vectorization through SIMD directives
2. CPU threading from the original OpenMP model
3. offloading to an accelerator, generally a GPU, through the new target
   directives.


/* OpenMP */
Open Multi-Processing (OpenMP), a shared memory programming standard.
Pragmas are preprocessor statements in C. A pragma indicates to the compiler
where to initiate OpenMP threads.

Relaxed memory model - The value of the variables in main memory or caches of
all the processors are not updated immediately.

Race condition - A situation where multiple outcomes are possible, and the
result is dependent on the timing of the contributors.

Private variable - In the context of OpenMP, a private variable is local and
only visible to its thread.

Shared variable - In the context of OpenMP, a shared variable is visible and
modifiable by any thread.

Each thread has a private memory in its stack and shares memory in the heap.

Work sharing - To split the work across a number of threads or processes.

First touch - The first touch of an array causes the memory to be allocated.
The memory is allocated near the thread location where the touch occurs. Prior
to the first touch, the memory only exists as an entry in a virtual memory
table. The physical memory that corresponds to the virtual memory is created
when it is first accessed.

On some computing nodes, blocks of memory are closer to some processors than
others. This situation is called Non-Uniform Memory Access (NUMA).

Because OpenMP has a relaxed memory model, an OpenMP barrier or flush operation
is required for the memory view of a thread to be communicated to other threads.
A flush operation guarantees that a value moves between two threads, preventing
race conditions. An OpenMP barrier flushes all the locally modified values and
synchronizes the threads.

OpenMP addresses a single node, not multiple nodes with distributed memory
architectures. Thus, its memory scalability is limited to the memory on the
node. For parallel applications that have larger memory requirements, OpenMP
needs to be used in conjunction with a distributed-memory parallel technique.

There are several ways to control how many threads you have in the parallel
region. These are
    - Default -- The default is usually the maximum number of threads for the
      node, but it can be different, depending on the compiler and if MPI ranks
      exist.
    - Environment variable -- Set the size with the OMP_NUM_THREADS environment
      variable; for example

      export OMP_NUM_THREADS=16

    - Function call -- Call the OpenMP function omp_set_num_threads, for example

      omp_set_num_threads(16)

    - Pragma -- For example, #pragma omp parallel num_threads(16)


To compile with GCC, gcc -fopenmp program.c -lm -o program
where -fopenmp is the compiler flag to turn on OpenMP.

[Note: master pragma is replaced by masked pragma in new compilers]

Loop level OpenMP:
Threads are allocated by cores, and thread binding is enabled using the
following OpenMP environment variables to reduce the performance variation of
runs:
export OMP_PLACES=cores
export OMP_PROC_BIND=true

In the vector addition example (listing 7.7), you can see the interaction
between the three components: OpenMP work-sharing directives, implied variable
scope, and memory placement by the operating system. These three components are
necessary for OpenMP program correctness and performance.

OpenMP SIMD directives for better portability:
#pragma omp simd            // vectorizes the following loop or block of code
#pragma omp for simd        // threads and vectorizes the following loop

Another important modifier is the collapse clause. It tells the compiler to
combine nested loops into a single loop for the vectorized implementation. The
argument to the clause indicates how many loops to collapse:

/* collapse is a clause, not a directive on its own: it has to be attached to
   a loop construct. Here it is attached to simd so that both loops are merged
   into one iteration space for vectorization. */
#pragma omp simd collapse(2)
for (int j=0; j<n; j++) {
    for (int i=0; i<n; i++) {
        a[j][i] = 0.0;
    }
}

The loops are required to be perfectly nested. Perfectly nested loops only have
statements in the innermost loop, with no extraneous statements before or after
each loop block.

OpenMP SIMD functions:
We can also vectorize an entire function so that it can be called from within a
vectorized region of the code.

#pragma omp declare simd
double pythagorean(double a, double b) {
    return sqrt(a*a + b*b)
}

/* compiling MPI program */
mpicc program.c -o program
mpiexec -np 2 ./program

/* using valgrind Memcheck to find memory issues */
mpiexec -np 2 valgrind ./program

/* sum all elements */
MPI_Allreduce(starting address of send buffer, starting address of receive
buffer, number of elements in send buffer, datatype of elements of send buffer,
operation, communicator)

MPI_Allreduce(sendbuf, recvbuf, count, datatype, operation, communicator)

Example:
MPI_Allreduce(wave, rwave, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)

/* buffers in C */
general principle: buffer argument is address in memory of the data
    - buffer is void pointer
    - write &x or (void *)&x for scalar
    - write x or (void *)x for array


/* gprof */

gprof preparation:
step 1: Compile and link source code with option -pg:
$ gcc program.c -pg -o program
step 2: Run instrumented application
$ ./program
step 3a: The Flat Profile shows how much time is spent in each function and how
often each function was called.
$ gprof --flat-profile program
step 3b: The Call Graph shows which functions called each other and how many
times.
$ gprof --graph program
step 3c: Gprof can even annotate your source code. (add option -g at compile)
$ gprof --annotated-source program


/* likwid */

/* gather node arch information */
likwid-topology         - print thread, cache, and memory topology

$ likwid-topology
$ likwid-topology -g
$ likwid-topology -c
$ likwid-topology -O
$ likwid-topology -o file.json/txt/xml
$ likwid-topology -g | less -S

likwid-powermeter       - read out RAPL Energy information and get info about
                          Turbo Mode steps

/* affinity control and data placement */
likwid-pin              - enforce process and thread affinity
likwid-mpirun           - pinning of hybrid MPI/OpenMP applications

/* query and alter system settings */
likwid-features         - view and toggle feature register on Intel processors
likwid-setFrequencies   - set the clock frequency and governor of CPU cores

/* performance profiling */
likwid-perfctr          - measure hardware performance counter data

/* micro benchmarking */
likwid-memsweeper       - cleans up filled NUMA memory domains and evicts dirty
                          cacheline from cache hierarchy

likwid-bench            - extensive set of threaded micro-benchmarking kernels
                          and rapid prototyping environment for assembly
                          benchmark kernels


# install and configure ndiff [files: ndiff-2.00.zip / ndiff-2.00.tar.gz]
download ndiff from:
https://www.math.utah.edu/~beebe/software/ndiff/ndiff-2.00.tar.gz
$ mkdir ~/.ndiff
$ tar -xzvf ndiff-2.00.tar.gz -C ~/.ndiff
$ cd ~/.ndiff
$ ./configure
$ make
$ sudo make install

/* fixing code up ex post facto */
The indent program, an excellent GNU utility found on most Linux systems,
formats source according to given rules. The default settings are for the GNU
coding style, which is not too pretty. To get the utility to follow the Linux
kernel style, do
$ indent -kr -i8 -ts8 -sob -l80 -ss -bs -ps1 <file>
This instructs the utility to format the code according to the kernel coding
style.

/* sparse - semantic parser of source files */

/* coccinelle - semantic patching tool for C */

/* plotting with matplotlib */
./fdtd > data       /* save output to file data */
./fdtd.py data      /* read data to plot with matplotlib */

/* compute factorial */
/*
 * Return x! = 1 * 2 * ... * x.
 *
 * The product is accumulated in a separate variable: multiplying into x
 * itself would also move the loop bound, so the loop would run away.
 * For x <= 1 (including 0 and negative input) the result is 1.
 * The result only fits in a 32-bit int for x <= 12.
 */
int factorial(int x) {
    int result = 1;
    for (int i = 2; i <= x; i++) {
        result *= i;
    }
    return result;
}


Advanced Scientific Computing - https://www3.nd.edu/~zxu2/ACMS40212-S16.html

apt-get install cmake
apt-get install numactl
apt-get install libopenmpi-dev

why optimized vectorpt.c runs slower than unoptimized vectorst.c ?

/* mpi programming in google-colab */
%%writefile program.c

!ls -l

%%shell
mpicc program.c -o program
mpiexec --allow-run-as-root --oversubscribe -np 8 ./program

/* GNU C Library: Development Tools [libc-devtools] */
This package contains development tools shipped by the GNU C Library.
* memusage, memusagestat: profile a program's memory usage
* mtrace: interpret the malloc trace log
* sotruss: trace shared library calls
* sprof: display shared object profiling data

/* raylib */
gcc -o file file.c -lraylib -lm -Iinclude -Llib

/* branchless return statement */
/* Return the n-th Fibonacci number (1-based: F(1) = F(2) = 1).
   Non-positive n yields 0 instead of recursing without bound. */
int fibonacci(int n) {
	if (n <= 0) return 0;
	if (n <= 2) return 1;
	return fibonacci(n - 2) + fibonacci(n - 1);
}

/* Single-expression form of the n-th Fibonacci number (F(1) = F(2) = 1).
   For n <= 2 the comparison (n > 0) supplies the result: 1 for n = 1 or 2,
   and 0 for non-positive n, which also stops the recursion there. */
int fibonacci(int n) {
	return (n <= 2) ? (n > 0) : (fibonacci(n - 2) + fibonacci(n - 1));
}

/* branchless if-else condition */
// instead of writing
if (a == 2 && b == 3 && c == 4) {
    doSomething();
} else {
    doOtherThings();
}

// the same dispatch can be written as one short-circuit expression:
// doSomething() runs only when the condition holds, and doOtherThings()
// runs whenever the left side of || evaluates to 0.
// caveat: both functions must return a scalar value (not void), and if
// doSomething() returns 0 then doOtherThings() is called as well.
(a == 2 && b == 3 && c == 4) && doSomething() || doOtherThings();

// when doSomething() may return 0, "|| 1" forces the left side to be true,
// so doOtherThings() is skipped once doSomething() has run
(a == 2 && b == 3 && c == 4) && (doSomething() || 1) || doOtherThings();

/* branchless programming inside for loop */
int a[50];

// fill the array with pseudo-random values in the range 0..99
for (int i = 0; i < 50; i++) {
    a[i] = rand() % 100;
}

int sum = 0;

// branching programming: add a[i] only when it is below 50
for (int i = 0; i < 50; i++) {
    if (a[i] < 50) {
        sum += a[i];
    }
}

int sum = 0;

// branchless programming: (a[i] < 50) evaluates to 1 or 0, so the product
// is either a[i] or 0 and no if statement is needed
for (int i = 0; i < 50; i++) {
    sum += (a[i] < 50) * a[i];
}

/* ternary operator optimization */
Reference: http://www.nynaeve.net/?p=178

if (condition) {
    var = value1;
} else {
    var = value2;
}

var = condition ? value1 : value2;

/* absolute value */
/* Return the magnitude of x: negative values are negated, others pass through. */
int absolute(int x) {
    if (x < 0) {
        return -x;
    }
    return x;
}

/* minimum value */
/* Return the smaller of a and b (b when they are equal). */
int minimum(int a, int b) {
    if (a < b) {
        return a;
    }
    return b;
}

/* maximum value */
/* Return the larger of a and b (b when they are equal). */
int maximum(int a, int b) {
    if (a > b) {
        return a;
    }
    return b;
}

/* increment statements */
x = x + 1;  /* regular */
++x;        /* pre-increment */
x++;        /* post-increment */
x += 1;     /* assignment operator */

/* find executable files */
find . -type f -executable
find . -maxdepth 1 -type f -executable  /* don't list all executables recursively */

/* generate shared libraries */
gcc -c -Wall -Werror -fpic program.c    /* generate file program.o */
gcc -shared -o program.so program.o     /* generate file program.so */

gcc -Wall -fpic -shared -o program.so program.c /* generate file program.so */
gcc -Wall -fpic -lm -shared -o program.so program.c /* generate file program.so */

mpicc -Wall -fpic -lm -shared -o program.so program.c /* generate file program.so */

Note:
* object files for the shared library need to be compiled with the -fpic flag
* pic: position independent code
* object files for the static library don't need this flag
* the extension .so indicates that is a shared library
  (also called dynamic-link library or shared object)
* the advantage of creating a shared library over a static library is that in
  the former the Python interpreter needs not be recompiled


/* project directory structure */
bin/    binaries
data/   data files
lib/    libraries / third party libraries
log/    program logs
obj/    build object files
src/    source files
tmp/    temporary files

/* structure */

typedef is used to create an alias name for datatypes i.e. an alias of struct.

without typedef:
/* Record describing one student; without a typedef the struct keyword is
   required every time the type is named. */
struct studentData {
    char *name;     /* pointer to the student's name string */
    int rollNo;
    int age;
};

struct studentData student1;
student1.name = "Issac Newton";     /* statement needs its terminating ';' */


using typedef:
/* Same record, with the alias 'student' so variables can be declared
   without repeating the struct keyword. */
typedef struct studentData {
    char *name;     /* pointer to the student's name string */
    int rollNo;
    int age;
} student;

student student1;
student1.name = "Issac Newton";     /* statement needs its terminating ';' */

/* for loop multiple initialization */
for (i = 1, j = 1; i < 10 && j < 10; i++, j++)
* It is initializing two variables. Note: both are separated by comma (,).
* It has two test conditions joined together using AND (&&) logical operator.
  Note: You cannot use multiple test conditions separated by comma, you must
  use logical operator such as && or || to join conditions.
* It has two variables in increment part. Note: should be separated by comma.

/* GNU MP */
Reference: http://web.mit.edu/gnu/doc/html/gmp_4.html

Initializing Integer objects:
void mpz_init(MP_INT *integer)
void mpz_clear(MP_INT *integer)
void *_mpz_realloc(MP_INT *integer, mp_size new_alloc)
void mpz_array_init(MP_INT integer_array[], size_t array_size, mp_size fixed_num_limbs)

Integer assignment functions:
void mpz_set(MP_INT *dest_integer, MP_INT *src_integer)
void mpz_set_ui(MP_INT *integer, unsigned long int initial_value)
void mpz_set_si(MP_INT *integer, signed long int initial_value)
int mpz_set_str(MP_INT *integer, char *initial_value, int base)

Combined initialization and assignment functions:
void mpz_init_set(MP_INT *dest_integer, MP_INT *src_integer)
void mpz_init_set_ui(MP_INT *dest_integer, unsigned long int src_ulong)
void mpz_init_set_si(MP_INT *dest_integer, signed long int src_slong)
int mpz_init_set_str(MP_INT *dest_integer, char *src_cstring, int base)

Conversion functions: