clangref.txt 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840
  1. Scientific Programming and Computer Architecture -
  2. https://divakarvi.github.io/bk-spca/spca.html
  3. Parallel processing in C -
  4. https://berkeley-scf.github.io/tutorial-parallelization/parallel-C
  5. Parallel Programming for Multicore Machines using OpenMP and MPI -
  6. https://ocw.mit.edu/courses/12-950-parallel-programming-for-multicore-machines-using-
  7. openmp-and-mpi-january-iap-2010/pages/syllabus/
  8. The GNU C Library Reference Manual -
  9. https://www.gnu.org/software/libc/manual/html_node/index.html#SEC_Contents
  10. /* compile */
  11. gcc -ansi -pedantic -Wall -O
  12. /* libraries for scientific computing */
  13. * GMP for integer and rational arithmetic:
  14. http://gmplib.org
  15. * MPFR for correctly rounded real floating-point arithmetic:
  16. http://www.mpfr.org
  17. * MPC for correctly rounded complex floating-point arithmetic:
  18. http://mpc.multiprecision.org
  19. /* keywords */
  20. Keywords are the words whose meaning has already been explained to the C
  21. compiler. There are only 32 keywords in ANSI C (C89); later standards add
  22. more. The keywords are also called 'Reserved words'.
  23. auto double int struct
  24. break else long switch
  25. case enum register typedef
  26. char extern return union
  27. const float short unsigned
  28. continue for signed void
  29. default goto sizeof volatile
  30. do if static while
  31. /* data types */
  32. Data Type Size (bytes) Range Format Specifier
  33. ---------------------------------------------------------------------------------------
  34. short int 2 -32,768 to 32,767 %hd
  35. unsigned short int 2 0 to 65,535 %hu
  36. unsigned int 4 0 to 4,294,967,295 %u
  37. int 4 -2,147,483,648 to 2,147,483,647 %d
  38. long int 4 -2,147,483,648 to 2,147,483,647 %ld
  39. unsigned long int 4 0 to 4,294,967,295 %lu
  40. long long int 8 -(2^63) to (2^63)-1 %lld
  41. unsigned long long int 8 0 to 18,446,744,073,709,551,615 %llu
  42. signed char 1 -128 to 127 %c
  43. unsigned char 1 0 to 255 %c
  44. float 4 1.2E-38 to 3.4E+38 %f
  45. double 8 1.7E-308 to 1.7E+308 %lf
  46. long double 16 3.4E-4932 to 1.1E+4932 %Lf
  47. /* mathematical functions */ <math.h>
  48. abs computes absolute value of an integer value
  49. labs computes absolute value of an integer value
  50. llabs computes absolute value of an integer value
  51. fabs computes absolute value of a floating-point value
  52. div computes the quotient and remainder of integer division
  53. ldiv computes the quotient and remainder of integer division
  54. lldiv computes the quotient and remainder of integer division
  55. fmod remainder of the floating-point division operation
  56. remainder signed remainder of the division operation
  57. remquo signed remainder as well as the three last bits of the division
  58. fma fused multiply-add operation
  59. fmax larger of two floating-point values
  60. fmin smaller of two floating-point values
  61. fdim positive difference of two floating-point values
  62. nan returns a NaN (not-a-number)
  63. nanf returns a NaN (not-a-number)
  64. nanl returns a NaN (not-a-number)
  65. exp returns e raised to the given power
  66. exp2 returns 2 raised to the given power
  67. expm1 returns e raised to the given power, minus one
  68. log computes natural logarithm (to base e)
  69. log2 computes binary logarithm (to base 2)
  70. log10 computes common logarithm (to base 10)
  71. log1p computes natural logarithm (to base e) of 1 plus the given number
  72. ilogb extracts exponent of the number
  73. logb extracts exponent of the number
  74. sqrt computes square root
  75. cbrt computes cubic root
  76. hypot computes square root of the sum of the squares of two given numbers
  77. pow raises a number to the given power
  78. sin computes sine
  79. cos computes cosine
  80. tan computes tangent
  81. asin computes arc sine
  82. acos computes arc cosine
  83. atan computes arc tangent
  84. atan2 computes arc tangent, using signs to determine quadrants
  85. sinh computes hyperbolic sine
  86. cosh computes hyperbolic cosine
  87. tanh computes hyperbolic tangent
  88. asinh computes hyperbolic arc sine
  89. acosh computes hyperbolic arc cosine
  90. atanh computes hyperbolic arc tangent
  91. erf computes error function
  92. erfc computes complementary error function
  93. lgamma computes natural logarithm of the absolute value of the gamma function
  94. tgamma computes gamma function
  95. ceil returns the nearest integer not less than the given value
  96. floor returns the nearest integer not greater than the given value
  97. trunc returns the nearest integer not greater in magnitude than given value
  98. round returns the nearest integer rounding away from zero in halfway cases
  99. lround returns the nearest integer rounding away from zero in halfway cases
  100. llround returns the nearest integer rounding away from zero in halfway cases
  101. nearbyint returns the nearest integer using current rounding mode
  102. rint returns the nearest integer using current rounding mode with exception
  103. lrint returns the nearest integer using current rounding mode with exception
  104. llrint returns the nearest integer using current rounding mode with exception
  105. frexp decomposes a number into significand and a power of 2
  106. ldexp multiplies a number by 2 raised to a power
  107. modf decomposes a number into integer and fractional parts
  108. scalbn multiplies a number by FLT_RADIX raised to a power
  109. scalbln multiplies a number by FLT_RADIX raised to a power
  110. nextafter returns next representable floating-point value towards the given value
  111. nexttoward returns next representable floating-point value towards the given value
  112. copysign copies the sign of a floating-point value
  113. fpclassify categorizes the given floating-point value
  114. isfinite checks if the argument has finite value
  115. isinf checks if the argument is infinite
  116. isnan checks if the argument is NaN
  117. isnormal checks if the argument is normal
  118. signbit checks if the sign of the argument is negative
  119. /* floating-point environment */ <fenv.h>
  120. feclearexcept clears exceptions
  121. fegetenv stores current floating-point environment
  122. fegetexceptflag stores current status flags
  123. fegetround retrieves current rounding direction
  124. feholdexcept saves current floating-point environment and clears all exceptions
  125. feraiseexcept raises a floating-point exception
  126. fesetenv sets current floating-point environment
  127. fesetexceptflag sets current status flags
  128. fesetround sets current rounding direction
  129. fetestexcept tests whether certain exceptions have been raised
  130. feupdateenv restores floating-point environment, but keep current exceptions
  131. /* complex numbers */ <complex.h>
  132. cabs computes absolute value
  133. carg computes argument of a complex number
  134. cimag computes imaginary part of a complex number
  135. creal computes real part of a complex number
  136. conj computes complex conjugate
  137. cproj computes complex projection into the Riemann sphere
  138. cexp computes complex exponential
  139. clog computes complex logarithm
  140. csqrt computes complex square root
  141. cpow computes complex power
  142. csin computes complex sine
  143. ccos computes complex cosine
  144. ctan computes complex tangent
  145. casin computes complex arc sine
  146. cacos computes complex arc cosine
  147. catan computes complex arc tangent
  148. csinh computes complex hyperbolic sine
  149. ccosh computes complex hyperbolic cosine
  150. ctanh computes complex hyperbolic tangent
  151. casinh computes complex hyperbolic arc sine
  152. cacosh computes complex hyperbolic arc cosine
  153. catanh computes complex hyperbolic arc tangent
  154. /* random-number generation */ <stdlib.h>
  155. rand generates a pseudo-random number between 0 and RAND_MAX, inclusive
  156. srand initializes a pseudo-random number generator
  157. arc4random generates a pseudo-random number between 0 and UINT32_MAX
  158. arc4random_uniform generates a pseudo-random number between 0 and a maximum value
  159. arc4random_buf fills a buffer with a pseudo-random bitstream
  160. arc4random_stir initializes a pseudo-random number generator
  161. Note: the arc4random algorithm is better than rand
  162. /* width and precision specification */
  163. %d print as decimal integer
  164. %6d print as decimal integer, at least 6 characters wide
  165. %f print as floating point
  166. %6f print as floating point, at least 6 characters wide
  167. %.2f print as floating point, 2 characters after decimal point
  168. %6.0f print as floating point, at least 6 wide and no decimal point
  169. %6.2f print as floating point, at least 6 wide and 2 after decimal point
  170. %o print as octal
  171. %x print as hexadecimal
  172. %c print as character
  173. %s print as character string
  174. %% print as % itself
  175. /* the length of an array A */
  176. (sizeof A)/(sizeof A[0])
  177. /* determine the size of an array */
  178. const char arr[] = "string";
  179. printf("Size of arr %lu\n", (unsigned long)sizeof(arr));
  180. /* static program analysis */
  181. gcc -Wall -pedantic program.c
  182. mpicc -Wall -pedantic program.c
  183. /* finding dynamic memory errors */
  184. The rules for dynamic memory use include these:
  185. * The number of allocation calls (calls to malloc) must exactly match the number
  186. of deallocation calls (calls to free).
  187. * Reads and writes to the allocated memory must occur within the memory, not
  188. outside its range.
  189. * The allocated memory cannot be used before it is allocated or after it is
  190. deallocated.
  191. /* initialize all the elements to 0 */
  192. int arr[5] = {0};
  193. /* use a designated initializer on the range */
  194. int arr[9] = { [0 ... 8] = 10 };
  195. /* setting only array index arr[0], arr[8] as 0,
  196. while the others are designated initialized to 10 */
  197. int arr[9] = { 0, [1 ... 7] = 10, 0 };
  198. /* malloc */
  199. the malloc() function does not initialize the allocated memory to zero
  200. int *arr = (int *) malloc(10 * sizeof(*arr)); /* allocate 10 ints */
  201. double *arr = (double *) malloc(10 * sizeof(*arr)); /* allocate 10 doubles */
  202. assert(arr); /* verify that malloc succeeded */
  203. /* calloc */
  204. the calloc() function initializes the allocated memory to zero and
  205. it's the recommended way to allocate memory for arrays.
  206. int *arr = (int *) calloc(size_t nmemb, size_t size);
  207. int *arr = (int *) calloc(10, sizeof(*arr));
  208. * calloc accepts two arguments, whereas malloc accepts one. nmemb represents
  209. number of memory blocks, size represents size of each block. This is more
  210. suitable for allocating memory for arrays.
  211. * malloc and calloc both return a single contiguous block; calloc sizes it
  212. as nmemb elements of size bytes each and fills it with zeros.
  213. * Note: zero value doesn't just mean 0. If we are allocating an array of
  214. structs, calloc assigns NULL to strings, 0 to ints/floats etc.
  215. * free: This function deallocates the dynamic memory. Calling free(arr) just
  216. before return would prevent the error. free MUST be called explicitly after
  217. the usage of dynamic memory, irrespective of which function is used to create
  218. it (malloc, calloc, etc.)
  219. Note:
  220. In C, you need not (and in fact, should not) cast the return value of malloc/calloc.
  221. int *arr = calloc(10, sizeof(*arr)); /* do this */
  222. int *arr = (int *) calloc(10, sizeof(*arr)); /* rather than */
  223. https://stackoverflow.com/questions/605845/should-i-cast-the-result-of-malloc
  224. /* calloc and memset */
  225. calloc() already 0-initializes the memory, so using memset() to do it again is
  226. pointless (unless you're dealing with a buggy calloc implementation)
  227. /* a function with an array parameter */
  228. #include <stdio.h>
  229. void swap_double(double a[static 2]) {
  230. double tmp = a[0];
  231. a[0] = a[1];
  232. a[1] = tmp;
  233. }
  234. int main(void) {
  235. double A[2] = {1.0, 2.0};
  236. swap_double(A);
  237. printf("A[0] = %g, A[1] = %g\n", A[0], A[1]);
  238. }
  239. /* format */ Ref: https://www.gnu.org/software/indent/manual/indent.html
  240. $ doas apt-get install indent
  241. $ indent -gnu program.c /* --gnu-style (default) */
  242. $ indent -kr program.c /* --k-and-r-style */
  243. $ indent -linux program.c /* --linux-style */
  244. /* splint - Secure Programming LINT */
  245. $ splint program.c
  246. $ splint +bounds program.c
  247. Message Format:
  248. The line-len and limit flags may be preceded by + or - with the same meaning;
  249. for the other flags, + turns on the describe printing and - turns it off.
  250. + show-column // show column number where error is found
  251. + show-func // show name of function (or macro) definition containing error
  252. - show-all-conjs // show all possible alternate types
  253. + hints // provide hints describing an error
  254. - force-hints // provide hints for all errors reported
  255. 80 line-len <num> // set length of maximum message line to <number> characters
  256. 3 indentspaces <num> // set number of spaces to indent sub-messages
  257. 3 locindentspaces <num> // set number of spaces to indent sub-messages
  258. - showdeephistory // show all available information about storage mentioned
  259. - showloadloc // show location information for load files
  260. - csv // produce comma-separated values (CSV) warnings output file
  261. - csvoverwrite // overwrite existing CSV output file
  262. - htmlfileformat // show file locations as links
  263. + streamoverwrite // warn and exit if a stream output file would overwrite
  264. /* astyle */
  265. /* binary, dynamic information-gathering tools */
  266. $ ldd ./program
  267. $ objdump --help
  268. $ objdump -R ./program
  269. $ strings --help
  270. $ strace ./program
  271. $ ltrace ./program
  272. $ checksec --file=./program
  273. $ patchelf
  274. $ one_gadget
  275. $ ropper
  276. /* common gcc options */
  277. binary code optimization:
  278. -Os Optimize the code to reduce the size of the binary.
  279. -O1 Turn on basic optimizations. The compiler tries to reduce code
  280. size and execution time, without performing any optimizations
  281. that take a great deal of compilation time.
  282. -O2 Optimize even more. GCC performs nearly all optimizations that
  283. do not involve a space-speed trade-off. As compared to -O1, this
  284. option increases both compilation time and the performance.
  285. -O3 Aggressive optimization. It tries to unroll loop constructs and
  286. inlines small functions. It can cause unexpected effects in the
  287. program. The output is usually larger than when using -O2.
  288. -march=native Automatically determines the code generation options to
  289. optimally exploit your local CPU features. Code may not be
  290. executable on other machines.
  291. debugging:
  292. -g Include the debug symbols in the output. This is necessary for
  293. tools like gdb, ddd or valgrind.
  294. -pg Include the profiling information for the GNU profiler.
  295. Execution in gprof then produces the desired information.
  296. /* recommended compiler and linker flags for gcc */
  297. -D_FORTIFY_SOURCE=2 Detect runtime buffer overflows
  298. -fpie -Wl,-pie Needed to enable full ASLR for executables
  299. -fpic -shared Disable text relocations for shared libraries
  300. -g3 Generate abundant debugging information
  301. -O2 Optimize your code for speed/space efficiency
  302. -Wall Turn on recommended compiler warnings
  303. -Werror Turn warnings into errors
  304. -std=c17 Specify the language standard
  305. -pedantic Issue warnings demanded by strict conformance to the standard
  306. /* compiling with math library */
  307. gcc fdtd.c -lm -o fdtd
  308. /* compiling with optimization */
  309. gcc -Wall -pedantic -std=c99 -O2 files-to-compile
  310. /* compiling for profiling */
  311. gcc -g -pg -o fdtd fdtd.c -lm
  312. /* compiling with version 8 of the gcc compiler */
  313. CFLAGS=-g -O3 -fstrict-aliasing -ftree-vectorize -march=native -mtune=native \
  314. -fopt-info-vec-optimized
  315. /* compiling for vectorization */
  316. gcc fdtd.c -lm -g -O3 -fstrict-aliasing -ftree-vectorize -march=native
  317. -mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
  318. -mprefer-vector-width=512 -o fdtd
  319. gcc fdtd.c -lm -g -O3 -fopenmp -fstrict-aliasing -ftree-vectorize -march=native
  320. -mtune=native -fopt-info-vec-optimized -fno-trapping-math -fno-math-errno
  321. -mprefer-vector-width=512 -o fdtd
  322. Note: for loop optimizations, replace vec with loop (e.g. -fopt-info-loop-optimized)
  323. -fopenmp implies -fopenmp-simd
  324. -fopenmp-simd == enable handling of OpenMP SIMD directives while other OpenMP
  325. directives are ignored
  326. -fopt-info-vec-optimized == list info on vector optimized
  327. -fopt-info-vec-missed == list info on vector missed
  328. -fopt-info-vec-all == list info on vector including optimized and missed
  329. /* compiling with timer.c and timer.h */
  330. gcc program.c timer.c -o program
  331. /* profiling with timer.c and timer.h */
  332. gcc program.c timer.c -pg -o program
  333. lscpu | egrep --color "mmx|sse|avx"
  334. /* register names */
  335. * SSE: xmm0 to xmm15 (128 bits)
  336. * AVX2: ymm0 to ymm15 (256 bits)
  337. * AVX512: zmm0 to zmm31 (512 bits)
  338. In scalar mode, SSE registers are used
  339. /* floating point instruction names */
  340. <op><simd or not><raw type>
  341. where
  342. * <op> is something like vmul, vadd, vmov or vfmadd
  343. * <simd or not> is either 's' for scalar or 'p' for packed (i.e. vector)
  344. * raw type is either 's' for single precision or 'd' for double precision
  345. Typically:
  346. vmulss, vmovaps, vaddpd, vfmaddpd
  347. /* extract assembly code */
  348. * run objdump -d -C on your executable or library
  349. * search for your function name
  350. $ objdump -d -C program | less
  351. /* check for vectorization */
  352. * for avx2, look for ymm
  353. * for avx512, look for zmm
  354. * otherwise look for instructions with ps or pd at the end
  355. * but ignore mov operations
  356. * only concentrate on arithmetic ones
  357. There are three distinct OpenMP capabilities:
  358. 1. vectorization through SIMD directives
  359. 2. CPU threading from the original OpenMP model
  360. 3. offloading to an accelerator, generally a GPU, through the new target
  361. directives.
  362. /* OpenMP */
  363. Open Multi-Processing (OpenMP), a shared memory programming standard.
  364. Pragmas are preprocessor statements in C. A pragma indicates to the compiler
  365. where to initiate OpenMP threads.
  366. Relaxed memory model - The value of the variables in main memory or caches of
  367. all the processors are not updated immediately.
  368. Race condition - A situation where multiple outcomes are possible, and the
  369. result is dependent on the timing of the contributors.
  370. Private variable - In the context of OpenMP, a private variable is local and
  371. only visible to its thread.
  372. Shared variable - In the context of OpenMP, a shared variable is visible and
  373. modifiable by any thread.
  374. Each thread has a private memory in its stack and shares memory in the heap.
  375. Work sharing - To split the work across a number of threads or processes.
  376. First touch - The first touch of an array causes the memory to be allocated.
  377. The memory is allocated near the thread location where the touch occurs. Prior
  378. to the first touch, the memory only exists as an entry in a virtual memory
  379. table. The physical memory that corresponds to the virtual memory is created
  380. when it is first accessed.
  381. On some computing nodes, blocks of memory are closer to some processors than
  382. others. This situation is called Non-Uniform Memory Access (NUMA).
  383. Because OpenMP has a relaxed memory model, an OpenMP barrier or flush operation
  384. is required for the memory view of a thread to be communicated to other threads.
  385. A flush operation guarantees that a value moves between two threads, preventing
  386. race conditions. An OpenMP barrier flushes all the locally modified values and
  387. synchronizes the threads.
  388. OpenMP addresses a single node, not multiple nodes with distributed memory
  389. architectures. Thus, its memory scalability is limited to the memory on the
  390. node. For parallel applications that have larger memory requirements, OpenMP
  391. needs to be used in conjunction with a distributed-memory parallel technique.
  392. There are several ways to control how many threads you have in the parallel
  393. region. These are
  394. - Default -- The default is usually the maximum number of threads for the
  395. node, but it can be different, depending on the compiler and if MPI ranks
  396. exist.
  397. - Environment variable -- Set the size with the OMP_NUM_THREADS environment
  398. variable; for example
  399. export OMP_NUM_THREADS=16
  400. - Function call -- Call the OpenMP function omp_set_num_threads, for example
  401. omp_set_num_threads(16)
  402. - Pragma -- For example, #pragma omp parallel num_threads(16)
  403. To compile with GCC, gcc -fopenmp program.c -lm -o program
  404. where -fopenmp is the compiler flag to turn on OpenMP.
  405. [Note: master pragma is replaced by masked pragma in new compilers]
  406. Loop level OpenMP:
  407. Threads are allocated by cores, and thread binding is enabled using the
  408. following OpenMP environment variables to reduce the performance variation of
  409. runs:
  410. export OMP_PLACES=cores
  411. export OMP_PROC_BIND=true
  412. In the vector addition example (listing 7.7), you can see the interaction
  413. between the three components: OpenMP work-sharing directives, implied variable
  414. scope, and memory placement by the operating system. These three components are
  415. necessary for OpenMP program correctness and performance.
  416. OpenMP SIMD directives for better portability:
  417. #pragma omp simd // vectorizes the following loop or block of code
  418. #pragma omp for simd // threads and vectorizes the following loop
  419. Another important modifier is the collapse clause. It tells the compiler to
  420. combine nested loops into a single loop for the vectorized implementation. The
  421. argument to the clause indicates how many loops to collapse:
  422. #pragma omp simd collapse(2)
  423. for (int j=0; j<n; j++) {
  424. for (int i=0; i<n; i++) {
  425. a[j][i] = 0.0;
  426. }
  427. }
  428. The loops are required to be perfectly nested. Perfectly nested loops only have
  429. statements in the innermost loop, with no extraneous statements before or after
  430. each loop block.
  431. OpenMP SIMD functions:
  432. We can also vectorize an entire function so that it can be called from within a
  433. vectorized region of the code.
  434. #pragma omp declare simd
  435. double pythagorean(double a, double b) {
  436. return sqrt(a*a + b*b)
  437. }
  438. /* compiling MPI program */
  439. mpicc program.c -o program
  440. mpiexec -np 2 ./program
  441. /* using valgrind Memcheck to find memory issues */
  442. mpiexec -np 2 valgrind ./program
  443. /* sum all elements */
  444. MPI_Allreduce(starting address of send buffer, starting address of receive
  445. buffer, number of elements in send buffer, datatype of elements of send buffer,
  446. operation, communicator)
  447. MPI_Allreduce(sendbuf, recvbuf, count, datatype, operation, communicator)
  448. Example:
  449. MPI_Allreduce(wave, rwave, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)
  450. /* buffers in C */
  451. general principle: buffer argument is address in memory of the data
  452. - buffer is void pointer
  453. - write &x or (void *)&x for scalar
  454. - write x or (void *)x for array
  455. /* gprof */
  456. gprof preparation:
  457. step 1: Compile and link source code with option -pg:
  458. $ gcc program.c -pg -o program
  459. step 2: Run instrumented application
  460. $ ./program
  461. step 3a: The Flat Profile shows how much time is spent in each function and how
  462. often each function was called.
  463. $ gprof --flat-profile program
  464. step 3b: The Call Graph shows which functions called each other and how many
  465. times.
  466. $ gprof --graph program
  467. step 3c: Gprof can even annotate your source code. (add option -g at compile)
  468. $ gprof --annotated-source program
  469. /* likwid */
  470. /* gather node arch information */
  471. likwid-topology - print thread, cache, and memory topology
  472. $ likwid-topology
  473. $ likwid-topology -g
  474. $ likwid-topology -c
  475. $ likwid-topology -O
  476. $ likwid-topology -o file.json/txt/xml
  477. $ likwid-topology -g | less -S
  478. likwid-powermeter - read out RAPL Energy information and get info about
  479. Turbo Mode steps
  480. /* affinity control and data placement */
  481. likwid-pin - enforce process and thread affinity
  482. likwid-mpirun - pinning of hybrid MPI/OpenMP applications
  483. /* query and alter system settings */
  484. likwid-features - view and toggle feature register on Intel processors
  485. likwid-setFrequencies - read and set the clock frequency of CPU cores and
  486. the Uncore
  487. /* performance profiling */
  488. likwid-perfctr - measure hardware performance counter data
  489. /* micro benchmarking */
  490. likwid-memsweeper - cleans up filled NUMA memory domains and evicts dirty
  491. cacheline from cache hierarchy
  492. likwid-bench - extensive set of threaded micro-benchmarking kernels
  493. and rapid prototyping environment for assembly
  494. benchmark kernels
  495. # install and configure ndiff [files: ndiff-2.00.zip / ndiff-2.00.tar.gz]
  496. download ndiff from:
  497. https://www.math.utah.edu/~beebe/software/ndiff/ndiff-2.00.tar.gz
  498. $ mkdir ~/.ndiff
  499. $ tar -xzvf ndiff-2.00.tar.gz -C ~/.ndiff
  500. $ cd ~/.ndiff
  501. $ ./configure
  502. $ make
  503. $ sudo make install
  504. /* fixing code up ex post facto */
  505. The indent program, an excellent GNU utility found on most Linux systems,
  506. formats source according to given rules. The default settings are for the GNU
  507. coding style, which is not too pretty. To get the utility to follow the Linux
  508. kernel style, do
  509. $ indent -kr -i8 -ts8 -sob -l80 -ss -bs -ps1 <file>
  510. This instructs the utility to format the code according to the kernel coding
  511. style.
  512. /* sparse - semantic parser of source files */
  513. /* coccinelle - semantic patching tool for C */
  514. /* plotting with matplotlib */
  515. ./fdtd > data /* save output to file data */
  516. ./fdtd.py data /* read data to plot with matplotlib */
  517. /* compute factorial */
  518. int factorial(int x) {
  519. for(int i=1; i<x; i++) {
  520. x *= i;
  521. }
  522. return x;
  523. }
  524. Advanced Scientific Computing - https://www3.nd.edu/~zxu2/ACMS40212-S16.html
  525. apt-get install cmake
  526. apt-get install numactl
  527. apt-get install libopenmpi-dev
  528. why optimized vectorpt.c runs slower than unoptimized vectorst.c ?
  529. /* mpi programming in google-colab */
  530. %%writefile program.c
  531. !ls -l
  532. %%shell
  533. mpicc program.c -o program
  534. mpiexec --allow-run-as-root --oversubscribe -np 8 ./program
  535. /* GNU C Library: Development Tools [libc-devtools] */
  536. This package contains development tools shipped by the GNU C Library.
  537. * memusage, memusagestat: profile a program's memory usage
  538. * mtrace: interpret the malloc trace log
  539. * sotruss: trace shared library calls
  540. * sprof: display shared object profiling data
  541. /* raylib */
  542. gcc -o file file.c -lraylib -lm -Iinclude -Llib
  543. /* branchless return statement */
  544. int fibonacci(int n) {
  545. if (n == 1 || n == 2) return 1;
  546. return fibonacci(n - 2) + fibonacci(n - 1);
  547. }
  548. int fibonacci(int n) {
  549. return (n == 1 || n == 2) ? 1 : (fibonacci(n - 2) + fibonacci(n - 1));
  550. }
  551. /* branchless if-else condition */
  552. // instead of writing
  553. if (a == 2 && b == 3 && c == 4) {
  554. doSomething();
  555. } else {
  556. doOtherThings();
  557. }
  558. // prefer to write like
  559. (a == 2 && b == 3 && c == 4) && doSomething() || doOtherThings();
  560. // and in cases, where not sure what the function will return
  561. (a == 2 && b == 3 && c == 4) && (doSomething() || 1) || doOtherThings();
  562. /* branchless programming inside for loop */
  563. int a[50];
  564. for (int i = 0; i < 50; i++) {
  565. a[i] = rand() % 100;
  566. }
  567. int sum = 0;
  568. // branching programming
  569. for (int i = 0; i < 50; i++) {
  570. if (a[i] < 50) {
  571. sum += a[i];
  572. }
  573. }
  574. int sum = 0;
  575. // branchless programming
  576. for (int i = 0; i < 50; i++) {
  577. sum += (a[i] < 50) * a[i];
  578. }
  579. /* ternary operator optimization */
  580. Reference: http://www.nynaeve.net/?p=178
  581. if (condition) {
  582. var = value1;
  583. } else {
  584. var = value2;
  585. }
  586. var = condition ? value1 : value2;
  587. /* absolute value */
  588. int absolute(int x) {
  589. return (x < 0) ? -x : x;
  590. }
  591. /* minimum value */
  592. int minimum(int a, int b) {
  593. return (a < b) ? a : b;
  594. }
  595. /* maximum value */
  596. int maximum(int a, int b) {
  597. return (a > b) ? a : b;
  598. }
  599. /* increment statements */
  600. x = x + 1; /* regular */
  601. ++x; /* pre-increment */
  602. x++; /* post-increment */
  603. x += 1; /* assignment operator */
  604. /* find executable files */
  605. find . -type f -executable
  606. find . -maxdepth 1 -type f -executable /* don't list all executables recursively */
  607. /* generate shared libraries */
  608. gcc -c -Wall -Werror -fpic program.c /* generate file program.o */
  609. gcc -shared -o program.so program.o /* generate file program.so */
  610. gcc -Wall -fpic -shared -o program.so program.c /* generate file program.so */
  611. gcc -Wall -fpic -shared -o program.so program.c -lm /* generate file program.so */
  612. mpicc -Wall -fpic -shared -o program.so program.c -lm /* generate file program.so */
  613. Note:
  614. * object files for the shared library need to be compiled with the -fpic flag
  615. * pic: position independent code
  616. * object files for the static library don't need this flag
  617. * the extension .so indicates that is a shared library
  618. (also called dynamic-link library or shared object)
  619. * the advantage of creating a shared library over a static library is that in
  620. the former the Python interpreter need not be recompiled
  621. /* project directory structure */
  622. bin/ binaries
  623. data/ data files
  624. lib/ libraries / third party libraries
  625. log/ program logs
  626. obj/ build object files
  627. src/ source files
  628. tmp/ temporary files
  629. /* structure */
  630. typedef is used to create an alias name for datatypes i.e. an alias of struct.
  631. without typedef:
  632. struct studentData {
  633. char *name;
  634. int rollNo;
  635. int age;
  636. };
  637. struct studentData student1;
  638. student1.name = "Isaac Newton";
  639. using typedef:
  640. typedef struct studentData {
  641. char *name;
  642. int rollNo;
  643. int age;
  644. } student;
  645. student student1;
  646. student1.name = "Isaac Newton";
  647. /* for loop multiple initialization */
  648. for (i = 1, j = 1; i < 10 && j < 10; i++, j++)
  649. * It is initializing two variables. Note: both are separated by comma (,).
  650. * It has two test conditions joined together using AND (&&) logical operator.
  651. Note: You cannot use multiple test conditions separated by comma, you must
  652. use logical operator such as && or || to join conditions.
  653. * It has two variables in increment part. Note: should be separated by comma.
  654. /* GNU MP */
  655. Reference: http://web.mit.edu/gnu/doc/html/gmp_4.html
  656. Initializing Integer objects:
  657. void mpz_init(MP_INT *integer)
  658. void mpz_clear(MP_INT *integer)
  659. void *_mpz_realloc(MP_INT *integer, mp_size new_alloc)
  660. void mpz_array_init(MP_INT integer_array[], size_t array_size, mp_size fixed_num_limbs)
  661. Integer assignment functions:
  662. void mpz_set(MP_INT *dest_integer, MP_INT *src_integer)
  663. void mpz_set_ui(MP_INT *integer, unsigned long int initial_value)
  664. void mpz_set_si(MP_INT *integer, signed long int initial_value)
  665. int mpz_set_str(MP_INT *integer, char *initial_value, int base)
  666. Combined initialization and assignment functions:
  667. void mpz_init_set(MP_INT *dest_integer, MP_INT *src_integer)
  668. void mpz_init_set_ui(MP_INT *dest_integer, unsigned long int src_ulong)
  669. void mpz_init_set_si(MP_INT *dest_integer, signed long int src_slong)
  670. int mpz_init_set_str(MP_INT *dest_integer, char *src_cstring, int base)
  671. Conversion functions: