g_journal.c 79 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
2944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025
  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause
  3. *
  4. * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  20. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26. * SUCH DAMAGE.
  27. */
  28. #include <sys/param.h>
  29. #include <sys/systm.h>
  30. #include <sys/bio.h>
  31. #include <sys/eventhandler.h>
  32. #include <sys/kernel.h>
  33. #include <sys/kthread.h>
  34. #include <sys/limits.h>
  35. #include <sys/lock.h>
  36. #include <sys/module.h>
  37. #include <sys/malloc.h>
  38. #include <sys/mount.h>
  39. #include <sys/mutex.h>
  40. #include <sys/proc.h>
  41. #include <sys/reboot.h>
  42. #include <sys/sbuf.h>
  43. #include <sys/sched.h>
  44. #include <sys/sysctl.h>
  45. #include <sys/taskqueue.h>
  46. #include <sys/vnode.h>
  47. #ifdef GJ_MEMDEBUG
  48. #include <sys/stack.h>
  49. #include <sys/kdb.h>
  50. #endif
  51. #include <vm/vm.h>
  52. #include <vm/vm_kern.h>
  53. #include <geom/geom.h>
  54. #include <geom/geom_dbg.h>
  55. #include <geom/journal/g_journal.h>
  56. FEATURE(geom_journal, "GEOM journaling support");
  57. /*
  58. * On-disk journal format:
  59. *
  60. * JH - Journal header
  61. * RH - Record header
  62. *
  63. * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%%
  64. * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
  65. * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%%
  66. *
  67. */
  68. CTASSERT(sizeof(struct g_journal_header) <= 512);
  69. CTASSERT(sizeof(struct g_journal_record_header) <= 512);
  70. static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
  71. static struct mtx g_journal_cache_mtx;
  72. MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
  73. const struct g_journal_desc *g_journal_filesystems[] = {
  74. &g_journal_ufs,
  75. NULL
  76. };
  77. SYSCTL_DECL(_kern_geom);
  78. int g_journal_debug = 0;
  79. static u_int g_journal_switch_time = 10;
  80. static u_int g_journal_force_switch = 70;
  81. static u_int g_journal_parallel_flushes = 16;
  82. static u_int g_journal_parallel_copies = 16;
  83. static u_int g_journal_accept_immediately = 64;
  84. static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
  85. static u_int g_journal_do_optimize = 1;
  86. static SYSCTL_NODE(_kern_geom, OID_AUTO, journal,
  87. CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  88. "GEOM_JOURNAL stuff");
  89. SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
  90. "Debug level");
  91. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
  92. &g_journal_switch_time, 0, "Switch journals every N seconds");
  93. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
  94. &g_journal_force_switch, 0, "Force switch when journal is N% full");
  95. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
  96. &g_journal_parallel_flushes, 0,
  97. "Number of flush I/O requests to send in parallel");
  98. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
  99. &g_journal_accept_immediately, 0,
  100. "Number of I/O requests accepted immediately");
  101. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
  102. &g_journal_parallel_copies, 0,
  103. "Number of copy I/O requests to send in parallel");
  104. static int
  105. g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
  106. {
  107. u_int entries;
  108. int error;
  109. entries = g_journal_record_entries;
  110. error = sysctl_handle_int(oidp, &entries, 0, req);
  111. if (error != 0 || req->newptr == NULL)
  112. return (error);
  113. if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
  114. return (EINVAL);
  115. g_journal_record_entries = entries;
  116. return (0);
  117. }
  118. SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
  119. CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
  120. g_journal_record_entries_sysctl, "I",
  121. "Maximum number of entries in one journal record");
  122. SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
  123. &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
  124. static u_long g_journal_cache_used = 0;
  125. static u_long g_journal_cache_limit = 64 * 1024 * 1024;
  126. static u_int g_journal_cache_divisor = 2;
  127. static u_int g_journal_cache_switch = 90;
  128. static u_int g_journal_cache_misses = 0;
  129. static u_int g_journal_cache_alloc_failures = 0;
  130. static u_long g_journal_cache_low = 0;
  131. static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache,
  132. CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  133. "GEOM_JOURNAL cache");
  134. SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
  135. &g_journal_cache_used, 0, "Number of allocated bytes");
  136. static int
  137. g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
  138. {
  139. u_long limit;
  140. int error;
  141. limit = g_journal_cache_limit;
  142. error = sysctl_handle_long(oidp, &limit, 0, req);
  143. if (error != 0 || req->newptr == NULL)
  144. return (error);
  145. g_journal_cache_limit = limit;
  146. g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
  147. return (0);
  148. }
  149. SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
  150. CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0,
  151. g_journal_cache_limit_sysctl, "I",
  152. "Maximum number of allocated bytes");
  153. SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
  154. &g_journal_cache_divisor, 0,
  155. "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
  156. static int
  157. g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
  158. {
  159. u_int cswitch;
  160. int error;
  161. cswitch = g_journal_cache_switch;
  162. error = sysctl_handle_int(oidp, &cswitch, 0, req);
  163. if (error != 0 || req->newptr == NULL)
  164. return (error);
  165. if (cswitch > 100)
  166. return (EINVAL);
  167. g_journal_cache_switch = cswitch;
  168. g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
  169. return (0);
  170. }
  171. SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
  172. CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
  173. g_journal_cache_switch_sysctl, "I",
  174. "Force switch when we hit this percent of cache use");
  175. SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
  176. &g_journal_cache_misses, 0, "Number of cache misses");
  177. SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
  178. &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
  179. static u_long g_journal_stats_bytes_skipped = 0;
  180. static u_long g_journal_stats_combined_ios = 0;
  181. static u_long g_journal_stats_switches = 0;
  182. static u_long g_journal_stats_wait_for_copy = 0;
  183. static u_long g_journal_stats_journal_full = 0;
  184. static u_long g_journal_stats_low_mem = 0;
  185. static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats,
  186. CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  187. "GEOM_JOURNAL statistics");
  188. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
  189. &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
  190. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
  191. &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
  192. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
  193. &g_journal_stats_switches, 0, "Number of journal switches");
  194. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
  195. &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
  196. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
  197. &g_journal_stats_journal_full, 0,
  198. "Number of times journal was almost full.");
  199. SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
  200. &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
  201. static g_taste_t g_journal_taste;
  202. static g_ctl_req_t g_journal_config;
  203. static g_dumpconf_t g_journal_dumpconf;
  204. static g_init_t g_journal_init;
  205. static g_fini_t g_journal_fini;
  206. struct g_class g_journal_class = {
  207. .name = G_JOURNAL_CLASS_NAME,
  208. .version = G_VERSION,
  209. .taste = g_journal_taste,
  210. .ctlreq = g_journal_config,
  211. .dumpconf = g_journal_dumpconf,
  212. .init = g_journal_init,
  213. .fini = g_journal_fini
  214. };
  215. static int g_journal_destroy(struct g_journal_softc *sc);
  216. static void g_journal_metadata_update(struct g_journal_softc *sc);
  217. static void g_journal_start_switcher(struct g_class *mp);
  218. static void g_journal_stop_switcher(void);
  219. static void g_journal_switch_wait(struct g_journal_softc *sc);
  220. #define GJ_SWITCHER_WORKING 0
  221. #define GJ_SWITCHER_DIE 1
  222. #define GJ_SWITCHER_DIED 2
  223. static struct proc *g_journal_switcher_proc = NULL;
  224. static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
  225. static int g_journal_switcher_wokenup = 0;
  226. static int g_journal_sync_requested = 0;
  227. #ifdef GJ_MEMDEBUG
  228. struct meminfo {
  229. size_t mi_size;
  230. struct stack mi_stack;
  231. };
  232. #endif
  233. /*
  234. * We use our own malloc/realloc/free functions, so we can collect statistics
  235. * and force journal switch when we're running out of cache.
  236. */
  237. static void *
  238. gj_malloc(size_t size, int flags)
  239. {
  240. void *p;
  241. #ifdef GJ_MEMDEBUG
  242. struct meminfo *mi;
  243. #endif
  244. mtx_lock(&g_journal_cache_mtx);
  245. if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
  246. g_journal_cache_used + size > g_journal_cache_low) {
  247. GJ_DEBUG(1, "No cache, waking up the switcher.");
  248. g_journal_switcher_wokenup = 1;
  249. wakeup(&g_journal_switcher_state);
  250. }
  251. if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
  252. g_journal_cache_used + size > g_journal_cache_limit) {
  253. mtx_unlock(&g_journal_cache_mtx);
  254. g_journal_cache_alloc_failures++;
  255. return (NULL);
  256. }
  257. g_journal_cache_used += size;
  258. mtx_unlock(&g_journal_cache_mtx);
  259. flags &= ~M_NOWAIT;
  260. #ifndef GJ_MEMDEBUG
  261. p = malloc(size, M_JOURNAL, flags | M_WAITOK);
  262. #else
  263. mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
  264. p = (u_char *)mi + sizeof(*mi);
  265. mi->mi_size = size;
  266. stack_save(&mi->mi_stack);
  267. #endif
  268. return (p);
  269. }
  270. static void
  271. gj_free(void *p, size_t size)
  272. {
  273. #ifdef GJ_MEMDEBUG
  274. struct meminfo *mi;
  275. #endif
  276. KASSERT(p != NULL, ("p=NULL"));
  277. KASSERT(size > 0, ("size=0"));
  278. mtx_lock(&g_journal_cache_mtx);
  279. KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
  280. g_journal_cache_used -= size;
  281. mtx_unlock(&g_journal_cache_mtx);
  282. #ifdef GJ_MEMDEBUG
  283. mi = p = (void *)((u_char *)p - sizeof(*mi));
  284. if (mi->mi_size != size) {
  285. printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
  286. mi->mi_size);
  287. printf("GJOURNAL: Alloc backtrace:\n");
  288. stack_print(&mi->mi_stack);
  289. printf("GJOURNAL: Free backtrace:\n");
  290. kdb_backtrace();
  291. }
  292. #endif
  293. free(p, M_JOURNAL);
  294. }
  295. static void *
  296. gj_realloc(void *p, size_t size, size_t oldsize)
  297. {
  298. void *np;
  299. #ifndef GJ_MEMDEBUG
  300. mtx_lock(&g_journal_cache_mtx);
  301. g_journal_cache_used -= oldsize;
  302. g_journal_cache_used += size;
  303. mtx_unlock(&g_journal_cache_mtx);
  304. np = realloc(p, size, M_JOURNAL, M_WAITOK);
  305. #else
  306. np = gj_malloc(size, M_WAITOK);
  307. bcopy(p, np, MIN(oldsize, size));
  308. gj_free(p, oldsize);
  309. #endif
  310. return (np);
  311. }
  312. static void
  313. g_journal_check_overflow(struct g_journal_softc *sc)
  314. {
  315. off_t length, used;
  316. if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
  317. sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
  318. (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
  319. sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
  320. sc->sc_journal_offset < sc->sc_active.jj_offset)) {
  321. panic("Journal overflow "
  322. "(id = %u joffset=%jd active=%jd inactive=%jd)",
  323. (unsigned)sc->sc_id,
  324. (intmax_t)sc->sc_journal_offset,
  325. (intmax_t)sc->sc_active.jj_offset,
  326. (intmax_t)sc->sc_inactive.jj_offset);
  327. }
  328. if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
  329. length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
  330. used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  331. } else {
  332. length = sc->sc_jend - sc->sc_active.jj_offset;
  333. length += sc->sc_inactive.jj_offset - sc->sc_jstart;
  334. if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
  335. used = sc->sc_journal_offset - sc->sc_active.jj_offset;
  336. else {
  337. used = sc->sc_jend - sc->sc_active.jj_offset;
  338. used += sc->sc_journal_offset - sc->sc_jstart;
  339. }
  340. }
  341. /* Already woken up? */
  342. if (g_journal_switcher_wokenup)
  343. return;
  344. /*
  345. * If the active journal takes more than g_journal_force_switch percent
  346. * of free journal space, we force journal switch.
  347. */
  348. KASSERT(length > 0,
  349. ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
  350. (intmax_t)length, (intmax_t)used,
  351. (intmax_t)sc->sc_active.jj_offset,
  352. (intmax_t)sc->sc_inactive.jj_offset,
  353. (intmax_t)sc->sc_journal_offset));
  354. if ((used * 100) / length > g_journal_force_switch) {
  355. g_journal_stats_journal_full++;
  356. GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
  357. sc->sc_name, (used * 100) / length);
  358. mtx_lock(&g_journal_cache_mtx);
  359. g_journal_switcher_wokenup = 1;
  360. wakeup(&g_journal_switcher_state);
  361. mtx_unlock(&g_journal_cache_mtx);
  362. }
  363. }
  364. static void
  365. g_journal_orphan(struct g_consumer *cp)
  366. {
  367. struct g_journal_softc *sc;
  368. char name[256];
  369. int error;
  370. g_topology_assert();
  371. sc = cp->geom->softc;
  372. strlcpy(name, cp->provider->name, sizeof(name));
  373. GJ_DEBUG(0, "Lost provider %s.", name);
  374. if (sc == NULL)
  375. return;
  376. error = g_journal_destroy(sc);
  377. if (error == 0)
  378. GJ_DEBUG(0, "Journal %s destroyed.", name);
  379. else {
  380. GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
  381. "Destroy it manually after last close.", sc->sc_name,
  382. error);
  383. }
  384. }
  385. static int
  386. g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
  387. {
  388. struct g_journal_softc *sc;
  389. int dcw;
  390. g_topology_assert();
  391. GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
  392. acr, acw, ace);
  393. dcw = pp->acw + acw;
  394. sc = pp->geom->softc;
  395. if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
  396. if (acr <= 0 && acw <= 0 && ace <= 0)
  397. return (0);
  398. else
  399. return (ENXIO);
  400. }
  401. if (pp->acw == 0 && dcw > 0) {
  402. GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
  403. sc->sc_flags &= ~GJF_DEVICE_CLEAN;
  404. g_topology_unlock();
  405. g_journal_metadata_update(sc);
  406. g_topology_lock();
  407. } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
  408. GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
  409. sc->sc_flags |= GJF_DEVICE_CLEAN;
  410. g_topology_unlock();
  411. g_journal_metadata_update(sc);
  412. g_topology_lock();
  413. } */
  414. return (0);
  415. }
  416. static void
  417. g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
  418. {
  419. bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
  420. data += sizeof(GJ_HEADER_MAGIC);
  421. le32enc(data, hdr->jh_journal_id);
  422. data += 4;
  423. le32enc(data, hdr->jh_journal_next_id);
  424. }
  425. static int
  426. g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
  427. {
  428. bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
  429. data += sizeof(hdr->jh_magic);
  430. if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
  431. return (EINVAL);
  432. hdr->jh_journal_id = le32dec(data);
  433. data += 4;
  434. hdr->jh_journal_next_id = le32dec(data);
  435. return (0);
  436. }
  437. static void
  438. g_journal_flush_cache(struct g_journal_softc *sc)
  439. {
  440. struct bintime bt;
  441. int error;
  442. if (sc->sc_bio_flush == 0)
  443. return;
  444. GJ_TIMER_START(1, &bt);
  445. if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
  446. error = g_io_flush(sc->sc_jconsumer);
  447. GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  448. sc->sc_jconsumer->provider->name, error);
  449. }
  450. if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
  451. /*
  452. * TODO: This could be called in parallel with the
  453. * previous call.
  454. */
  455. error = g_io_flush(sc->sc_dconsumer);
  456. GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
  457. sc->sc_dconsumer->provider->name, error);
  458. }
  459. GJ_TIMER_STOP(1, &bt, "Cache flush time");
  460. }
  461. static int
  462. g_journal_write_header(struct g_journal_softc *sc)
  463. {
  464. struct g_journal_header hdr;
  465. struct g_consumer *cp;
  466. u_char *buf;
  467. int error;
  468. cp = sc->sc_jconsumer;
  469. buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  470. strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
  471. hdr.jh_journal_id = sc->sc_journal_id;
  472. hdr.jh_journal_next_id = sc->sc_journal_next_id;
  473. g_journal_header_encode(&hdr, buf);
  474. error = g_write_data(cp, sc->sc_journal_offset, buf,
  475. cp->provider->sectorsize);
  476. /* if (error == 0) */
  477. sc->sc_journal_offset += cp->provider->sectorsize;
  478. gj_free(buf, cp->provider->sectorsize);
  479. return (error);
  480. }
  481. /*
  482. * Every journal record has a header and data following it.
  483. * Functions below are used to decode the header before storing it to
  484. * little endian and to encode it after reading to system endianness.
  485. */
  486. static void
  487. g_journal_record_header_encode(struct g_journal_record_header *hdr,
  488. u_char *data)
  489. {
  490. struct g_journal_entry *ent;
  491. u_int i;
  492. bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
  493. data += sizeof(GJ_RECORD_HEADER_MAGIC);
  494. le32enc(data, hdr->jrh_journal_id);
  495. data += 8;
  496. le16enc(data, hdr->jrh_nentries);
  497. data += 2;
  498. bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
  499. data += 8;
  500. for (i = 0; i < hdr->jrh_nentries; i++) {
  501. ent = &hdr->jrh_entries[i];
  502. le64enc(data, ent->je_joffset);
  503. data += 8;
  504. le64enc(data, ent->je_offset);
  505. data += 8;
  506. le64enc(data, ent->je_length);
  507. data += 8;
  508. }
  509. }
  510. static int
  511. g_journal_record_header_decode(const u_char *data,
  512. struct g_journal_record_header *hdr)
  513. {
  514. struct g_journal_entry *ent;
  515. u_int i;
  516. bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
  517. data += sizeof(hdr->jrh_magic);
  518. if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
  519. return (EINVAL);
  520. hdr->jrh_journal_id = le32dec(data);
  521. data += 8;
  522. hdr->jrh_nentries = le16dec(data);
  523. data += 2;
  524. if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
  525. return (EINVAL);
  526. bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
  527. data += 8;
  528. for (i = 0; i < hdr->jrh_nentries; i++) {
  529. ent = &hdr->jrh_entries[i];
  530. ent->je_joffset = le64dec(data);
  531. data += 8;
  532. ent->je_offset = le64dec(data);
  533. data += 8;
  534. ent->je_length = le64dec(data);
  535. data += 8;
  536. }
  537. return (0);
  538. }
  539. /*
  540. * Function reads metadata from a provider (via the given consumer), decodes
  541. * it to system endianness and verifies its correctness.
  542. */
  543. static int
  544. g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
  545. {
  546. struct g_provider *pp;
  547. u_char *buf;
  548. int error;
  549. g_topology_assert();
  550. error = g_access(cp, 1, 0, 0);
  551. if (error != 0)
  552. return (error);
  553. pp = cp->provider;
  554. g_topology_unlock();
  555. /* Metadata is stored in last sector. */
  556. buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
  557. &error);
  558. g_topology_lock();
  559. g_access(cp, -1, 0, 0);
  560. if (buf == NULL) {
  561. GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
  562. cp->provider->name, error);
  563. return (error);
  564. }
  565. /* Decode metadata. */
  566. error = journal_metadata_decode(buf, md);
  567. g_free(buf);
  568. /* Is this is gjournal provider at all? */
  569. if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
  570. return (EINVAL);
  571. /*
  572. * Are we able to handle this version of metadata?
  573. * We only maintain backward compatibility.
  574. */
  575. if (md->md_version > G_JOURNAL_VERSION) {
  576. GJ_DEBUG(0,
  577. "Kernel module is too old to handle metadata from %s.",
  578. cp->provider->name);
  579. return (EINVAL);
  580. }
  581. /* Is checksum correct? */
  582. if (error != 0) {
  583. GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
  584. cp->provider->name);
  585. return (error);
  586. }
  587. return (0);
  588. }
  589. /*
  590. * Two functions below are responsible for updating metadata.
  591. * Only metadata on the data provider is updated (we need to update
  592. * information about active journal in there).
  593. */
  594. static void
  595. g_journal_metadata_done(struct bio *bp)
  596. {
  597. /*
  598. * There is not much we can do on error except informing about it.
  599. */
  600. if (bp->bio_error != 0) {
  601. GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
  602. bp->bio_error);
  603. } else {
  604. GJ_LOGREQ(2, bp, "Metadata updated.");
  605. }
  606. gj_free(bp->bio_data, bp->bio_length);
  607. g_destroy_bio(bp);
  608. }
  609. static void
  610. g_journal_metadata_update(struct g_journal_softc *sc)
  611. {
  612. struct g_journal_metadata md;
  613. struct g_consumer *cp;
  614. struct bio *bp;
  615. u_char *sector;
  616. cp = sc->sc_dconsumer;
  617. sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  618. strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
  619. md.md_version = G_JOURNAL_VERSION;
  620. md.md_id = sc->sc_id;
  621. md.md_type = sc->sc_orig_type;
  622. md.md_jstart = sc->sc_jstart;
  623. md.md_jend = sc->sc_jend;
  624. md.md_joffset = sc->sc_inactive.jj_offset;
  625. md.md_jid = sc->sc_journal_previous_id;
  626. md.md_flags = 0;
  627. if (sc->sc_flags & GJF_DEVICE_CLEAN)
  628. md.md_flags |= GJ_FLAG_CLEAN;
  629. if (sc->sc_flags & GJF_DEVICE_HARDCODED)
  630. strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
  631. else
  632. bzero(md.md_provider, sizeof(md.md_provider));
  633. md.md_provsize = cp->provider->mediasize;
  634. journal_metadata_encode(&md, sector);
  635. /*
  636. * Flush the cache, so we know all data are on disk.
  637. * We write here informations like "journal is consistent", so we need
  638. * to be sure it is. Without BIO_FLUSH here, we can end up in situation
  639. * where metadata is stored on disk, but not all data.
  640. */
  641. g_journal_flush_cache(sc);
  642. bp = g_alloc_bio();
  643. bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
  644. bp->bio_length = cp->provider->sectorsize;
  645. bp->bio_data = sector;
  646. bp->bio_cmd = BIO_WRITE;
  647. if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
  648. bp->bio_done = g_journal_metadata_done;
  649. g_io_request(bp, cp);
  650. } else {
  651. bp->bio_done = NULL;
  652. g_io_request(bp, cp);
  653. biowait(bp, "gjmdu");
  654. g_journal_metadata_done(bp);
  655. }
  656. /*
  657. * Be sure metadata reached the disk.
  658. */
  659. g_journal_flush_cache(sc);
  660. }
  661. /*
  662. * This is where the I/O request comes from the GEOM.
  663. */
  664. static void
  665. g_journal_start(struct bio *bp)
  666. {
  667. struct g_journal_softc *sc;
  668. sc = bp->bio_to->geom->softc;
  669. GJ_LOGREQ(3, bp, "Request received.");
  670. switch (bp->bio_cmd) {
  671. case BIO_READ:
  672. case BIO_WRITE:
  673. mtx_lock(&sc->sc_mtx);
  674. bioq_insert_tail(&sc->sc_regular_queue, bp);
  675. wakeup(sc);
  676. mtx_unlock(&sc->sc_mtx);
  677. return;
  678. case BIO_GETATTR:
  679. if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
  680. strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
  681. bp->bio_completed = strlen(bp->bio_to->name) + 1;
  682. g_io_deliver(bp, 0);
  683. return;
  684. }
  685. /* FALLTHROUGH */
  686. case BIO_SPEEDUP:
  687. case BIO_DELETE:
  688. default:
  689. g_io_deliver(bp, EOPNOTSUPP);
  690. return;
  691. }
  692. }
  693. static void
  694. g_journal_std_done(struct bio *bp)
  695. {
  696. struct g_journal_softc *sc;
  697. sc = bp->bio_from->geom->softc;
  698. mtx_lock(&sc->sc_mtx);
  699. bioq_insert_tail(&sc->sc_back_queue, bp);
  700. wakeup(sc);
  701. mtx_unlock(&sc->sc_mtx);
  702. }
  703. static struct bio *
  704. g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
  705. int flags)
  706. {
  707. struct bio *bp;
  708. bp = g_alloc_bio();
  709. bp->bio_offset = start;
  710. bp->bio_joffset = joffset;
  711. bp->bio_length = end - start;
  712. bp->bio_cmd = BIO_WRITE;
  713. bp->bio_done = g_journal_std_done;
  714. if (data == NULL)
  715. bp->bio_data = NULL;
  716. else {
  717. bp->bio_data = gj_malloc(bp->bio_length, flags);
  718. if (bp->bio_data != NULL)
  719. bcopy(data, bp->bio_data, bp->bio_length);
  720. }
  721. return (bp);
  722. }
  723. #define g_journal_insert_bio(head, bp, flags) \
  724. g_journal_insert((head), (bp)->bio_offset, \
  725. (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
  726. (bp)->bio_data, flags)
  727. /*
  728. * The function below does a lot more than just inserting bio to the queue.
  729. * It keeps the queue sorted by offset and ensures that there are no doubled
  730. * data (it combines bios where ranges overlap).
  731. *
  732. * The function returns the number of bios inserted (as bio can be splitted).
  733. */
  734. static int
  735. g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
  736. u_char *data, int flags)
  737. {
  738. struct bio *nbp, *cbp, *pbp;
  739. off_t cstart, cend;
  740. u_char *tmpdata;
  741. int n;
  742. GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
  743. joffset);
  744. n = 0;
  745. pbp = NULL;
  746. GJQ_FOREACH(*head, cbp) {
  747. cstart = cbp->bio_offset;
  748. cend = cbp->bio_offset + cbp->bio_length;
  749. if (nstart >= cend) {
  750. /*
  751. * +-------------+
  752. * | |
  753. * | current | +-------------+
  754. * | bio | | |
  755. * | | | new |
  756. * +-------------+ | bio |
  757. * | |
  758. * +-------------+
  759. */
  760. GJ_DEBUG(3, "INSERT(%p): 1", *head);
  761. } else if (nend <= cstart) {
  762. /*
  763. * +-------------+
  764. * | |
  765. * +-------------+ | current |
  766. * | | | bio |
  767. * | new | | |
  768. * | bio | +-------------+
  769. * | |
  770. * +-------------+
  771. */
  772. nbp = g_journal_new_bio(nstart, nend, joffset, data,
  773. flags);
  774. if (pbp == NULL)
  775. *head = nbp;
  776. else
  777. pbp->bio_next = nbp;
  778. nbp->bio_next = cbp;
  779. n++;
  780. GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
  781. pbp);
  782. goto end;
  783. } else if (nstart <= cstart && nend >= cend) {
  784. /*
  785. * +-------------+ +-------------+
  786. * | current bio | | current bio |
  787. * +---+-------------+---+ +-------------+---+
  788. * | | | | | | |
  789. * | | | | | | |
  790. * | +-------------+ | +-------------+ |
  791. * | new bio | | new bio |
  792. * +---------------------+ +-----------------+
  793. *
  794. * +-------------+ +-------------+
  795. * | current bio | | current bio |
  796. * +---+-------------+ +-------------+
  797. * | | | | |
  798. * | | | | |
  799. * | +-------------+ +-------------+
  800. * | new bio | | new bio |
  801. * +-----------------+ +-------------+
  802. */
  803. g_journal_stats_bytes_skipped += cbp->bio_length;
  804. cbp->bio_offset = nstart;
  805. cbp->bio_joffset = joffset;
  806. cbp->bio_length = cend - nstart;
  807. if (cbp->bio_data != NULL) {
  808. gj_free(cbp->bio_data, cend - cstart);
  809. cbp->bio_data = NULL;
  810. }
  811. if (data != NULL) {
  812. cbp->bio_data = gj_malloc(cbp->bio_length,
  813. flags);
  814. if (cbp->bio_data != NULL) {
  815. bcopy(data, cbp->bio_data,
  816. cbp->bio_length);
  817. }
  818. data += cend - nstart;
  819. }
  820. joffset += cend - nstart;
  821. nstart = cend;
  822. GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
  823. } else if (nstart > cstart && nend >= cend) {
  824. /*
  825. * +-----------------+ +-------------+
  826. * | current bio | | current bio |
  827. * | +-------------+ | +---------+---+
  828. * | | | | | | |
  829. * | | | | | | |
  830. * +---+-------------+ +---+---------+ |
  831. * | new bio | | new bio |
  832. * +-------------+ +-------------+
  833. */
  834. g_journal_stats_bytes_skipped += cend - nstart;
  835. nbp = g_journal_new_bio(nstart, cend, joffset, data,
  836. flags);
  837. nbp->bio_next = cbp->bio_next;
  838. cbp->bio_next = nbp;
  839. cbp->bio_length = nstart - cstart;
  840. if (cbp->bio_data != NULL) {
  841. cbp->bio_data = gj_realloc(cbp->bio_data,
  842. cbp->bio_length, cend - cstart);
  843. }
  844. if (data != NULL)
  845. data += cend - nstart;
  846. joffset += cend - nstart;
  847. nstart = cend;
  848. n++;
  849. GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
  850. } else if (nstart > cstart && nend < cend) {
  851. /*
  852. * +---------------------+
  853. * | current bio |
  854. * | +-------------+ |
  855. * | | | |
  856. * | | | |
  857. * +---+-------------+---+
  858. * | new bio |
  859. * +-------------+
  860. */
  861. g_journal_stats_bytes_skipped += nend - nstart;
  862. nbp = g_journal_new_bio(nstart, nend, joffset, data,
  863. flags);
  864. nbp->bio_next = cbp->bio_next;
  865. cbp->bio_next = nbp;
  866. if (cbp->bio_data == NULL)
  867. tmpdata = NULL;
  868. else
  869. tmpdata = cbp->bio_data + nend - cstart;
  870. nbp = g_journal_new_bio(nend, cend,
  871. cbp->bio_joffset + nend - cstart, tmpdata, flags);
  872. nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
  873. ((struct bio *)cbp->bio_next)->bio_next = nbp;
  874. cbp->bio_length = nstart - cstart;
  875. if (cbp->bio_data != NULL) {
  876. cbp->bio_data = gj_realloc(cbp->bio_data,
  877. cbp->bio_length, cend - cstart);
  878. }
  879. n += 2;
  880. GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
  881. goto end;
  882. } else if (nstart <= cstart && nend < cend) {
  883. /*
  884. * +-----------------+ +-------------+
  885. * | current bio | | current bio |
  886. * +-------------+ | +---+---------+ |
  887. * | | | | | | |
  888. * | | | | | | |
  889. * +-------------+---+ | +---------+---+
  890. * | new bio | | new bio |
  891. * +-------------+ +-------------+
  892. */
  893. g_journal_stats_bytes_skipped += nend - nstart;
  894. nbp = g_journal_new_bio(nstart, nend, joffset, data,
  895. flags);
  896. if (pbp == NULL)
  897. *head = nbp;
  898. else
  899. pbp->bio_next = nbp;
  900. nbp->bio_next = cbp;
  901. cbp->bio_offset = nend;
  902. cbp->bio_length = cend - nend;
  903. cbp->bio_joffset += nend - cstart;
  904. tmpdata = cbp->bio_data;
  905. if (tmpdata != NULL) {
  906. cbp->bio_data = gj_malloc(cbp->bio_length,
  907. flags);
  908. if (cbp->bio_data != NULL) {
  909. bcopy(tmpdata + nend - cstart,
  910. cbp->bio_data, cbp->bio_length);
  911. }
  912. gj_free(tmpdata, cend - cstart);
  913. }
  914. n++;
  915. GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
  916. goto end;
  917. }
  918. if (nstart == nend)
  919. goto end;
  920. pbp = cbp;
  921. }
  922. nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
  923. if (pbp == NULL)
  924. *head = nbp;
  925. else
  926. pbp->bio_next = nbp;
  927. nbp->bio_next = NULL;
  928. n++;
  929. GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
  930. end:
  931. if (g_journal_debug >= 3) {
  932. GJQ_FOREACH(*head, cbp) {
  933. GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
  934. (intmax_t)cbp->bio_offset,
  935. (intmax_t)cbp->bio_length,
  936. (intmax_t)cbp->bio_joffset, cbp->bio_data);
  937. }
  938. GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
  939. }
  940. return (n);
  941. }
  942. /*
  943. * The function combines neighbour bios trying to squeeze as much data as
  944. * possible into one bio.
  945. *
  946. * The function returns the number of bios combined (negative value).
  947. */
  948. static int
  949. g_journal_optimize(struct bio *head)
  950. {
  951. struct bio *cbp, *pbp;
  952. int n;
  953. n = 0;
  954. pbp = NULL;
  955. GJQ_FOREACH(head, cbp) {
  956. /* Skip bios which has to be read first. */
  957. if (cbp->bio_data == NULL) {
  958. pbp = NULL;
  959. continue;
  960. }
  961. /* There is no previous bio yet. */
  962. if (pbp == NULL) {
  963. pbp = cbp;
  964. continue;
  965. }
  966. /* Is this a neighbour bio? */
  967. if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
  968. /* Be sure that bios queue is sorted. */
  969. KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
  970. ("poffset=%jd plength=%jd coffset=%jd",
  971. (intmax_t)pbp->bio_offset,
  972. (intmax_t)pbp->bio_length,
  973. (intmax_t)cbp->bio_offset));
  974. pbp = cbp;
  975. continue;
  976. }
  977. /* Be sure we don't end up with too big bio. */
  978. if (pbp->bio_length + cbp->bio_length > maxphys) {
  979. pbp = cbp;
  980. continue;
  981. }
  982. /* Ok, we can join bios. */
  983. GJ_LOGREQ(4, pbp, "Join: ");
  984. GJ_LOGREQ(4, cbp, "and: ");
  985. pbp->bio_data = gj_realloc(pbp->bio_data,
  986. pbp->bio_length + cbp->bio_length, pbp->bio_length);
  987. bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
  988. cbp->bio_length);
  989. gj_free(cbp->bio_data, cbp->bio_length);
  990. pbp->bio_length += cbp->bio_length;
  991. pbp->bio_next = cbp->bio_next;
  992. g_destroy_bio(cbp);
  993. cbp = pbp;
  994. g_journal_stats_combined_ios++;
  995. n--;
  996. GJ_LOGREQ(4, pbp, "Got: ");
  997. }
  998. return (n);
  999. }
  1000. /*
  1001. * TODO: Update comment.
  1002. * These are functions responsible for copying one portion of data from journal
  1003. * to the destination provider.
  1004. * The order goes like this:
  1005. * 1. Read the header, which contains informations about data blocks
  1006. * following it.
  1007. * 2. Read the data blocks from the journal.
  1008. * 3. Write the data blocks on the data provider.
  1009. *
  1010. * g_journal_copy_start()
  1011. * g_journal_copy_done() - got finished write request, logs potential errors.
  1012. */
  1013. /*
  1014. * When there is no data in cache, this function is used to read it.
  1015. */
  1016. static void
  1017. g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
  1018. {
  1019. struct bio *cbp;
  1020. /*
  1021. * We were short in memory, so data was freed.
  1022. * In that case we need to read it back from journal.
  1023. */
  1024. cbp = g_alloc_bio();
  1025. cbp->bio_cflags = bp->bio_cflags;
  1026. cbp->bio_parent = bp;
  1027. cbp->bio_offset = bp->bio_joffset;
  1028. cbp->bio_length = bp->bio_length;
  1029. cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
  1030. cbp->bio_cmd = BIO_READ;
  1031. cbp->bio_done = g_journal_std_done;
  1032. GJ_LOGREQ(4, cbp, "READ FIRST");
  1033. g_io_request(cbp, sc->sc_jconsumer);
  1034. g_journal_cache_misses++;
  1035. }
  1036. static void
  1037. g_journal_copy_send(struct g_journal_softc *sc)
  1038. {
  1039. struct bio *bioq, *bp, *lbp;
  1040. bioq = lbp = NULL;
  1041. mtx_lock(&sc->sc_mtx);
  1042. for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
  1043. bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
  1044. if (bp == NULL)
  1045. break;
  1046. GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
  1047. sc->sc_copy_in_progress++;
  1048. GJQ_INSERT_AFTER(bioq, bp, lbp);
  1049. lbp = bp;
  1050. }
  1051. mtx_unlock(&sc->sc_mtx);
  1052. if (g_journal_do_optimize)
  1053. sc->sc_copy_in_progress += g_journal_optimize(bioq);
  1054. while ((bp = GJQ_FIRST(bioq)) != NULL) {
  1055. GJQ_REMOVE(bioq, bp);
  1056. GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
  1057. bp->bio_cflags = GJ_BIO_COPY;
  1058. if (bp->bio_data == NULL)
  1059. g_journal_read_first(sc, bp);
  1060. else {
  1061. bp->bio_joffset = 0;
  1062. GJ_LOGREQ(4, bp, "SEND");
  1063. g_io_request(bp, sc->sc_dconsumer);
  1064. }
  1065. }
  1066. }
  1067. static void
  1068. g_journal_copy_start(struct g_journal_softc *sc)
  1069. {
  1070. /*
  1071. * Remember in metadata that we're starting to copy journaled data
  1072. * to the data provider.
  1073. * In case of power failure, we will copy these data once again on boot.
  1074. */
  1075. if (!sc->sc_journal_copying) {
  1076. sc->sc_journal_copying = 1;
  1077. GJ_DEBUG(1, "Starting copy of journal.");
  1078. g_journal_metadata_update(sc);
  1079. }
  1080. g_journal_copy_send(sc);
  1081. }
  1082. /*
  1083. * Data block has been read from the journal provider.
  1084. */
  1085. static int
  1086. g_journal_copy_read_done(struct bio *bp)
  1087. {
  1088. struct g_journal_softc *sc;
  1089. struct g_consumer *cp;
  1090. struct bio *pbp;
  1091. KASSERT(bp->bio_cflags == GJ_BIO_COPY,
  1092. ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
  1093. sc = bp->bio_from->geom->softc;
  1094. pbp = bp->bio_parent;
  1095. if (bp->bio_error != 0) {
  1096. GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
  1097. bp->bio_to->name, bp->bio_error);
  1098. /*
  1099. * We will not be able to deliver WRITE request as well.
  1100. */
  1101. gj_free(bp->bio_data, bp->bio_length);
  1102. g_destroy_bio(pbp);
  1103. g_destroy_bio(bp);
  1104. sc->sc_copy_in_progress--;
  1105. return (1);
  1106. }
  1107. pbp->bio_data = bp->bio_data;
  1108. cp = sc->sc_dconsumer;
  1109. g_io_request(pbp, cp);
  1110. GJ_LOGREQ(4, bp, "READ DONE");
  1111. g_destroy_bio(bp);
  1112. return (0);
  1113. }
  1114. /*
  1115. * Data block has been written to the data provider.
  1116. */
  1117. static void
  1118. g_journal_copy_write_done(struct bio *bp)
  1119. {
  1120. struct g_journal_softc *sc;
  1121. KASSERT(bp->bio_cflags == GJ_BIO_COPY,
  1122. ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
  1123. sc = bp->bio_from->geom->softc;
  1124. sc->sc_copy_in_progress--;
  1125. if (bp->bio_error != 0) {
  1126. GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
  1127. bp->bio_error);
  1128. }
  1129. GJQ_REMOVE(sc->sc_copy_queue, bp);
  1130. gj_free(bp->bio_data, bp->bio_length);
  1131. GJ_LOGREQ(4, bp, "DONE");
  1132. g_destroy_bio(bp);
  1133. if (sc->sc_copy_in_progress == 0) {
  1134. /*
  1135. * This was the last write request for this journal.
  1136. */
  1137. GJ_DEBUG(1, "Data has been copied.");
  1138. sc->sc_journal_copying = 0;
  1139. }
  1140. }
  1141. static void g_journal_flush_done(struct bio *bp);
  1142. /*
  1143. * Flush one record onto active journal provider.
  1144. */
  1145. static void
  1146. g_journal_flush(struct g_journal_softc *sc)
  1147. {
  1148. struct g_journal_record_header hdr;
  1149. struct g_journal_entry *ent;
  1150. struct g_provider *pp;
  1151. struct bio **bioq;
  1152. struct bio *bp, *fbp, *pbp;
  1153. off_t joffset;
  1154. u_char *data, hash[16];
  1155. MD5_CTX ctx;
  1156. u_int i;
  1157. if (sc->sc_current_count == 0)
  1158. return;
  1159. pp = sc->sc_jprovider;
  1160. GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
  1161. joffset = sc->sc_journal_offset;
  1162. GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
  1163. sc->sc_current_count, pp->name, (intmax_t)joffset);
  1164. /*
  1165. * Store 'journal id', so we know to which journal this record belongs.
  1166. */
  1167. hdr.jrh_journal_id = sc->sc_journal_id;
  1168. /* Could be less than g_journal_record_entries if called due timeout. */
  1169. hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
  1170. strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
  1171. bioq = &sc->sc_active.jj_queue;
  1172. GJQ_LAST(sc->sc_flush_queue, pbp);
  1173. fbp = g_alloc_bio();
  1174. fbp->bio_parent = NULL;
  1175. fbp->bio_cflags = GJ_BIO_JOURNAL;
  1176. fbp->bio_offset = -1;
  1177. fbp->bio_joffset = joffset;
  1178. fbp->bio_length = pp->sectorsize;
  1179. fbp->bio_cmd = BIO_WRITE;
  1180. fbp->bio_done = g_journal_std_done;
  1181. GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
  1182. pbp = fbp;
  1183. fbp->bio_to = pp;
  1184. GJ_LOGREQ(4, fbp, "FLUSH_OUT");
  1185. joffset += pp->sectorsize;
  1186. sc->sc_flush_count++;
  1187. if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
  1188. MD5Init(&ctx);
  1189. for (i = 0; i < hdr.jrh_nentries; i++) {
  1190. bp = sc->sc_current_queue;
  1191. KASSERT(bp != NULL, ("NULL bp"));
  1192. bp->bio_to = pp;
  1193. GJ_LOGREQ(4, bp, "FLUSHED");
  1194. sc->sc_current_queue = bp->bio_next;
  1195. bp->bio_next = NULL;
  1196. sc->sc_current_count--;
  1197. /* Add to the header. */
  1198. ent = &hdr.jrh_entries[i];
  1199. ent->je_offset = bp->bio_offset;
  1200. ent->je_joffset = joffset;
  1201. ent->je_length = bp->bio_length;
  1202. data = bp->bio_data;
  1203. if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
  1204. MD5Update(&ctx, data, ent->je_length);
  1205. g_reset_bio(bp);
  1206. bp->bio_cflags = GJ_BIO_JOURNAL;
  1207. bp->bio_offset = ent->je_offset;
  1208. bp->bio_joffset = ent->je_joffset;
  1209. bp->bio_length = ent->je_length;
  1210. bp->bio_data = data;
  1211. bp->bio_cmd = BIO_WRITE;
  1212. bp->bio_done = g_journal_std_done;
  1213. GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
  1214. pbp = bp;
  1215. bp->bio_to = pp;
  1216. GJ_LOGREQ(4, bp, "FLUSH_OUT");
  1217. joffset += bp->bio_length;
  1218. sc->sc_flush_count++;
  1219. /*
  1220. * Add request to the active sc_journal_queue queue.
  1221. * This is our cache. After journal switch we don't have to
  1222. * read the data from the inactive journal, because we keep
  1223. * it in memory.
  1224. */
  1225. g_journal_insert(bioq, ent->je_offset,
  1226. ent->je_offset + ent->je_length, ent->je_joffset, data,
  1227. M_NOWAIT);
  1228. }
  1229. /*
  1230. * After all requests, store valid header.
  1231. */
  1232. data = gj_malloc(pp->sectorsize, M_WAITOK);
  1233. if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
  1234. MD5Final(hash, &ctx);
  1235. bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
  1236. }
  1237. g_journal_record_header_encode(&hdr, data);
  1238. fbp->bio_data = data;
  1239. sc->sc_journal_offset = joffset;
  1240. g_journal_check_overflow(sc);
  1241. }
  1242. /*
  1243. * Flush request finished.
  1244. */
  1245. static void
  1246. g_journal_flush_done(struct bio *bp)
  1247. {
  1248. struct g_journal_softc *sc;
  1249. struct g_consumer *cp;
  1250. KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
  1251. ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
  1252. cp = bp->bio_from;
  1253. sc = cp->geom->softc;
  1254. sc->sc_flush_in_progress--;
  1255. if (bp->bio_error != 0) {
  1256. GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
  1257. bp->bio_error);
  1258. }
  1259. gj_free(bp->bio_data, bp->bio_length);
  1260. GJ_LOGREQ(4, bp, "DONE");
  1261. g_destroy_bio(bp);
  1262. }
  1263. static void g_journal_release_delayed(struct g_journal_softc *sc);
  1264. static void
  1265. g_journal_flush_send(struct g_journal_softc *sc)
  1266. {
  1267. struct g_consumer *cp;
  1268. struct bio *bioq, *bp, *lbp;
  1269. cp = sc->sc_jconsumer;
  1270. bioq = lbp = NULL;
  1271. while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
  1272. /* Send one flush request to the active journal. */
  1273. bp = GJQ_FIRST(sc->sc_flush_queue);
  1274. if (bp != NULL) {
  1275. GJQ_REMOVE(sc->sc_flush_queue, bp);
  1276. sc->sc_flush_count--;
  1277. bp->bio_offset = bp->bio_joffset;
  1278. bp->bio_joffset = 0;
  1279. sc->sc_flush_in_progress++;
  1280. GJQ_INSERT_AFTER(bioq, bp, lbp);
  1281. lbp = bp;
  1282. }
  1283. /* Try to release delayed requests. */
  1284. g_journal_release_delayed(sc);
  1285. /* If there are no requests to flush, leave. */
  1286. if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
  1287. break;
  1288. }
  1289. if (g_journal_do_optimize)
  1290. sc->sc_flush_in_progress += g_journal_optimize(bioq);
  1291. while ((bp = GJQ_FIRST(bioq)) != NULL) {
  1292. GJQ_REMOVE(bioq, bp);
  1293. GJ_LOGREQ(3, bp, "Flush request send");
  1294. g_io_request(bp, cp);
  1295. }
  1296. }
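/*
 * A short note on the offset swap above (editorial, based on the code as
 * written): each bio taken from the flush queue carries its journal location
 * in bio_joffset, assigned by g_journal_flush().  Before the bio is handed to
 * the journal consumer, that value is moved into bio_offset and bio_joffset
 * is cleared, so the same bio that described a data-provider offset in the
 * record header is now written into the journal area.  At most
 * g_journal_parallel_flushes such writes are kept in flight.
 */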
  1297. static void
  1298. g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
  1299. {
  1300. int n;
  1301. GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
  1302. n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
  1303. sc->sc_current_count += n;
  1304. n = g_journal_optimize(sc->sc_current_queue);
  1305. sc->sc_current_count += n;
  1306. /*
1307. * For requests added to the current queue we deliver the
1308. * response immediately.
  1309. */
  1310. bp->bio_completed = bp->bio_length;
  1311. g_io_deliver(bp, 0);
  1312. if (sc->sc_current_count >= g_journal_record_entries) {
  1313. /*
1314. * Let's flush one record onto the active journal provider.
  1315. */
  1316. g_journal_flush(sc);
  1317. }
  1318. }
  1319. static void
  1320. g_journal_release_delayed(struct g_journal_softc *sc)
  1321. {
  1322. struct bio *bp;
  1323. for (;;) {
  1324. /* The flush queue is full, exit. */
  1325. if (sc->sc_flush_count >= g_journal_accept_immediately)
  1326. return;
  1327. bp = bioq_takefirst(&sc->sc_delayed_queue);
  1328. if (bp == NULL)
  1329. return;
  1330. sc->sc_delayed_count--;
  1331. g_journal_add_current(sc, bp);
  1332. }
  1333. }
  1334. /*
1335. * Add the I/O request to the current queue. If we have enough requests for
1336. * one journal record, we flush them onto the active journal provider.
  1337. */
  1338. static void
  1339. g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
  1340. {
  1341. /*
1342. * The flush queue is full or requests are already delayed; delay this one too.
  1343. */
  1344. if (sc->sc_delayed_count > 0 ||
  1345. sc->sc_flush_count >= g_journal_accept_immediately) {
  1346. GJ_LOGREQ(4, bp, "DELAYED");
  1347. bioq_insert_tail(&sc->sc_delayed_queue, bp);
  1348. sc->sc_delayed_count++;
  1349. return;
  1350. }
  1351. KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
  1352. ("DELAYED queue not empty."));
  1353. g_journal_add_current(sc, bp);
  1354. }
  1355. static void g_journal_read_done(struct bio *bp);
  1356. /*
1357. * Try to find the requested data in the cache.
  1358. */
  1359. static struct bio *
  1360. g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
  1361. off_t oend)
  1362. {
  1363. off_t cstart, cend;
  1364. struct bio *bp;
  1365. GJQ_FOREACH(head, bp) {
  1366. if (bp->bio_offset == -1)
  1367. continue;
  1368. cstart = MAX(ostart, bp->bio_offset);
  1369. cend = MIN(oend, bp->bio_offset + bp->bio_length);
  1370. if (cend <= ostart)
  1371. continue;
  1372. else if (cstart >= oend) {
  1373. if (!sorted)
  1374. continue;
  1375. else {
  1376. bp = NULL;
  1377. break;
  1378. }
  1379. }
  1380. if (bp->bio_data == NULL)
  1381. break;
  1382. GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
  1383. bp);
  1384. bcopy(bp->bio_data + cstart - bp->bio_offset,
  1385. pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
  1386. pbp->bio_completed += cend - cstart;
  1387. if (pbp->bio_completed == pbp->bio_length) {
  1388. /*
  1389. * Cool, the whole request was in cache, deliver happy
  1390. * message.
  1391. */
  1392. g_io_deliver(pbp, 0);
  1393. return (pbp);
  1394. }
  1395. break;
  1396. }
  1397. return (bp);
  1398. }
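/*
 * A worked example of the overlap arithmetic above (values are illustrative
 * only).  Suppose a cached bio covers [65536, 131072) and the caller asks for
 * [98304, 163840):
 *
 *   cstart = MAX(98304, 65536)          = 98304
 *   cend   = MIN(163840, 65536 + 65536) = 131072
 *
 * so 32768 bytes are copied from offset 98304 - 65536 = 32768 within the
 * cached buffer and bio_completed grows by 32768.  The remaining range
 * [131072, 163840) has to be satisfied elsewhere by g_journal_read().
 */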
  1399. /*
  1400. * This function is used for collecting data on read.
1401. * The complexity comes from the fact that parts of the data can be stored in
1402. * four different places:
1403. * - in memory - the data not yet sent to the active journal provider
  1404. * - in the active journal
  1405. * - in the inactive journal
  1406. * - in the data provider
  1407. */
  1408. static void
  1409. g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
  1410. off_t oend)
  1411. {
  1412. struct bio *bp, *nbp, *head;
  1413. off_t cstart, cend;
  1414. u_int i, sorted = 0;
  1415. GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
  1416. cstart = cend = -1;
  1417. bp = NULL;
  1418. head = NULL;
  1419. for (i = 1; i <= 5; i++) {
  1420. switch (i) {
1421. case 1: /* Not-yet-sent data. */
  1422. head = sc->sc_current_queue;
  1423. sorted = 1;
  1424. break;
1425. case 2: /* Skip the flush queue; its requests are also in the active queue. */
  1426. continue;
  1427. case 3: /* Active journal. */
  1428. head = sc->sc_active.jj_queue;
  1429. sorted = 1;
  1430. break;
  1431. case 4: /* Inactive journal. */
  1432. /*
1433. * XXX: There could be a race here with g_journal_lowmem().
  1434. */
  1435. head = sc->sc_inactive.jj_queue;
  1436. sorted = 1;
  1437. break;
  1438. case 5: /* In-flight to the data provider. */
  1439. head = sc->sc_copy_queue;
  1440. sorted = 0;
  1441. break;
  1442. default:
  1443. panic("gjournal %s: i=%d", __func__, i);
  1444. }
  1445. bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
  1446. if (bp == pbp) { /* Got the whole request. */
  1447. GJ_DEBUG(2, "Got the whole request from %u.", i);
  1448. return;
  1449. } else if (bp != NULL) {
  1450. cstart = MAX(ostart, bp->bio_offset);
  1451. cend = MIN(oend, bp->bio_offset + bp->bio_length);
  1452. GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
  1453. i, (intmax_t)cstart, (intmax_t)cend);
  1454. break;
  1455. }
  1456. }
  1457. if (bp != NULL) {
  1458. if (bp->bio_data == NULL) {
  1459. nbp = g_duplicate_bio(pbp);
  1460. nbp->bio_cflags = GJ_BIO_READ;
  1461. nbp->bio_data =
  1462. pbp->bio_data + cstart - pbp->bio_offset;
  1463. nbp->bio_offset =
  1464. bp->bio_joffset + cstart - bp->bio_offset;
  1465. nbp->bio_length = cend - cstart;
  1466. nbp->bio_done = g_journal_read_done;
  1467. g_io_request(nbp, sc->sc_jconsumer);
  1468. }
  1469. /*
  1470. * If we don't have the whole request yet, call g_journal_read()
  1471. * recursively.
  1472. */
  1473. if (ostart < cstart)
  1474. g_journal_read(sc, pbp, ostart, cstart);
  1475. if (oend > cend)
  1476. g_journal_read(sc, pbp, cend, oend);
  1477. } else {
  1478. /*
  1479. * No data in memory, no data in journal.
1480. * It's time to ask the data provider.
  1481. */
  1482. GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
  1483. nbp = g_duplicate_bio(pbp);
  1484. nbp->bio_cflags = GJ_BIO_READ;
  1485. nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
  1486. nbp->bio_offset = ostart;
  1487. nbp->bio_length = oend - ostart;
  1488. nbp->bio_done = g_journal_read_done;
  1489. g_io_request(nbp, sc->sc_dconsumer);
  1490. /* We have the whole request, return here. */
  1491. return;
  1492. }
  1493. }
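/*
 * Illustrative example of the recursion above: for a request covering
 * [0, 131072), a hit that only covers [32768, 65536) results in two recursive
 * calls, g_journal_read(sc, pbp, 0, 32768) and
 * g_journal_read(sc, pbp, 65536, 131072).  Each of them walks the queues
 * again and, in the worst case, falls through to the data provider.
 */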
  1494. /*
  1495. * Function responsible for handling finished READ requests.
1496. * Actually, g_std_done() could be used here; the only difference is that we
1497. * log the error.
  1498. */
  1499. static void
  1500. g_journal_read_done(struct bio *bp)
  1501. {
  1502. struct bio *pbp;
  1503. KASSERT(bp->bio_cflags == GJ_BIO_READ,
  1504. ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
  1505. pbp = bp->bio_parent;
  1506. pbp->bio_inbed++;
  1507. pbp->bio_completed += bp->bio_length;
  1508. if (bp->bio_error != 0) {
  1509. if (pbp->bio_error == 0)
  1510. pbp->bio_error = bp->bio_error;
  1511. GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
  1512. bp->bio_to->name, bp->bio_error);
  1513. }
  1514. g_destroy_bio(bp);
  1515. if (pbp->bio_children == pbp->bio_inbed &&
  1516. pbp->bio_completed == pbp->bio_length) {
  1517. /* We're done. */
  1518. g_io_deliver(pbp, 0);
  1519. }
  1520. }
  1521. /*
1522. * Deactivate the current journal and activate the next one.
  1523. */
  1524. static void
  1525. g_journal_switch(struct g_journal_softc *sc)
  1526. {
  1527. struct g_provider *pp;
  1528. if (JEMPTY(sc)) {
  1529. GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
  1530. pp = LIST_FIRST(&sc->sc_geom->provider);
  1531. if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
  1532. sc->sc_flags |= GJF_DEVICE_CLEAN;
  1533. GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
  1534. g_journal_metadata_update(sc);
  1535. }
  1536. } else {
  1537. GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
  1538. pp = sc->sc_jprovider;
  1539. sc->sc_journal_previous_id = sc->sc_journal_id;
  1540. sc->sc_journal_id = sc->sc_journal_next_id;
  1541. sc->sc_journal_next_id = arc4random();
  1542. GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
  1543. g_journal_write_header(sc);
  1544. sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
  1545. sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
  1546. sc->sc_active.jj_offset =
  1547. sc->sc_journal_offset - pp->sectorsize;
  1548. sc->sc_active.jj_queue = NULL;
  1549. /*
  1550. * Switch is done, start copying data from the (now) inactive
  1551. * journal to the data provider.
  1552. */
  1553. g_journal_copy_start(sc);
  1554. }
  1555. mtx_lock(&sc->sc_mtx);
  1556. sc->sc_flags &= ~GJF_DEVICE_SWITCH;
  1557. mtx_unlock(&sc->sc_mtx);
  1558. }
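/*
 * A rough sketch of what a non-empty switch above does: a fresh journal
 * header is written at the current journal offset (terminating the old
 * journal and announcing the next journal ID), the old active journal becomes
 * the inactive one (its queue keeps the cached data that still has to be
 * copied), the new header's offset becomes the start of the new active
 * journal, and g_journal_copy_start() begins moving the inactive journal's
 * data onto the data provider.
 */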
  1559. static void
  1560. g_journal_initialize(struct g_journal_softc *sc)
  1561. {
  1562. sc->sc_journal_id = arc4random();
  1563. sc->sc_journal_next_id = arc4random();
  1564. sc->sc_journal_previous_id = sc->sc_journal_id;
  1565. sc->sc_journal_offset = sc->sc_jstart;
  1566. sc->sc_inactive.jj_offset = sc->sc_jstart;
  1567. g_journal_write_header(sc);
  1568. sc->sc_active.jj_offset = sc->sc_jstart;
  1569. }
  1570. static void
  1571. g_journal_mark_as_dirty(struct g_journal_softc *sc)
  1572. {
  1573. const struct g_journal_desc *desc;
  1574. int i;
  1575. GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
  1576. for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
  1577. desc->jd_dirty(sc->sc_dconsumer);
  1578. }
  1579. /*
1580. * Read a record header from the given journal.
  1581. * It is very similar to g_read_data(9), but it doesn't allocate memory for bio
  1582. * and data on every call.
  1583. */
  1584. static int
  1585. g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
  1586. void *data)
  1587. {
  1588. int error;
  1589. g_reset_bio(bp);
  1590. bp->bio_cmd = BIO_READ;
  1591. bp->bio_done = NULL;
  1592. bp->bio_offset = offset;
  1593. bp->bio_length = cp->provider->sectorsize;
  1594. bp->bio_data = data;
  1595. g_io_request(bp, cp);
  1596. error = biowait(bp, "gjs_read");
  1597. return (error);
  1598. }
  1599. #if 0
  1600. /*
1601. * This function is called when we start the journal device and detect that
1602. * one of the journals was not fully copied.
1603. * Its purpose is to read all record headers from the journal and place them
1604. * in the inactive queue, so we can start the journal synchronization
1605. * process and the journal provider itself.
1606. * The design decision was to not synchronize the whole journal here, as
1607. * that can take too much time. Reading only the headers and delaying the
1608. * synchronization until after the journal provider is started should work best.
  1609. */
  1610. #endif
  1611. static void
  1612. g_journal_sync(struct g_journal_softc *sc)
  1613. {
  1614. struct g_journal_record_header rhdr;
  1615. struct g_journal_entry *ent;
  1616. struct g_journal_header jhdr;
  1617. struct g_consumer *cp;
  1618. struct bio *bp, *fbp, *tbp;
  1619. off_t joffset, offset;
  1620. u_char *buf, sum[16];
  1621. uint64_t id;
  1622. MD5_CTX ctx;
  1623. int error, found, i;
  1624. found = 0;
  1625. fbp = NULL;
  1626. cp = sc->sc_jconsumer;
  1627. bp = g_alloc_bio();
  1628. buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
  1629. offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
  1630. GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
  1631. /*
  1632. * Read and decode first journal header.
  1633. */
  1634. error = g_journal_sync_read(cp, bp, offset, buf);
  1635. if (error != 0) {
  1636. GJ_DEBUG(0, "Error while reading journal header from %s.",
  1637. cp->provider->name);
  1638. goto end;
  1639. }
  1640. error = g_journal_header_decode(buf, &jhdr);
  1641. if (error != 0) {
  1642. GJ_DEBUG(0, "Cannot decode journal header from %s.",
  1643. cp->provider->name);
  1644. goto end;
  1645. }
  1646. id = sc->sc_journal_id;
  1647. if (jhdr.jh_journal_id != sc->sc_journal_id) {
  1648. GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
  1649. (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
  1650. goto end;
  1651. }
  1652. offset += cp->provider->sectorsize;
  1653. id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
  1654. for (;;) {
  1655. /*
1656. * If the biggest possible record wouldn't fit before the journal's end,
1657. * wrap around and look for a record or journal header at the beginning.
  1658. */
  1659. GJ_VALIDATE_OFFSET(offset, sc);
  1660. error = g_journal_sync_read(cp, bp, offset, buf);
  1661. if (error != 0) {
  1662. /*
1663. * Not good. An error while reading a header
1664. * means that we cannot read the following headers
1665. * and, in consequence, cannot find the termination.
  1666. */
  1667. GJ_DEBUG(0,
  1668. "Error while reading record header from %s.",
  1669. cp->provider->name);
  1670. break;
  1671. }
  1672. error = g_journal_record_header_decode(buf, &rhdr);
  1673. if (error != 0) {
  1674. GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
  1675. (intmax_t)offset, error);
  1676. /*
  1677. * This is not a record header.
1678. * If we are lucky, this is the next journal header.
  1679. */
  1680. error = g_journal_header_decode(buf, &jhdr);
  1681. if (error != 0) {
  1682. GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
  1683. (intmax_t)offset, error);
  1684. /*
1685. * Nope, this is not a journal header, which
1686. * basically means that the journal is not
1687. * properly terminated.
  1688. */
  1689. error = ENOENT;
  1690. break;
  1691. }
  1692. /*
1693. * Ok. This is the header of _some_ journal. Now we need to
1694. * verify whether it is the header of the _next_ journal.
  1695. */
  1696. if (jhdr.jh_journal_id != id) {
  1697. GJ_DEBUG(1, "Journal ID mismatch at %jd "
  1698. "(0x%08x != 0x%08x).", (intmax_t)offset,
  1699. (u_int)jhdr.jh_journal_id, (u_int)id);
  1700. error = ENOENT;
  1701. break;
  1702. }
  1703. /* Found termination. */
  1704. found++;
  1705. GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
  1706. (intmax_t)offset, (u_int)id);
  1707. sc->sc_active.jj_offset = offset;
  1708. sc->sc_journal_offset =
  1709. offset + cp->provider->sectorsize;
  1710. sc->sc_journal_id = id;
  1711. id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
  1712. while ((tbp = fbp) != NULL) {
  1713. fbp = tbp->bio_next;
  1714. GJ_LOGREQ(3, tbp, "Adding request.");
  1715. g_journal_insert_bio(&sc->sc_inactive.jj_queue,
  1716. tbp, M_WAITOK);
  1717. }
  1718. /* Skip journal's header. */
  1719. offset += cp->provider->sectorsize;
  1720. continue;
  1721. }
  1722. /* Skip record's header. */
  1723. offset += cp->provider->sectorsize;
  1724. /*
  1725. * Add information about every record entry to the inactive
  1726. * queue.
  1727. */
  1728. if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
  1729. MD5Init(&ctx);
  1730. for (i = 0; i < rhdr.jrh_nentries; i++) {
  1731. ent = &rhdr.jrh_entries[i];
  1732. GJ_DEBUG(3, "Insert entry: %jd %jd.",
  1733. (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
  1734. g_journal_insert(&fbp, ent->je_offset,
  1735. ent->je_offset + ent->je_length, ent->je_joffset,
  1736. NULL, M_WAITOK);
  1737. if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
  1738. u_char *buf2;
  1739. /*
  1740. * TODO: Should use faster function (like
  1741. * g_journal_sync_read()).
  1742. */
  1743. buf2 = g_read_data(cp, offset, ent->je_length,
  1744. NULL);
  1745. if (buf2 == NULL)
  1746. GJ_DEBUG(0, "Cannot read data at %jd.",
  1747. (intmax_t)offset);
  1748. else {
  1749. MD5Update(&ctx, buf2, ent->je_length);
  1750. g_free(buf2);
  1751. }
  1752. }
  1753. /* Skip entry's data. */
  1754. offset += ent->je_length;
  1755. }
  1756. if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
  1757. MD5Final(sum, &ctx);
  1758. if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
  1759. GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
  1760. (intmax_t)offset);
  1761. }
  1762. }
  1763. }
  1764. end:
  1765. gj_free(bp->bio_data, cp->provider->sectorsize);
  1766. g_destroy_bio(bp);
  1767. /* Remove bios from unterminated journal. */
  1768. while ((tbp = fbp) != NULL) {
  1769. fbp = tbp->bio_next;
  1770. g_destroy_bio(tbp);
  1771. }
  1772. if (found < 1 && joffset > 0) {
  1773. GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
  1774. sc->sc_name);
  1775. while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
  1776. sc->sc_inactive.jj_queue = tbp->bio_next;
  1777. g_destroy_bio(tbp);
  1778. }
  1779. g_journal_initialize(sc);
  1780. g_journal_mark_as_dirty(sc);
  1781. } else {
  1782. GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
  1783. g_journal_copy_start(sc);
  1784. }
  1785. }
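/*
 * Summary sketch of the scan above: starting at the last recorded journal
 * offset, each sector is decoded first as a record header (its entries are
 * queued as data-less bios destined for the inactive queue) and, failing
 * that, as a journal header whose ID must match the expected "next" ID to
 * count as a termination.  If no termination is found at all, the journal is
 * treated as broken: the collected bios are dropped, the journal is
 * reinitialized and the file system is marked as dirty.  Otherwise the
 * journal is consistent and copying of the inactive journal can start.
 */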
  1786. /*
  1787. * Wait for requests.
1788. * If we have requests in the current queue, flush them 3 seconds after the
1789. * last flush. This way we don't wait forever (or for a journal switch) before
1790. * storing partially filled records in the journal.
  1791. */
  1792. static void
  1793. g_journal_wait(struct g_journal_softc *sc, time_t last_write)
  1794. {
  1795. int error, timeout;
  1796. GJ_DEBUG(3, "%s: enter", __func__);
  1797. if (sc->sc_current_count == 0) {
  1798. if (g_journal_debug < 2)
  1799. msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
  1800. else {
  1801. /*
1802. * If we have debug turned on, show the number of elements
1803. * in the various queues.
  1804. */
  1805. for (;;) {
  1806. error = msleep(sc, &sc->sc_mtx, PRIBIO,
  1807. "gj:work", hz * 3);
  1808. if (error == 0) {
  1809. mtx_unlock(&sc->sc_mtx);
  1810. break;
  1811. }
  1812. GJ_DEBUG(3, "Report: current count=%d",
  1813. sc->sc_current_count);
  1814. GJ_DEBUG(3, "Report: flush count=%d",
  1815. sc->sc_flush_count);
  1816. GJ_DEBUG(3, "Report: flush in progress=%d",
  1817. sc->sc_flush_in_progress);
  1818. GJ_DEBUG(3, "Report: copy in progress=%d",
  1819. sc->sc_copy_in_progress);
  1820. GJ_DEBUG(3, "Report: delayed=%d",
  1821. sc->sc_delayed_count);
  1822. }
  1823. }
  1824. GJ_DEBUG(3, "%s: exit 1", __func__);
  1825. return;
  1826. }
  1827. /*
1828. * Flush records every 3 seconds, even if they are not full.
  1829. */
  1830. timeout = (last_write + 3 - time_second) * hz;
  1831. if (timeout <= 0) {
  1832. mtx_unlock(&sc->sc_mtx);
  1833. g_journal_flush(sc);
  1834. g_journal_flush_send(sc);
  1835. GJ_DEBUG(3, "%s: exit 2", __func__);
  1836. return;
  1837. }
  1838. error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
  1839. if (error == EWOULDBLOCK)
  1840. g_journal_flush_send(sc);
  1841. GJ_DEBUG(3, "%s: exit 3", __func__);
  1842. }
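/*
 * Illustrative timeout arithmetic for the wait above: with the 3 second
 * limit, if the last journal write happened 2 seconds ago the worker sleeps
 * for at most (last_write + 3 - time_second) * hz = 1 * hz ticks.  Once 3 or
 * more seconds have passed, the timeout is <= 0 and the pending (possibly
 * partial) record is flushed immediately.
 */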
  1843. /*
  1844. * Worker thread.
  1845. */
  1846. static void
  1847. g_journal_worker(void *arg)
  1848. {
  1849. struct g_journal_softc *sc;
  1850. struct g_geom *gp;
  1851. struct g_provider *pp;
  1852. struct bio *bp;
  1853. time_t last_write;
  1854. int type;
  1855. thread_lock(curthread);
  1856. sched_prio(curthread, PRIBIO);
  1857. thread_unlock(curthread);
  1858. sc = arg;
  1859. type = 0; /* gcc */
  1860. if (sc->sc_flags & GJF_DEVICE_CLEAN) {
  1861. GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
  1862. g_journal_initialize(sc);
  1863. } else {
  1864. g_journal_sync(sc);
  1865. }
  1866. /*
  1867. * Check if we can use BIO_FLUSH.
  1868. */
  1869. sc->sc_bio_flush = 0;
  1870. if (g_io_flush(sc->sc_jconsumer) == 0) {
  1871. sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
  1872. GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
  1873. sc->sc_jconsumer->provider->name);
  1874. } else {
  1875. GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
  1876. sc->sc_jconsumer->provider->name);
  1877. }
  1878. if (sc->sc_jconsumer != sc->sc_dconsumer) {
  1879. if (g_io_flush(sc->sc_dconsumer) == 0) {
  1880. sc->sc_bio_flush |= GJ_FLUSH_DATA;
  1881. GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
  1882. sc->sc_dconsumer->provider->name);
  1883. } else {
  1884. GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
  1885. sc->sc_dconsumer->provider->name);
  1886. }
  1887. }
  1888. gp = sc->sc_geom;
  1889. g_topology_lock();
  1890. pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
  1891. pp->mediasize = sc->sc_mediasize;
  1892. /*
1893. * There could be a problem when the data provider and the journal provider
1894. * have different sector sizes, but such a scenario is prevented at journal
1895. * creation time.
  1896. */
  1897. pp->sectorsize = sc->sc_sectorsize;
  1898. g_error_provider(pp, 0);
  1899. g_topology_unlock();
  1900. last_write = time_second;
  1901. if (sc->sc_rootmount != NULL) {
  1902. GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
  1903. root_mount_rel(sc->sc_rootmount);
  1904. sc->sc_rootmount = NULL;
  1905. }
  1906. for (;;) {
  1907. /* Get first request from the queue. */
  1908. mtx_lock(&sc->sc_mtx);
  1909. bp = bioq_first(&sc->sc_back_queue);
  1910. if (bp != NULL)
  1911. type = (bp->bio_cflags & GJ_BIO_MASK);
  1912. if (bp == NULL) {
  1913. bp = bioq_first(&sc->sc_regular_queue);
  1914. if (bp != NULL)
  1915. type = GJ_BIO_REGULAR;
  1916. }
  1917. if (bp == NULL) {
  1918. try_switch:
  1919. if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
  1920. (sc->sc_flags & GJF_DEVICE_DESTROY)) {
  1921. if (sc->sc_current_count > 0) {
  1922. mtx_unlock(&sc->sc_mtx);
  1923. g_journal_flush(sc);
  1924. g_journal_flush_send(sc);
  1925. continue;
  1926. }
  1927. if (sc->sc_flush_in_progress > 0)
  1928. goto sleep;
  1929. if (sc->sc_copy_in_progress > 0)
  1930. goto sleep;
  1931. }
  1932. if (sc->sc_flags & GJF_DEVICE_SWITCH) {
  1933. mtx_unlock(&sc->sc_mtx);
  1934. g_journal_switch(sc);
  1935. wakeup(&sc->sc_journal_copying);
  1936. continue;
  1937. }
  1938. if (sc->sc_flags & GJF_DEVICE_DESTROY) {
  1939. GJ_DEBUG(1, "Shutting down worker "
  1940. "thread for %s.", gp->name);
  1941. sc->sc_worker = NULL;
  1942. wakeup(&sc->sc_worker);
  1943. mtx_unlock(&sc->sc_mtx);
  1944. kproc_exit(0);
  1945. }
  1946. sleep:
  1947. g_journal_wait(sc, last_write);
  1948. continue;
  1949. }
  1950. /*
1951. * If we're in the middle of a journal switch, we need to delay
1952. * all new write requests until it is done.
  1953. */
  1954. if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
  1955. type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
  1956. GJ_LOGREQ(2, bp, "WRITE on SWITCH");
  1957. goto try_switch;
  1958. }
  1959. if (type == GJ_BIO_REGULAR)
  1960. bioq_remove(&sc->sc_regular_queue, bp);
  1961. else
  1962. bioq_remove(&sc->sc_back_queue, bp);
  1963. mtx_unlock(&sc->sc_mtx);
  1964. switch (type) {
  1965. case GJ_BIO_REGULAR:
  1966. /* Regular request. */
  1967. switch (bp->bio_cmd) {
  1968. case BIO_READ:
  1969. g_journal_read(sc, bp, bp->bio_offset,
  1970. bp->bio_offset + bp->bio_length);
  1971. break;
  1972. case BIO_WRITE:
  1973. last_write = time_second;
  1974. g_journal_add_request(sc, bp);
  1975. g_journal_flush_send(sc);
  1976. break;
  1977. default:
  1978. panic("Invalid bio_cmd (%d).", bp->bio_cmd);
  1979. }
  1980. break;
  1981. case GJ_BIO_COPY:
  1982. switch (bp->bio_cmd) {
  1983. case BIO_READ:
  1984. if (g_journal_copy_read_done(bp))
  1985. g_journal_copy_send(sc);
  1986. break;
  1987. case BIO_WRITE:
  1988. g_journal_copy_write_done(bp);
  1989. g_journal_copy_send(sc);
  1990. break;
  1991. default:
  1992. panic("Invalid bio_cmd (%d).", bp->bio_cmd);
  1993. }
  1994. break;
  1995. case GJ_BIO_JOURNAL:
  1996. g_journal_flush_done(bp);
  1997. g_journal_flush_send(sc);
  1998. break;
  1999. case GJ_BIO_READ:
  2000. default:
  2001. panic("Invalid bio (%d).", type);
  2002. }
  2003. }
  2004. }
  2005. static void
  2006. g_journal_destroy_event(void *arg, int flags __unused)
  2007. {
  2008. struct g_journal_softc *sc;
  2009. g_topology_assert();
  2010. sc = arg;
  2011. g_journal_destroy(sc);
  2012. }
  2013. static void
  2014. g_journal_timeout(void *arg)
  2015. {
  2016. struct g_journal_softc *sc;
  2017. sc = arg;
  2018. GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
  2019. sc->sc_geom->name);
  2020. g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
  2021. }
  2022. static struct g_geom *
  2023. g_journal_create(struct g_class *mp, struct g_provider *pp,
  2024. const struct g_journal_metadata *md)
  2025. {
  2026. struct g_journal_softc *sc;
  2027. struct g_geom *gp;
  2028. struct g_consumer *cp;
  2029. int error;
  2030. sc = NULL; /* gcc */
  2031. g_topology_assert();
  2032. /*
  2033. * There are two possibilities:
  2034. * 1. Data and both journals are on the same provider.
2035. * 2. Data and journals are all on separate providers.
  2036. */
  2037. /* Look for journal device with the same ID. */
  2038. LIST_FOREACH(gp, &mp->geom, geom) {
  2039. sc = gp->softc;
  2040. if (sc == NULL)
  2041. continue;
  2042. if (sc->sc_id == md->md_id)
  2043. break;
  2044. }
  2045. if (gp == NULL)
  2046. sc = NULL;
  2047. else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
  2048. GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
  2049. return (NULL);
  2050. }
  2051. if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
  2052. GJ_DEBUG(0, "Invalid type on %s.", pp->name);
  2053. return (NULL);
  2054. }
  2055. if (md->md_type & GJ_TYPE_DATA) {
  2056. GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
  2057. pp->name);
  2058. }
  2059. if (md->md_type & GJ_TYPE_JOURNAL) {
  2060. GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
  2061. pp->name);
  2062. }
  2063. if (sc == NULL) {
  2064. /* Action geom. */
  2065. sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
  2066. sc->sc_id = md->md_id;
  2067. sc->sc_type = 0;
  2068. sc->sc_flags = 0;
  2069. sc->sc_worker = NULL;
  2070. gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
  2071. gp->start = g_journal_start;
  2072. gp->orphan = g_journal_orphan;
  2073. gp->access = g_journal_access;
  2074. gp->softc = sc;
  2075. gp->flags |= G_GEOM_VOLATILE_BIO;
  2076. sc->sc_geom = gp;
  2077. mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
  2078. bioq_init(&sc->sc_back_queue);
  2079. bioq_init(&sc->sc_regular_queue);
  2080. bioq_init(&sc->sc_delayed_queue);
  2081. sc->sc_delayed_count = 0;
  2082. sc->sc_current_queue = NULL;
  2083. sc->sc_current_count = 0;
  2084. sc->sc_flush_queue = NULL;
  2085. sc->sc_flush_count = 0;
  2086. sc->sc_flush_in_progress = 0;
  2087. sc->sc_copy_queue = NULL;
  2088. sc->sc_copy_in_progress = 0;
  2089. sc->sc_inactive.jj_queue = NULL;
  2090. sc->sc_active.jj_queue = NULL;
  2091. sc->sc_rootmount = root_mount_hold("GJOURNAL");
  2092. GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
  2093. callout_init(&sc->sc_callout, 1);
  2094. if (md->md_type != GJ_TYPE_COMPLETE) {
  2095. /*
  2096. * Journal and data are on separate providers.
  2097. * At this point we have only one of them.
2098. * We set up a timeout in case the other part does not
2099. * appear, so we won't wait forever.
  2100. */
  2101. callout_reset(&sc->sc_callout, 5 * hz,
  2102. g_journal_timeout, sc);
  2103. }
  2104. }
  2105. /* Remember type of the data provider. */
  2106. if (md->md_type & GJ_TYPE_DATA)
  2107. sc->sc_orig_type = md->md_type;
  2108. sc->sc_type |= md->md_type;
  2109. cp = NULL;
  2110. if (md->md_type & GJ_TYPE_DATA) {
  2111. if (md->md_flags & GJ_FLAG_CLEAN)
  2112. sc->sc_flags |= GJF_DEVICE_CLEAN;
  2113. if (md->md_flags & GJ_FLAG_CHECKSUM)
  2114. sc->sc_flags |= GJF_DEVICE_CHECKSUM;
  2115. cp = g_new_consumer(gp);
  2116. error = g_attach(cp, pp);
  2117. KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
  2118. pp->name, error));
  2119. error = g_access(cp, 1, 1, 1);
  2120. if (error != 0) {
  2121. GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
  2122. error);
  2123. g_journal_destroy(sc);
  2124. return (NULL);
  2125. }
  2126. sc->sc_dconsumer = cp;
  2127. sc->sc_mediasize = pp->mediasize - pp->sectorsize;
  2128. sc->sc_sectorsize = pp->sectorsize;
  2129. sc->sc_jstart = md->md_jstart;
  2130. sc->sc_jend = md->md_jend;
  2131. if (md->md_provider[0] != '\0')
  2132. sc->sc_flags |= GJF_DEVICE_HARDCODED;
  2133. sc->sc_journal_offset = md->md_joffset;
  2134. sc->sc_journal_id = md->md_jid;
  2135. sc->sc_journal_previous_id = md->md_jid;
  2136. }
  2137. if (md->md_type & GJ_TYPE_JOURNAL) {
  2138. if (cp == NULL) {
  2139. cp = g_new_consumer(gp);
  2140. error = g_attach(cp, pp);
  2141. KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
  2142. pp->name, error));
  2143. error = g_access(cp, 1, 1, 1);
  2144. if (error != 0) {
  2145. GJ_DEBUG(0, "Cannot access %s (error=%d).",
  2146. pp->name, error);
  2147. g_journal_destroy(sc);
  2148. return (NULL);
  2149. }
  2150. } else {
  2151. /*
2152. * The journal is on the same provider as the data, which means
2153. * that the data provider ends where the journal starts.
  2154. */
  2155. sc->sc_mediasize = md->md_jstart;
  2156. }
  2157. sc->sc_jconsumer = cp;
  2158. }
  2159. /* Start switcher kproc if needed. */
  2160. if (g_journal_switcher_proc == NULL)
  2161. g_journal_start_switcher(mp);
  2162. if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
  2163. /* Journal is not complete yet. */
  2164. return (gp);
  2165. } else {
  2166. /* Journal complete, cancel timeout. */
  2167. callout_drain(&sc->sc_callout);
  2168. }
  2169. error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
  2170. "g_journal %s", sc->sc_name);
  2171. if (error != 0) {
  2172. GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
  2173. sc->sc_name);
  2174. g_journal_destroy(sc);
  2175. return (NULL);
  2176. }
  2177. return (gp);
  2178. }
  2179. static void
  2180. g_journal_destroy_consumer(void *arg, int flags __unused)
  2181. {
  2182. struct g_consumer *cp;
  2183. g_topology_assert();
  2184. cp = arg;
  2185. g_detach(cp);
  2186. g_destroy_consumer(cp);
  2187. }
  2188. static int
  2189. g_journal_destroy(struct g_journal_softc *sc)
  2190. {
  2191. struct g_geom *gp;
  2192. struct g_provider *pp;
  2193. struct g_consumer *cp;
  2194. g_topology_assert();
  2195. if (sc == NULL)
  2196. return (ENXIO);
  2197. gp = sc->sc_geom;
  2198. pp = LIST_FIRST(&gp->provider);
  2199. if (pp != NULL) {
  2200. if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
  2201. GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
  2202. pp->name, pp->acr, pp->acw, pp->ace);
  2203. return (EBUSY);
  2204. }
  2205. g_error_provider(pp, ENXIO);
  2206. g_journal_flush(sc);
  2207. g_journal_flush_send(sc);
  2208. g_journal_switch(sc);
  2209. }
  2210. sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
  2211. g_topology_unlock();
  2212. if (sc->sc_rootmount != NULL) {
  2213. GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
  2214. root_mount_rel(sc->sc_rootmount);
  2215. sc->sc_rootmount = NULL;
  2216. }
  2217. callout_drain(&sc->sc_callout);
  2218. mtx_lock(&sc->sc_mtx);
  2219. wakeup(sc);
  2220. while (sc->sc_worker != NULL)
  2221. msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
  2222. mtx_unlock(&sc->sc_mtx);
  2223. if (pp != NULL) {
  2224. GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
  2225. g_journal_metadata_update(sc);
  2226. g_topology_lock();
  2227. g_wither_provider(pp, ENXIO);
  2228. } else {
  2229. g_topology_lock();
  2230. }
  2231. mtx_destroy(&sc->sc_mtx);
  2232. if (sc->sc_current_count != 0) {
  2233. GJ_DEBUG(0, "Warning! Number of current requests %d.",
  2234. sc->sc_current_count);
  2235. }
  2236. gp->softc = NULL;
  2237. LIST_FOREACH(cp, &gp->consumer, consumer) {
  2238. if (cp->acr + cp->acw + cp->ace > 0)
  2239. g_access(cp, -1, -1, -1);
  2240. /*
2241. * We keep all consumers open for writing, so if we detached
2242. * and destroyed the consumer here, the provider would be offered
2243. * for tasting again and the journal would be restarted.
2244. * Sending an event here prevents this from happening.
  2245. */
  2246. g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
  2247. }
  2248. g_wither_geom(gp, ENXIO);
  2249. free(sc, M_JOURNAL);
  2250. return (0);
  2251. }
  2252. static void
  2253. g_journal_taste_orphan(struct g_consumer *cp)
  2254. {
  2255. KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
  2256. cp->provider->name));
  2257. }
  2258. static struct g_geom *
  2259. g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
  2260. {
  2261. struct g_journal_metadata md;
  2262. struct g_consumer *cp;
  2263. struct g_geom *gp;
  2264. int error;
  2265. g_topology_assert();
  2266. g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
  2267. GJ_DEBUG(2, "Tasting %s.", pp->name);
  2268. if (pp->geom->class == mp)
  2269. return (NULL);
  2270. gp = g_new_geomf(mp, "journal:taste");
2271. /* This orphan function should never be called. */
  2272. gp->orphan = g_journal_taste_orphan;
  2273. cp = g_new_consumer(gp);
  2274. cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
  2275. error = g_attach(cp, pp);
  2276. if (error == 0) {
  2277. error = g_journal_metadata_read(cp, &md);
  2278. g_detach(cp);
  2279. }
  2280. g_destroy_consumer(cp);
  2281. g_destroy_geom(gp);
  2282. if (error != 0)
  2283. return (NULL);
  2284. gp = NULL;
  2285. if (md.md_provider[0] != '\0' &&
  2286. !g_compare_names(md.md_provider, pp->name))
  2287. return (NULL);
  2288. if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
  2289. return (NULL);
  2290. if (g_journal_debug >= 2)
  2291. journal_metadata_dump(&md);
  2292. gp = g_journal_create(mp, pp, &md);
  2293. return (gp);
  2294. }
  2295. static struct g_journal_softc *
  2296. g_journal_find_device(struct g_class *mp, const char *name)
  2297. {
  2298. struct g_journal_softc *sc;
  2299. struct g_geom *gp;
  2300. struct g_provider *pp;
  2301. if (strncmp(name, _PATH_DEV, 5) == 0)
  2302. name += 5;
  2303. LIST_FOREACH(gp, &mp->geom, geom) {
  2304. sc = gp->softc;
  2305. if (sc == NULL)
  2306. continue;
  2307. if (sc->sc_flags & GJF_DEVICE_DESTROY)
  2308. continue;
  2309. if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
  2310. continue;
  2311. pp = LIST_FIRST(&gp->provider);
  2312. if (strcmp(sc->sc_name, name) == 0)
  2313. return (sc);
  2314. if (pp != NULL && strcmp(pp->name, name) == 0)
  2315. return (sc);
  2316. }
  2317. return (NULL);
  2318. }
  2319. static void
  2320. g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
  2321. {
  2322. struct g_journal_softc *sc;
  2323. const char *name;
  2324. char param[16];
  2325. int *nargs;
  2326. int error, i;
  2327. g_topology_assert();
  2328. nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
  2329. if (nargs == NULL) {
  2330. gctl_error(req, "No '%s' argument.", "nargs");
  2331. return;
  2332. }
  2333. if (*nargs <= 0) {
  2334. gctl_error(req, "Missing device(s).");
  2335. return;
  2336. }
  2337. for (i = 0; i < *nargs; i++) {
  2338. snprintf(param, sizeof(param), "arg%d", i);
  2339. name = gctl_get_asciiparam(req, param);
  2340. if (name == NULL) {
  2341. gctl_error(req, "No 'arg%d' argument.", i);
  2342. return;
  2343. }
  2344. sc = g_journal_find_device(mp, name);
  2345. if (sc == NULL) {
  2346. gctl_error(req, "No such device: %s.", name);
  2347. return;
  2348. }
  2349. error = g_journal_destroy(sc);
  2350. if (error != 0) {
  2351. gctl_error(req, "Cannot destroy device %s (error=%d).",
  2352. LIST_FIRST(&sc->sc_geom->provider)->name, error);
  2353. return;
  2354. }
  2355. }
  2356. }
  2357. static void
  2358. g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
  2359. {
  2360. g_topology_assert();
  2361. g_topology_unlock();
  2362. g_journal_sync_requested++;
  2363. wakeup(&g_journal_switcher_state);
  2364. while (g_journal_sync_requested > 0)
  2365. tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
  2366. g_topology_lock();
  2367. }
  2368. static void
  2369. g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
  2370. {
  2371. uint32_t *version;
  2372. g_topology_assert();
  2373. version = gctl_get_paraml(req, "version", sizeof(*version));
  2374. if (version == NULL) {
  2375. gctl_error(req, "No '%s' argument.", "version");
  2376. return;
  2377. }
  2378. if (*version != G_JOURNAL_VERSION) {
  2379. gctl_error(req, "Userland and kernel parts are out of sync.");
  2380. return;
  2381. }
  2382. if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
  2383. g_journal_ctl_destroy(req, mp);
  2384. return;
  2385. } else if (strcmp(verb, "sync") == 0) {
  2386. g_journal_ctl_sync(req, mp);
  2387. return;
  2388. }
  2389. gctl_error(req, "Unknown verb.");
  2390. }
  2391. static void
  2392. g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
  2393. struct g_consumer *cp, struct g_provider *pp)
  2394. {
  2395. struct g_journal_softc *sc;
  2396. g_topology_assert();
  2397. sc = gp->softc;
  2398. if (sc == NULL)
  2399. return;
  2400. if (pp != NULL) {
  2401. /* Nothing here. */
  2402. } else if (cp != NULL) {
  2403. int first = 1;
  2404. sbuf_printf(sb, "%s<Role>", indent);
  2405. if (cp == sc->sc_dconsumer) {
  2406. sbuf_cat(sb, "Data");
  2407. first = 0;
  2408. }
  2409. if (cp == sc->sc_jconsumer) {
  2410. if (!first)
  2411. sbuf_cat(sb, ",");
  2412. sbuf_cat(sb, "Journal");
  2413. }
  2414. sbuf_cat(sb, "</Role>\n");
  2415. if (cp == sc->sc_jconsumer) {
  2416. sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
  2417. (intmax_t)sc->sc_jstart);
  2418. sbuf_printf(sb, "<Jend>%jd</Jend>\n",
  2419. (intmax_t)sc->sc_jend);
  2420. }
  2421. } else {
  2422. sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
  2423. }
  2424. }
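/*
 * For illustration only (made-up offsets), the consumer section emitted above
 * for a provider that holds both data and journal might look like:
 *
 *   <Role>Data,Journal</Role>
 *   <Jstart>1048576</Jstart>
 *   <Jend>1073741824</Jend>
 *
 * The real values come from sc_jstart/sc_jend; indentation is omitted here.
 */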
  2425. static eventhandler_tag g_journal_event_shutdown = NULL;
  2426. static eventhandler_tag g_journal_event_lowmem = NULL;
  2427. static void
  2428. g_journal_shutdown_post_sync(void *arg, int howto)
  2429. {
  2430. struct g_class *mp;
  2431. struct g_geom *gp, *gp2;
  2432. if ((howto & RB_NOSYNC) != 0)
  2433. return;
  2434. mp = arg;
  2435. g_topology_lock();
  2436. LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
  2437. if (gp->softc == NULL)
  2438. continue;
  2439. GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
  2440. g_journal_destroy(gp->softc);
  2441. }
  2442. g_topology_unlock();
  2443. }
  2444. /*
2445. * Free cached requests from the inactive queue in case of low memory.
2446. * We free GJ_FREE_AT_ONCE elements at a time.
  2447. */
  2448. #define GJ_FREE_AT_ONCE 4
  2449. static void
  2450. g_journal_lowmem(void *arg, int howto __unused)
  2451. {
  2452. struct g_journal_softc *sc;
  2453. struct g_class *mp;
  2454. struct g_geom *gp;
  2455. struct bio *bp;
  2456. u_int nfree = GJ_FREE_AT_ONCE;
  2457. g_journal_stats_low_mem++;
  2458. mp = arg;
  2459. g_topology_lock();
  2460. LIST_FOREACH(gp, &mp->geom, geom) {
  2461. sc = gp->softc;
  2462. if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
  2463. continue;
  2464. mtx_lock(&sc->sc_mtx);
  2465. for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
  2466. nfree--, bp = bp->bio_next) {
  2467. /*
2468. * It is safe to free the bio_data here, because:
2469. * 1. If bio_data is NULL, it will be read from the
2470. * inactive journal.
2471. * 2. If bp is sent down, it is first removed from the
2472. * inactive queue, so it's impossible to free the
2473. * data from under an in-flight bio.
2474. * On the other hand, freeing elements from the active
2475. * queue is not safe.
  2476. */
  2477. if (bp->bio_data != NULL) {
  2478. GJ_DEBUG(2, "Freeing data from %s.",
  2479. sc->sc_name);
  2480. gj_free(bp->bio_data, bp->bio_length);
  2481. bp->bio_data = NULL;
  2482. }
  2483. }
  2484. mtx_unlock(&sc->sc_mtx);
  2485. if (nfree == 0)
  2486. break;
  2487. }
  2488. g_topology_unlock();
  2489. }
  2490. static void g_journal_switcher(void *arg);
  2491. static void
  2492. g_journal_init(struct g_class *mp)
  2493. {
2494. /* Pick a conservative value if the provided value is unusable. */
  2495. if (g_journal_cache_divisor <= 0 ||
  2496. (vm_kmem_size / g_journal_cache_divisor == 0)) {
  2497. g_journal_cache_divisor = 5;
  2498. }
  2499. if (g_journal_cache_limit > 0) {
  2500. g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
  2501. g_journal_cache_low =
  2502. (g_journal_cache_limit / 100) * g_journal_cache_switch;
  2503. }
  2504. g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
  2505. g_journal_shutdown_post_sync, mp, EVENTHANDLER_PRI_FIRST);
  2506. if (g_journal_event_shutdown == NULL)
  2507. GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
  2508. g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
  2509. g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
  2510. if (g_journal_event_lowmem == NULL)
  2511. GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
  2512. }
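/*
 * Worked example of the cache sizing above (illustrative numbers): with
 * vm_kmem_size of 1 GB and a divisor of 5 (the fallback chosen above when the
 * tunable is unusable), g_journal_cache_limit becomes 1073741824 / 5 =
 * 214748364 bytes (~205 MB).  If g_journal_cache_switch is set to 90,
 * g_journal_cache_low is (214748364 / 100) * 90 = 193273470 bytes, the cache
 * fill level associated with forcing a journal switch.
 */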
  2513. static void
  2514. g_journal_fini(struct g_class *mp)
  2515. {
  2516. if (g_journal_event_shutdown != NULL) {
  2517. EVENTHANDLER_DEREGISTER(shutdown_post_sync,
  2518. g_journal_event_shutdown);
  2519. }
  2520. if (g_journal_event_lowmem != NULL)
  2521. EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
  2522. if (g_journal_switcher_proc != NULL)
  2523. g_journal_stop_switcher();
  2524. }
  2525. DECLARE_GEOM_CLASS(g_journal_class, g_journal);
  2526. static const struct g_journal_desc *
  2527. g_journal_find_desc(const char *fstype)
  2528. {
  2529. const struct g_journal_desc *desc;
  2530. int i;
  2531. for (desc = g_journal_filesystems[i = 0]; desc != NULL;
  2532. desc = g_journal_filesystems[++i]) {
  2533. if (strcmp(desc->jd_fstype, fstype) == 0)
  2534. break;
  2535. }
  2536. return (desc);
  2537. }
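/*
 * Usage note (illustrative): g_journal_do_switch() below calls this with
 * mp->mnt_stat.f_fstypename, e.g. g_journal_find_desc("ufs").  A NULL result
 * means the mounted file system has no gjournal support and is skipped;
 * otherwise the descriptor's jd_clean() callback is invoked once the file
 * system has been synced and suspended.
 */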
  2538. static void
  2539. g_journal_switch_wait(struct g_journal_softc *sc)
  2540. {
  2541. struct bintime bt;
  2542. mtx_assert(&sc->sc_mtx, MA_OWNED);
  2543. if (g_journal_debug >= 2) {
  2544. if (sc->sc_flush_in_progress > 0) {
  2545. GJ_DEBUG(2, "%d requests flushing.",
  2546. sc->sc_flush_in_progress);
  2547. }
  2548. if (sc->sc_copy_in_progress > 0) {
  2549. GJ_DEBUG(2, "%d requests copying.",
  2550. sc->sc_copy_in_progress);
  2551. }
  2552. if (sc->sc_flush_count > 0) {
  2553. GJ_DEBUG(2, "%d requests to flush.",
  2554. sc->sc_flush_count);
  2555. }
  2556. if (sc->sc_delayed_count > 0) {
  2557. GJ_DEBUG(2, "%d requests delayed.",
  2558. sc->sc_delayed_count);
  2559. }
  2560. }
  2561. g_journal_stats_switches++;
  2562. if (sc->sc_copy_in_progress > 0)
  2563. g_journal_stats_wait_for_copy++;
  2564. GJ_TIMER_START(1, &bt);
  2565. sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
  2566. sc->sc_flags |= GJF_DEVICE_SWITCH;
  2567. wakeup(sc);
  2568. while (sc->sc_flags & GJF_DEVICE_SWITCH) {
  2569. msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
  2570. "gj:switch", 0);
  2571. }
  2572. GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
  2573. }
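/*
 * Handshake sketch: g_journal_switch_wait() sets GJF_DEVICE_SWITCH and wakes
 * up the per-device worker; the worker notices the flag, performs
 * g_journal_switch() (which clears the flag) and then calls
 * wakeup(&sc->sc_journal_copying), releasing the msleep() above.
 */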
  2574. static void
  2575. g_journal_do_switch(struct g_class *classp)
  2576. {
  2577. struct g_journal_softc *sc;
  2578. const struct g_journal_desc *desc;
  2579. struct g_geom *gp;
  2580. struct mount *mp;
  2581. struct bintime bt;
  2582. char *mountpoint;
  2583. int error, save;
  2584. g_topology_lock();
  2585. LIST_FOREACH(gp, &classp->geom, geom) {
  2586. sc = gp->softc;
  2587. if (sc == NULL)
  2588. continue;
  2589. if (sc->sc_flags & GJF_DEVICE_DESTROY)
  2590. continue;
  2591. if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
  2592. continue;
  2593. mtx_lock(&sc->sc_mtx);
  2594. sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
  2595. mtx_unlock(&sc->sc_mtx);
  2596. }
  2597. g_topology_unlock();
  2598. mtx_lock(&mountlist_mtx);
  2599. TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  2600. if (mp->mnt_gjprovider == NULL)
  2601. continue;
  2602. if (mp->mnt_flag & MNT_RDONLY)
  2603. continue;
  2604. desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
  2605. if (desc == NULL)
  2606. continue;
  2607. if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
  2608. continue;
  2609. /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
  2610. g_topology_lock();
  2611. sc = g_journal_find_device(classp, mp->mnt_gjprovider);
  2612. g_topology_unlock();
  2613. if (sc == NULL) {
  2614. GJ_DEBUG(0, "Cannot find journal geom for %s.",
  2615. mp->mnt_gjprovider);
  2616. goto next;
  2617. } else if (JEMPTY(sc)) {
  2618. mtx_lock(&sc->sc_mtx);
  2619. sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
  2620. mtx_unlock(&sc->sc_mtx);
  2621. GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
  2622. goto next;
  2623. }
  2624. mountpoint = mp->mnt_stat.f_mntonname;
  2625. error = vn_start_write(NULL, &mp, V_WAIT);
  2626. if (error != 0) {
  2627. GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
  2628. mountpoint, error);
  2629. goto next;
  2630. }
  2631. save = curthread_pflags_set(TDP_SYNCIO);
  2632. GJ_TIMER_START(1, &bt);
  2633. vfs_periodic(mp, MNT_NOWAIT);
  2634. GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
  2635. GJ_TIMER_START(1, &bt);
  2636. error = VFS_SYNC(mp, MNT_NOWAIT);
  2637. if (error == 0)
  2638. GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
  2639. else {
  2640. GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
  2641. mountpoint, error);
  2642. }
  2643. curthread_pflags_restore(save);
  2644. vn_finished_write(mp);
  2645. if (error != 0)
  2646. goto next;
  2647. /*
  2648. * Send BIO_FLUSH before freezing the file system, so it can be
  2649. * faster after the freeze.
  2650. */
  2651. GJ_TIMER_START(1, &bt);
  2652. g_journal_flush_cache(sc);
  2653. GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
  2654. GJ_TIMER_START(1, &bt);
  2655. error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
  2656. GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
  2657. if (error != 0) {
  2658. GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
  2659. mountpoint, error);
  2660. goto next;
  2661. }
  2662. error = desc->jd_clean(mp);
  2663. if (error != 0)
  2664. goto next;
  2665. mtx_lock(&sc->sc_mtx);
  2666. g_journal_switch_wait(sc);
  2667. mtx_unlock(&sc->sc_mtx);
  2668. vfs_write_resume(mp, 0);
  2669. next:
  2670. mtx_lock(&mountlist_mtx);
  2671. vfs_unbusy(mp);
  2672. }
  2673. mtx_unlock(&mountlist_mtx);
  2674. sc = NULL;
  2675. for (;;) {
  2676. g_topology_lock();
  2677. LIST_FOREACH(gp, &g_journal_class.geom, geom) {
  2678. sc = gp->softc;
  2679. if (sc == NULL)
  2680. continue;
  2681. mtx_lock(&sc->sc_mtx);
  2682. if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
  2683. !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
  2684. (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
  2685. break;
  2686. }
  2687. mtx_unlock(&sc->sc_mtx);
  2688. sc = NULL;
  2689. }
  2690. g_topology_unlock();
  2691. if (sc == NULL)
  2692. break;
  2693. mtx_assert(&sc->sc_mtx, MA_OWNED);
  2694. g_journal_switch_wait(sc);
  2695. mtx_unlock(&sc->sc_mtx);
  2696. }
  2697. }
  2698. static void
  2699. g_journal_start_switcher(struct g_class *mp)
  2700. {
  2701. int error __diagused;
  2702. g_topology_assert();
  2703. MPASS(g_journal_switcher_proc == NULL);
  2704. g_journal_switcher_state = GJ_SWITCHER_WORKING;
  2705. error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc,
  2706. 0, 0, "g_journal switcher");
  2707. KASSERT(error == 0, ("Cannot create switcher thread."));
  2708. }
  2709. static void
  2710. g_journal_stop_switcher(void)
  2711. {
  2712. g_topology_assert();
  2713. MPASS(g_journal_switcher_proc != NULL);
  2714. g_journal_switcher_state = GJ_SWITCHER_DIE;
  2715. wakeup(&g_journal_switcher_state);
  2716. while (g_journal_switcher_state != GJ_SWITCHER_DIED)
  2717. tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
  2718. GJ_DEBUG(1, "Switcher died.");
  2719. g_journal_switcher_proc = NULL;
  2720. }
  2721. /*
  2722. * TODO: Kill switcher thread on last geom destruction?
  2723. */
  2724. static void
  2725. g_journal_switcher(void *arg)
  2726. {
  2727. struct g_class *mp;
  2728. struct bintime bt;
  2729. int error;
  2730. mp = arg;
  2731. curthread->td_pflags |= TDP_NORUNNINGBUF;
  2732. for (;;) {
  2733. g_journal_switcher_wokenup = 0;
  2734. error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
  2735. g_journal_switch_time * hz);
  2736. if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
  2737. g_journal_switcher_state = GJ_SWITCHER_DIED;
  2738. GJ_DEBUG(1, "Switcher exiting.");
  2739. wakeup(&g_journal_switcher_state);
  2740. kproc_exit(0);
  2741. }
  2742. if (error == 0 && g_journal_sync_requested == 0) {
  2743. GJ_DEBUG(1, "Out of cache, force switch (used=%jd "
  2744. "limit=%jd).", (intmax_t)g_journal_cache_used,
  2745. (intmax_t)g_journal_cache_limit);
  2746. }
  2747. GJ_TIMER_START(1, &bt);
  2748. g_journal_do_switch(mp);
  2749. GJ_TIMER_STOP(1, &bt, "Entire switch time");
  2750. if (g_journal_sync_requested > 0) {
  2751. g_journal_sync_requested = 0;
  2752. wakeup(&g_journal_sync_requested);
  2753. }
  2754. }
  2755. }