/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2000-2007 Silicon Graphics, Inc. All Rights Reserved.
 */
#include <linux/types.h>
#include <asm/sn/sn_sal.h>
#include "ioerror.h"
#include <asm/sn/addrs.h>
#include <asm/sn/shubio.h>
#include <asm/sn/geo.h>
#include "xtalk/xwidgetdev.h"
#include "xtalk/hubdev.h"
#include <asm/sn/bte.h>
#include <asm/param.h>

/*
 * Bte error handling is done in two parts. The first captures
 * any crb related errors. Since there can be multiple crbs per
 * interface and multiple interfaces active, we need to wait until
 * all active crbs are completed. This is the first job of the
 * second part error handler. When all bte related CRBs are cleanly
 * completed, it resets the interfaces and gets them ready for new
 * transfers to be queued.
 */

void bte_error_handler(unsigned long);

/*
 * Wait until all BTE related CRBs are completed
 * and then reset the interfaces.
 */
int shub1_bte_error_handler(unsigned long _nodepda)
{
	/*
	 * Returns 1 when recovery could not run now (nothing to do, or CRBs
	 * still active — a 5 second retry timer is armed in the latter case),
	 * 0 when both interfaces were reset and are ready for new transfers.
	 */
	struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
	struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
	nasid_t nasid;
	int i;
	int valid_crbs;
	ii_imem_u_t imem;	/* II IMEM Register */
	ii_icrb0_d_u_t icrbd;	/* II CRB Register D */
	ii_ibcr_u_t ibcr;
	ii_icmr_u_t icmr;
	ii_ieclr_u_t ieclr;

	BTE_PRINTK(("shub1_bte_error_handler(%p) - %d\n", err_nodepda,
		    smp_processor_id()));

	/* Nothing to recover if neither interface has a recorded error. */
	if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
	    (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
		BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
			    smp_processor_id()));
		return 1;
	}

	/* Determine information about our hub */
	nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);

	/*
	 * A BTE transfer can use multiple CRBs. We need to make sure
	 * that all the BTE CRBs are complete (or timed out) before
	 * attempting to clean up the error. Resetting the BTE while
	 * there are still BTE CRBs active will hang the BTE.
	 * We should look at all the CRBs to see if they are allocated
	 * to the BTE and see if they are still active. When none
	 * are active, we can continue with the cleanup.
	 *
	 * We also want to make sure that the local NI port is up.
	 * When a router resets the NI port can go down, while it
	 * goes through the LLP handshake, but then comes back up.
	 */
	icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
	if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
		/*
		 * There are errors which still need to be cleaned up by
		 * hubiio_crb_error_handler; re-arm the timer and retry
		 * in 5 seconds.
		 */
		mod_timer(recovery_timer, jiffies + (HZ * 5));
		BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
			    smp_processor_id()));
		return 1;
	}
	if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {

		valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;

		/* Scan every valid CRB; bail out if any BTE-op CRB is live. */
		for (i = 0; i < IIO_NUM_CRBS; i++) {
			if (!((1 << i) & valid_crbs)) {
				/* This crb was not marked as valid, ignore */
				continue;
			}
			icrbd.ii_icrb0_d_regval =
			    REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
			if (icrbd.d_bteop) {
				/*
				 * A BTE-owned CRB is still active; defer the
				 * reset and retry in 5 seconds.
				 */
				mod_timer(recovery_timer, jiffies + (HZ * 5));
				BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
					    err_nodepda, smp_processor_id(),
					    i));
				return 1;
			}
		}
	}

	BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
	/* Re-enable both bte interfaces */
	imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
	imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
	REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);

	/* Clear BTE0/1 error bits, but only for interfaces that failed. */
	ieclr.ii_ieclr_regval = 0;
	if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
		ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
	if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
		ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
	REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);

	/* Reinitialize both BTE state machines. */
	ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
	ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
	REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);

	/* Recovery succeeded; no retry needed, so cancel the timer. */
	del_timer(recovery_timer);

	return 0;
}
/*
 * Wait until all BTE related CRBs are completed
 * and then reset the interfaces.
 */
  119. int shub2_bte_error_handler(unsigned long _nodepda)
  120. {
  121. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  122. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  123. struct bteinfo_s *bte;
  124. nasid_t nasid;
  125. u64 status;
  126. int i;
  127. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  128. /*
  129. * Verify that all the BTEs are complete
  130. */
  131. for (i = 0; i < BTES_PER_NODE; i++) {
  132. bte = &err_nodepda->bte_if[i];
  133. status = BTE_LNSTAT_LOAD(bte);
  134. if (status & IBLS_ERROR) {
  135. bte->bh_error = BTE_SHUB2_ERROR(status);
  136. continue;
  137. }
  138. if (!(status & IBLS_BUSY))
  139. continue;
  140. mod_timer(recovery_timer, jiffies + (HZ * 5));
  141. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  142. smp_processor_id()));
  143. return 1;
  144. }
  145. if (ia64_sn_bte_recovery(nasid))
  146. panic("bte_error_handler(): Fatal BTE Error");
  147. del_timer(recovery_timer);
  148. return 0;
  149. }
/*
 * Wait until all BTE related CRBs are completed
 * and then reset the interfaces.
 */
  154. void bte_error_handler(unsigned long _nodepda)
  155. {
  156. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  157. spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
  158. int i;
  159. unsigned long irq_flags;
  160. volatile u64 *notify;
  161. bte_result_t bh_error;
  162. BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
  163. smp_processor_id()));
  164. spin_lock_irqsave(recovery_lock, irq_flags);
  165. /*
  166. * Lock all interfaces on this node to prevent new transfers
  167. * from being queued.
  168. */
  169. for (i = 0; i < BTES_PER_NODE; i++) {
  170. if (err_nodepda->bte_if[i].cleanup_active) {
  171. continue;
  172. }
  173. spin_lock(&err_nodepda->bte_if[i].spinlock);
  174. BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
  175. smp_processor_id(), i));
  176. err_nodepda->bte_if[i].cleanup_active = 1;
  177. }
  178. if (is_shub1()) {
  179. if (shub1_bte_error_handler(_nodepda)) {
  180. spin_unlock_irqrestore(recovery_lock, irq_flags);
  181. return;
  182. }
  183. } else {
  184. if (shub2_bte_error_handler(_nodepda)) {
  185. spin_unlock_irqrestore(recovery_lock, irq_flags);
  186. return;
  187. }
  188. }
  189. for (i = 0; i < BTES_PER_NODE; i++) {
  190. bh_error = err_nodepda->bte_if[i].bh_error;
  191. if (bh_error != BTE_SUCCESS) {
  192. /* There is an error which needs to be notified */
  193. notify = err_nodepda->bte_if[i].most_rcnt_na;
  194. BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
  195. err_nodepda->bte_if[i].bte_cnode,
  196. err_nodepda->bte_if[i].bte_num,
  197. IBLS_ERROR | (u64) bh_error));
  198. *notify = IBLS_ERROR | bh_error;
  199. err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
  200. }
  201. err_nodepda->bte_if[i].cleanup_active = 0;
  202. BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
  203. smp_processor_id(), i));
  204. spin_unlock(&err_nodepda->bte_if[i].spinlock);
  205. }
  206. spin_unlock_irqrestore(recovery_lock, irq_flags);
  207. }
/*
 * First part error handler. This is called whenever any error CRB interrupt
 * is generated by the II.
 */
  212. void
  213. bte_crb_error_handler(cnodeid_t cnode, int btenum,
  214. int crbnum, ioerror_t * ioe, int bteop)
  215. {
  216. struct bteinfo_s *bte;
  217. bte = &(NODEPDA(cnode)->bte_if[btenum]);
  218. /*
  219. * The caller has already figured out the error type, we save that
  220. * in the bte handle structure for the thread exercising the
  221. * interface to consume.
  222. */
  223. bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
  224. bte->bte_error_count++;
  225. BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
  226. bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
  227. bte_error_handler((unsigned long) NODEPDA(cnode));
  228. }