vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go

   1 // Copyright ©2016 The Gonum Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package gonum
   6
   7 import (
   8         "math"
   9
  10         "gonum.org/v1/gonum/blas"
  11         "gonum.org/v1/gonum/blas/blas64"
  12 )
  13
  14 // Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
  15 // block of a Hessenberg matrix.
  16 //
  17 // wantt and wantz determine whether the quasi-triangular Schur factor and the
  18 // orthogonal Schur factor, respectively, will be computed.
  19 //
  20 // kacc22 specifies the computation mode of far-from-diagonal orthogonal
  21 // updates. Permitted values are:
  22 //  0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
  23 //     multiply to update far-from-diagonal matrix entries.
  24 //  1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
  25 //     update far-from-diagonal matrix entries.
  26 //  2: Dlaqr5 will accumulate reflections, use matrix-matrix multiply to update
  27 //     far-from-diagonal matrix entries, and take advantage of 2×2 block
  28 //     structure during matrix multiplies.
  29 // For other values of kacc2 Dlaqr5 will panic.
  30 //
  31 // n is the order of the Hessenberg matrix H.
  32 //
  33 // ktop and kbot are indices of the first and last row and column of an isolated
  34 // diagonal block upon which the QR sweep will be applied. It must hold that
  35 //  ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
  36 //  kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
  37 // otherwise Dlaqr5 will panic.
  38 //
  39 // nshfts is the number of simultaneous shifts. It must be positive and even,
  40 // otherwise Dlaqr5 will panic.
  41 //
  42 // sr and si contain the real and imaginary parts, respectively, of the shifts
  43 // of origin that define the multi-shift QR sweep. On return both slices may be
  44 // reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
  45 // will panic.
  46 //
  47 // h and ldh represent the Hessenberg matrix H of size n×n. On return
  48 // multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
  49 // diagonal block in rows and columns ktop through kbot, inclusive.
  50 //
  51 // iloz and ihiz specify the rows of Z to which transformations will be applied
  52 // if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
  53 // will panic.
  54 //
  55 // z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
  56 // orthogonal similarity transformation is accumulated into
  57 // z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced.
  58 //
  59 // v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
  60 // is transposed with respect to the reference netlib implementation.
  61 //
  62 // u and ldu represent an auxiliary matrix of size (3*nshfts-3)×(3*nshfts-3).
  63 //
  64 // wh and ldwh represent an auxiliary matrix of size (3*nshfts-3)×nh.
  65 //
  66 // wv and ldwv represent an auxiliary matrix of size nv×(3*nshfts-3).
  67 //
  68 // Dlaqr5 is an internal routine. It is exported for testing purposes.
  69 func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
  70         checkMatrix(n, n, h, ldh)
  71         if ktop < 0 || n <= ktop {
  72                 panic("lapack: invalid value of ktop")
  73         }
  74         if ktop > 0 && h[ktop*ldh+ktop-1] != 0 {
  75                 panic("lapack: diagonal block is not isolated")
  76         }
  77         if kbot < 0 || n <= kbot {
  78                 panic("lapack: invalid value of kbot")
  79         }
  80         if kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0 {
  81                 panic("lapack: diagonal block is not isolated")
  82         }
  83         if nshfts < 0 || nshfts&0x1 != 0 {
  84                 panic("lapack: invalid number of shifts")
  85         }
  86         if len(sr) != nshfts || len(si) != nshfts {
  87                 panic(badSlice) // TODO(vladimir-ch) Another message?
  88         }
  89         if wantz {
  90                 if ihiz >= n {
  91                         panic("lapack: invalid value of ihiz")
  92                 }
  93                 if iloz < 0 || ihiz < iloz {
  94                         panic("lapack: invalid value of iloz")
  95                 }
  96                 checkMatrix(n, n, z, ldz)
  97         }
  98         checkMatrix(nshfts/2, 3, v, ldv) // Transposed w.r.t. lapack.
  99         checkMatrix(3*nshfts-3, 3*nshfts-3, u, ldu)
 100         checkMatrix(nv, 3*nshfts-3, wv, ldwv)
 101         checkMatrix(3*nshfts-3, nh, wh, ldwh)
 102         if kacc22 != 0 && kacc22 != 1 && kacc22 != 2 {
 103                 panic("lapack: invalid value of kacc22")
 104         }
 105
 106         // If there are no shifts, then there is nothing to do.
 107         if nshfts < 2 {
 108                 return
 109         }
 110         // If the active block is empty or 1×1, then there is nothing to do.
 111         if ktop >= kbot {
 112                 return
 113         }
 114
 115         // Shuffle shifts into pairs of real shifts and pairs of complex
 116         // conjugate shifts assuming complex conjugate shifts are already
 117         // adjacent to one another.
 118         for i := 0; i < nshfts-2; i += 2 {
 119                 if si[i] == -si[i+1] {
 120                         continue
 121                 }
 122                 sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
 123                 si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
 124         }
 125
 126         // Note: lapack says that nshfts must be even but allows it to be odd
 127         // anyway. We panic above if nshfts is not even, so reducing it by one
 128         // is unnecessary. The only caller Dlaqr04 uses only even nshfts.
 129         //
 130         // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
 131         // *     ==== NSHFTS is supposed to be even, but if it is odd,
 132         // *     .    then simply reduce it by one.  The shuffle above
 133         // *     .    ensures that the dropped shift is real and that
 134         // *     .    the remaining shifts are paired. ====
 135         // *
 136         //      NS = NSHFTS - MOD( NSHFTS, 2 )
 137         ns := nshfts
 138
 139         safmin := dlamchS
 140         ulp := dlamchP
 141         smlnum := safmin * float64(n) / ulp
 142
 143         // Use accumulated reflections to update far-from-diagonal entries?
 144         accum := kacc22 == 1 || kacc22 == 2
 145         // If so, exploit the 2×2 block structure?
 146         blk22 := ns > 2 && kacc22 == 2
 147
 148         // Clear trash.
 149         if ktop+2 <= kbot {
 150                 h[(ktop+2)*ldh+ktop] = 0
 151         }
 152
 153         // nbmps = number of 2-shift bulges in the chain.
 154         nbmps := ns / 2
 155
 156         // kdu = width of slab.
 157         kdu := 6*nbmps - 3
 158
 159         // Create and chase chains of nbmps bulges.
 160         for incol := 3*(1-nbmps) + ktop - 1; incol <= kbot-2; incol += 3*nbmps - 2 {
 161                 ndcol := incol + kdu
 162                 if accum {
 163                         impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
 164                 }
 165
 166                 // Near-the-diagonal bulge chase. The following loop performs
 167                 // the near-the-diagonal part of a small bulge multi-shift QR
 168                 // sweep. Each 6*nbmps-2 column diagonal chunk extends from
 169                 // column incol to column ndcol (including both column incol and
 170                 // column ndcol). The following loop chases a 3*nbmps column
 171                 // long chain of nbmps bulges 3*nbmps-2 columns to the right.
 172                 // (incol may be less than ktop and ndcol may be greater than
 173                 // kbot indicating phantom columns from which to chase bulges
 174                 // before they are actually introduced or to which to chase
 175                 // bulges beyond column kbot.)
 176                 for krcol := incol; krcol <= min(incol+3*nbmps-3, kbot-2); krcol++ {
 177                         // Bulges number mtop to mbot are active double implicit
 178                         // shift bulges. There may or may not also be small 2×2
 179                         // bulge, if there is room. The inactive bulges (if any)
 180                         // must wait until the active bulges have moved down the
 181                         // diagonal to make room. The phantom matrix paradigm
 182                         // described above helps keep track.
 183
 184                         mtop := max(0, ((ktop-1)-krcol+2)/3)
 185                         mbot := min(nbmps, (kbot-krcol)/3) - 1
 186                         m22 := mbot + 1
 187                         bmp22 := (mbot < nbmps-1) && (krcol+3*m22 == kbot-2)
 188
 189                         // Generate reflections to chase the chain right one
 190                         // column. (The minimum value of k is ktop-1.)
 191                         for m := mtop; m <= mbot; m++ {
 192                                 k := krcol + 3*m
 193                                 if k == ktop-1 {
 194                                         impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
 195                                                 sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
 196                                                 v[m*ldv:m*ldv+3])
 197                                         alpha := v[m*ldv]
 198                                         _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
 199                                         continue
 200                                 }
 201                                 beta := h[(k+1)*ldh+k]
 202                                 v[m*ldv+1] = h[(k+2)*ldh+k]
 203                                 v[m*ldv+2] = h[(k+3)*ldh+k]
 204                                 beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)
 205
 206                                 // A bulge may collapse because of vigilant deflation or
 207                                 // destructive underflow. In the underflow case, try the
 208                                 // two-small-subdiagonals trick to try to reinflate the
 209                                 // bulge.
 210                                 if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
 211                                         // Typical case: not collapsed (yet).
 212                                         h[(k+1)*ldh+k] = beta
 213                                         h[(k+2)*ldh+k] = 0
 214                                         h[(k+3)*ldh+k] = 0
 215                                         continue
 216                                 }
 217
 218                                 // Atypical case: collapsed. Attempt to reintroduce
 219                                 // ignoring H[k+1,k] and H[k+2,k]. If the fill
 220                                 // resulting from the new reflector is too large,
 221                                 // then abandon it. Otherwise, use the new one.
 222                                 var vt [3]float64
 223                                 impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, sr[2*m],
 224                                         si[2*m], sr[2*m+1], si[2*m+1], vt[:])
 225                                 alpha := vt[0]
 226                                 _, vt[0] = impl.Dlarfg(3, alpha, vt[1:3], 1)
 227                                 refsum := vt[0] * (h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k])
 228
 229                                 dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
 230                                 if math.Abs(h[(k+2)*ldh+k]-refsum*vt[1])+math.Abs(refsum*vt[2]) > ulp*dsum {
 231                                         // Starting a new bulge here would create
 232                                         // non-negligible fill. Use the old one with
 233                                         // trepidation.
 234                                         h[(k+1)*ldh+k] = beta
 235                                         h[(k+2)*ldh+k] = 0
 236                                         h[(k+3)*ldh+k] = 0
 237                                         continue
 238                                 } else {
 239                                         // Starting a new bulge here would create
 240                                         // only negligible fill. Replace the old
 241                                         // reflector with the new one.
 242                                         h[(k+1)*ldh+k] -= refsum
 243                                         h[(k+2)*ldh+k] = 0
 244                                         h[(k+3)*ldh+k] = 0
 245                                         v[m*ldv] = vt[0]
 246                                         v[m*ldv+1] = vt[1]
 247                                         v[m*ldv+2] = vt[2]
 248                                 }
 249                         }
 250
 251                         // Generate a 2×2 reflection, if needed.
 252                         if bmp22 {
 253                                 k := krcol + 3*m22
 254                                 if k == ktop-1 {
 255                                         impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
 256                                                 sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
 257                                                 v[m22*ldv:m22*ldv+2])
 258                                         beta := v[m22*ldv]
 259                                         _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
 260                                 } else {
 261                                         beta := h[(k+1)*ldh+k]
 262                                         v[m22*ldv+1] = h[(k+2)*ldh+k]
 263                                         beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
 264                                         h[(k+1)*ldh+k] = beta
 265                                         h[(k+2)*ldh+k] = 0
 266                                 }
 267                         }
 268
 269                         // Multiply H by reflections from the left.
 270                         var jbot int
 271                         switch {
 272                         case accum:
 273                                 jbot = min(ndcol, kbot)
 274                         case wantt:
 275                                 jbot = n - 1
 276                         default:
 277                                 jbot = kbot
 278                         }
 279                         for j := max(ktop, krcol); j <= jbot; j++ {
 280                                 mend := min(mbot+1, (j-krcol+2)/3) - 1
 281                                 for m := mtop; m <= mend; m++ {
 282                                         k := krcol + 3*m
 283                                         refsum := v[m*ldv] * (h[(k+1)*ldh+j] +
 284                                                 v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j])
 285                                         h[(k+1)*ldh+j] -= refsum
 286                                         h[(k+2)*ldh+j] -= refsum * v[m*ldv+1]
 287                                         h[(k+3)*ldh+j] -= refsum * v[m*ldv+2]
 288                                 }
 289                         }
 290                         if bmp22 {
 291                                 k := krcol + 3*m22
 292                                 for j := max(k+1, ktop); j <= jbot; j++ {
 293                                         refsum := v[m22*ldv] * (h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j])
 294                                         h[(k+1)*ldh+j] -= refsum
 295                                         h[(k+2)*ldh+j] -= refsum * v[m22*ldv+1]
 296                                 }
 297                         }
 298
 299                         // Multiply H by reflections from the right. Delay filling in the last row
 300                         // until the vigilant deflation check is complete.
 301                         var jtop int
 302                         switch {
 303                         case accum:
 304                                 jtop = max(ktop, incol)
 305                         case wantt:
 306                                 jtop = 0
 307                         default:
 308                                 jtop = ktop
 309                         }
 310                         for m := mtop; m <= mbot; m++ {
 311                                 if v[m*ldv] == 0 {
 312                                         continue
 313                                 }
 314                                 k := krcol + 3*m
 315                                 for j := jtop; j <= min(kbot, k+3); j++ {
 316                                         refsum := v[m*ldv] * (h[j*ldh+k+1] +
 317                                                 v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3])
 318                                         h[j*ldh+k+1] -= refsum
 319                                         h[j*ldh+k+2] -= refsum * v[m*ldv+1]
 320                                         h[j*ldh+k+3] -= refsum * v[m*ldv+2]
 321                                 }
 322                                 if accum {
 323                                         // Accumulate U. (If necessary, update Z later with with an
 324                                         // efficient matrix-matrix multiply.)
 325                                         kms := k - incol
 326                                         for j := max(0, ktop-incol-1); j < kdu; j++ {
 327                                                 refsum := v[m*ldv] * (u[j*ldu+kms] +
 328                                                         v[m*ldv+1]*u[j*ldu+kms+1] + v[m*ldv+2]*u[j*ldu+kms+2])
 329                                                 u[j*ldu+kms] -= refsum
 330                                                 u[j*ldu+kms+1] -= refsum * v[m*ldv+1]
 331                                                 u[j*ldu+kms+2] -= refsum * v[m*ldv+2]
 332                                         }
 333                                 } else if wantz {
 334                                         // U is not accumulated, so update Z now by multiplying by
 335                                         // reflections from the right.
 336                                         for j := iloz; j <= ihiz; j++ {
 337                                                 refsum := v[m*ldv] * (z[j*ldz+k+1] +
 338                                                         v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3])
 339                                                 z[j*ldz+k+1] -= refsum
 340                                                 z[j*ldz+k+2] -= refsum * v[m*ldv+1]
 341                                                 z[j*ldz+k+3] -= refsum * v[m*ldv+2]
 342                                         }
 343                                 }
 344                         }
 345
 346                         // Special case: 2×2 reflection (if needed).
 347                         if bmp22 && v[m22*ldv] != 0 {
 348                                 k := krcol + 3*m22
 349                                 for j := jtop; j <= min(kbot, k+3); j++ {
 350                                         refsum := v[m22*ldv] * (h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2])
 351                                         h[j*ldh+k+1] -= refsum
 352                                         h[j*ldh+k+2] -= refsum * v[m22*ldv+1]
 353                                 }
 354                                 if accum {
 355                                         kms := k - incol
 356                                         for j := max(0, ktop-incol-1); j < kdu; j++ {
 357                                                 refsum := v[m22*ldv] * (u[j*ldu+kms] + v[m22*ldv+1]*u[j*ldu+kms+1])
 358                                                 u[j*ldu+kms] -= refsum
 359                                                 u[j*ldu+kms+1] -= refsum * v[m22*ldv+1]
 360                                         }
 361                                 } else if wantz {
 362                                         for j := iloz; j <= ihiz; j++ {
 363                                                 refsum := v[m22*ldv] * (z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2])
 364                                                 z[j*ldz+k+1] -= refsum
 365                                                 z[j*ldz+k+2] -= refsum * v[m22*ldv+1]
 366                                         }
 367                                 }
 368                         }
 369
 370                         // Vigilant deflation check.
 371                         mstart := mtop
 372                         if krcol+3*mstart < ktop {
 373                                 mstart++
 374                         }
 375                         mend := mbot
 376                         if bmp22 {
 377                                 mend++
 378                         }
 379                         if krcol == kbot-2 {
 380                                 mend++
 381                         }
 382                         for m := mstart; m <= mend; m++ {
 383                                 k := min(kbot-1, krcol+3*m)
 384
 385                                 // The following convergence test requires that the tradition
 386                                 // small-compared-to-nearby-diagonals criterion and the Ahues &
 387                                 // Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
 388                                 // improves accuracy in some examples. Falling back on an alternate
 389                                 // convergence criterion when tst1 or tst2 is zero (as done here) is
 390                                 // traditional but probably unnecessary.
 391
 392                                 if h[(k+1)*ldh+k] == 0 {
 393                                         continue
 394                                 }
 395                                 tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
 396                                 if tst1 == 0 {
 397                                         if k >= ktop+1 {
 398                                                 tst1 += math.Abs(h[k*ldh+k-1])
 399                                         }
 400                                         if k >= ktop+2 {
 401                                                 tst1 += math.Abs(h[k*ldh+k-2])
 402                                         }
 403                                         if k >= ktop+3 {
 404                                                 tst1 += math.Abs(h[k*ldh+k-3])
 405                                         }
 406                                         if k <= kbot-2 {
 407                                                 tst1 += math.Abs(h[(k+2)*ldh+k+1])
 408                                         }
 409                                         if k <= kbot-3 {
 410                                                 tst1 += math.Abs(h[(k+3)*ldh+k+1])
 411                                         }
 412                                         if k <= kbot-4 {
 413                                                 tst1 += math.Abs(h[(k+4)*ldh+k+1])
 414                                         }
 415                                 }
 416                                 if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
 417                                         h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
 418                                         h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
 419                                         h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
 420                                         h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
 421                                         scl := h11 + h12
 422                                         tst2 := h22 * (h11 / scl)
 423                                         if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
 424                                                 h[(k+1)*ldh+k] = 0
 425                                         }
 426                                 }
 427                         }
 428
 429                         // Fill in the last row of each bulge.
 430                         mend = min(nbmps, (kbot-krcol-1)/3) - 1
 431                         for m := mtop; m <= mend; m++ {
 432                                 k := krcol + 3*m
 433                                 refsum := v[m*ldv] * v[m*ldv+2] * h[(k+4)*ldh+k+3]
 434                                 h[(k+4)*ldh+k+1] = -refsum
 435                                 h[(k+4)*ldh+k+2] = -refsum * v[m*ldv+1]
 436                                 h[(k+4)*ldh+k+3] -= refsum * v[m*ldv+2]
 437                         }
 438                 }
 439
 440                 // Use U (if accumulated) to update far-from-diagonal entries in H.
 441                 // If required, use U to update Z as well.
 442                 if !accum {
 443                         continue
 444                 }
 445                 var jtop, jbot int
 446                 if wantt {
 447                         jtop = 0
 448                         jbot = n - 1
 449                 } else {
 450                         jtop = ktop
 451                         jbot = kbot
 452                 }
 453                 bi := blas64.Implementation()
 454                 if !blk22 || incol < ktop || kbot < ndcol || ns <= 2 {
 455                         // Updates not exploiting the 2×2 block structure of U. k0 and nu keep track
 456                         // of the location and size of U in the special cases of introducing bulges
 457                         // and chasing bulges off the bottom. In these special cases and in case the
 458                         // number of shifts is ns = 2, there is no 2×2 block structure to exploit.
 459
 460                         k0 := max(0, ktop-incol-1)
 461                         nu := kdu - max(0, ndcol-kbot) - k0
 462
 463                         // Horizontal multiply.
 464                         for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
 465                                 jlen := min(nh, jbot-jcol+1)
 466                                 bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
 467                                         1, u[k0*ldu+k0:], ldu,
 468                                         h[(incol+k0+1)*ldh+jcol:], ldh,
 469                                         0, wh, ldwh)
 470                                 impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k0+1)*ldh+jcol:], ldh)
 471                         }
 472
 473                         // Vertical multiply.
 474                         for jrow := jtop; jrow <= max(ktop, incol)-1; jrow += nv {
 475                                 jlen := min(nv, max(ktop, incol)-jrow)
 476                                 bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
 477                                         1, h[jrow*ldh+incol+k0+1:], ldh,
 478                                         u[k0*ldu+k0:], ldu,
 479                                         0, wv, ldwv)
 480                                 impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k0+1:], ldh)
 481                         }
 482
 483                         // Z multiply (also vertical).
 484                         if wantz {
 485                                 for jrow := iloz; jrow <= ihiz; jrow += nv {
 486                                         jlen := min(nv, ihiz-jrow+1)
 487                                         bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
 488                                                 1, z[jrow*ldz+incol+k0+1:], ldz,
 489                                                 u[k0*ldu+k0:], ldu,
 490                                                 0, wv, ldwv)
 491                                         impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k0+1:], ldz)
 492                                 }
 493                         }
 494
 495                         continue
 496                 }
 497
 498                 // Updates exploiting U's 2×2 block structure.
 499
 500                 // i2, i4, j2, j4 are the last rows and columns of the blocks.
 501                 i2 := (kdu + 1) / 2
 502                 i4 := kdu
 503                 j2 := i4 - i2
 504                 j4 := kdu
 505
 506                 // kzs and knz deal with the band of zeros along the diagonal of one of the
 507                 // triangular blocks.
 508                 kzs := (j4 - j2) - (ns + 1)
 509                 knz := ns + 1
 510
 511                 // Horizontal multiply.
 512                 for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
 513                         jlen := min(nh, jbot-jcol+1)
 514
 515                         // Copy bottom of H to top+kzs of scratch (the first kzs
 516                         // rows get multiplied by zero).
 517                         impl.Dlacpy(blas.All, knz, jlen, h[(incol+1+j2)*ldh+jcol:], ldh, wh[kzs*ldwh:], ldwh)
 518
 519                         // Multiply by U21^T.
 520                         impl.Dlaset(blas.All, kzs, jlen, 0, 0, wh, ldwh)
 521                         bi.Dtrmm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, knz, jlen,
 522                                 1, u[j2*ldu+kzs:], ldu, wh[kzs*ldwh:], ldwh)
 523
 524                         // Multiply top of H by U11^T.
 525                         bi.Dgemm(blas.Trans, blas.NoTrans, i2, jlen, j2,
 526                                 1, u, ldu, h[(incol+1)*ldh+jcol:], ldh,
 527                                 1, wh, ldwh)
 528
 529                         // Copy top of H to bottom of WH.
 530                         impl.Dlacpy(blas.All, j2, jlen, h[(incol+1)*ldh+jcol:], ldh, wh[i2*ldwh:], ldwh)
 531
 532                         // Multiply by U21^T.
 533                         bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, j2, jlen,
 534                                 1, u[i2:], ldu, wh[i2*ldwh:], ldwh)
 535
 536                         // Multiply by U22.
 537                         bi.Dgemm(blas.Trans, blas.NoTrans, i4-i2, jlen, j4-j2,
 538                                 1, u[j2*ldu+i2:], ldu, h[(incol+1+j2)*ldh+jcol:], ldh,
 539                                 1, wh[i2*ldwh:], ldwh)
 540
 541                         // Copy it back.
 542                         impl.Dlacpy(blas.All, kdu, jlen, wh, ldwh, h[(incol+1)*ldh+jcol:], ldh)
 543                 }
 544
 545                 // Vertical multiply.
 546                 for jrow := jtop; jrow <= max(incol, ktop)-1; jrow += nv {
 547                         jlen := min(nv, max(incol, ktop)-jrow)
 548
 549                         // Copy right of H to scratch (the first kzs columns get multiplied
 550                         // by zero).
 551                         impl.Dlacpy(blas.All, jlen, knz, h[jrow*ldh+incol+1+j2:], ldh, wv[kzs:], ldwv)
 552
 553                         // Multiply by U21.
 554                         impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
 555                         bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
 556                                 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
 557
 558                         // Multiply by U11.
 559                         bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
 560                                 1, h[jrow*ldh+incol+1:], ldh, u, ldu,
 561                                 1, wv, ldwv)
 562
 563                         // Copy left of H to right of scratch.
 564                         impl.Dlacpy(blas.All, jlen, j2, h[jrow*ldh+incol+1:], ldh, wv[i2:], ldwv)
 565
 566                         // Multiply by U21.
 567                         bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
 568                                 1, u[i2:], ldu, wv[i2:], ldwv)
 569
 570                         // Multiply by U22.
 571                         bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
 572                                 1, h[jrow*ldh+incol+1+j2:], ldh, u[j2*ldu+i2:], ldu,
 573                                 1, wv[i2:], ldwv)
 574
 575                         // Copy it back.
 576                         impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, h[jrow*ldh+incol+1:], ldh)
 577                 }
 578
 579                 if !wantz {
 580                         continue
 581                 }
 582                 // Multiply Z (also vertical).
 583                 for jrow := iloz; jrow <= ihiz; jrow += nv {
 584                         jlen := min(nv, ihiz-jrow+1)
 585
 586                         // Copy right of Z to left of scratch (first kzs columns get
 587                         // multiplied by zero).
 588                         impl.Dlacpy(blas.All, jlen, knz, z[jrow*ldz+incol+1+j2:], ldz, wv[kzs:], ldwv)
 589
 590                         // Multiply by U12.
 591                         impl.Dlaset(blas.All, jlen, kzs, 0, 0, wv, ldwv)
 592                         bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, jlen, knz,
 593                                 1, u[j2*ldu+kzs:], ldu, wv[kzs:], ldwv)
 594
 595                         // Multiply by U11.
 596                         bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i2, j2,
 597                                 1, z[jrow*ldz+incol+1:], ldz, u, ldu,
 598                                 1, wv, ldwv)
 599
 600                         // Copy left of Z to right of scratch.
 601                         impl.Dlacpy(blas.All, jlen, j2, z[jrow*ldz+incol+1:], ldz, wv[i2:], ldwv)
 602
 603                         // Multiply by U21.
 604                         bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.NonUnit, jlen, i4-i2,
 605                                 1, u[i2:], ldu, wv[i2:], ldwv)
 606
 607                         // Multiply by U22.
 608                         bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, i4-i2, j4-j2,
 609                                 1, z[jrow*ldz+incol+1+j2:], ldz, u[j2*ldu+i2:], ldu,
 610                                 1, wv[i2:], ldwv)
 611
 612                         // Copy the result back to Z.
 613                         impl.Dlacpy(blas.All, jlen, kdu, wv, ldwv, z[jrow*ldz+incol+1:], ldz)
 614                 }
 615         }
 616 }