1 // Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
3 // Copyright ©2014 The Gonum Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
10 "gonum.org/v1/gonum/blas"
11 "gonum.org/v1/gonum/internal/asm/f32"
// Compile-time assertion that Implementation satisfies blas.Float32Level3.
14 var _ blas.Float32Level3 = Implementation{}
17 // A * X = alpha * B, if tA == blas.NoTrans and side == blas.Left,
18 // A^T * X = alpha * B, if tA == blas.Trans or blas.ConjTrans, and side == blas.Left,
19 // X * A = alpha * B, if tA == blas.NoTrans and side == blas.Right,
20 // X * A^T = alpha * B, if tA == blas.Trans or blas.ConjTrans, and side == blas.Right,
21 // where A is an n×n or m×m triangular matrix, X is an m×n matrix, and alpha is a
24 // At entry to the function, X contains the values of B, and the result is
25 // stored in place into X.
27 // No check is made that A is invertible.
29 // Float32 implementations are autogenerated and not directly tested.
30 func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
// Argument validation: each enum argument must be one of its defined values.
31 if s != blas.Left && s != blas.Right {
34 if ul != blas.Lower && ul != blas.Upper {
37 if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
40 if d != blas.NonUnit && d != blas.Unit {
// a must hold a k×k matrix stored with row stride lda.
// NOTE(review): k is assigned before this check; presumably m for
// blas.Left and n for blas.Right — confirm.
58 if lda*(k-1)+k > len(a) || lda < max(1, k) {
// b must hold an m×n matrix stored with row stride ldb. Every row slice of
// b below is therefore taken with stride ldb.
61 if ldb*(m-1)+n > len(b) || ldb < max(1, n) {
70 for i := 0; i < m; i++ {
71 btmp := b[i*ldb : i*ldb+n]
// nonUnit selects whether the diagonal of a is read (divisions below) or
// treated as implicit ones.
78 nonUnit := d == blas.NonUnit
80 if tA == blas.NoTrans {
// Rows are processed from last to first. Row i of b is updated with the
// rows k > i of b, weighted by the entries to the right of the diagonal in
// row i of a.
82 for i := m - 1; i >= 0; i-- {
83 btmp := b[i*ldb : i*ldb+n]
89 for ka, va := range a[i*lda+i+1 : i*lda+m] {
92 f32.AxpyUnitaryTo(btmp, -va, b[k*ldb:k*ldb+n], btmp)
97 for j := 0; j < n; j++ {
// Rows are processed from first to last, using the entries to the left of
// the diagonal in row i of a and the rows k < i of b.
104 for i := 0; i < m; i++ {
105 btmp := b[i*ldb : i*ldb+n]
107 for j := 0; j < n; j++ {
111 for k, va := range a[i*lda : i*lda+i] {
113 f32.AxpyUnitaryTo(btmp, -va, b[k*ldb:k*ldb+n], btmp)
// Reciprocal of the diagonal entry, computed once per row.
117 tmp := 1 / a[i*lda+i]
118 for j := 0; j < n; j++ {
125 // Cases where a is transposed
126 if ul == blas.Upper {
// Row k of b is used to update the later rows i of b, weighted by the
// entries of row k of a to the right of the diagonal.
127 for k := 0; k < m; k++ {
128 btmpk := b[k*ldb : k*ldb+n]
130 tmp := 1 / a[k*lda+k]
131 for j := 0; j < n; j++ {
135 for ia, va := range a[k*lda+k+1 : k*lda+m] {
138 btmp := b[i*ldb : i*ldb+n]
139 f32.AxpyUnitaryTo(btmp, -va, btmpk, btmp)
143 for j := 0; j < n; j++ {
// Mirror of the loop above: k runs downward and row k of b updates the
// earlier rows i < k.
150 for k := m - 1; k >= 0; k-- {
151 btmpk := b[k*ldb : k*ldb+n]
153 tmp := 1 / a[k*lda+k]
154 for j := 0; j < n; j++ {
158 for i, va := range a[k*lda : k*lda+k] {
160 btmp := b[i*ldb : i*ldb+n]
161 f32.AxpyUnitaryTo(btmp, -va, btmpk, btmp)
165 for j := 0; j < n; j++ {
172 // Cases where a is to the right of X.
173 if tA == blas.NoTrans {
174 if ul == blas.Upper {
// Each row of b is handled independently; a is indexed by column position
// k within the row.
175 for i := 0; i < m; i++ {
176 btmp := b[i*ldb : i*ldb+n]
178 for j := 0; j < n; j++ {
182 for k, vb := range btmp {
186 btmp[k] /= a[k*lda+k]
188 btmpk := btmp[k+1 : n]
189 f32.AxpyUnitaryTo(btmpk, -btmp[k], a[k*lda+k+1:k*lda+n], btmpk)
196 for i := 0; i < m; i++ {
// Row i of b starts at i*ldb. Using lda here addressed the wrong elements
// (or ran out of range) whenever lda != ldb.
197 btmp := b[i*ldb : i*ldb+n]
199 for j := 0; j < n; j++ {
203 for k := n - 1; k >= 0; k-- {
206 btmp[k] /= a[k*lda+k]
208 f32.AxpyUnitaryTo(btmp, -btmp[k], a[k*lda:k*lda+k], btmp)
214 // Cases where a is transposed.
215 if ul == blas.Upper {
216 for i := 0; i < m; i++ {
// Row i of b starts at i*ldb (stride of b, not of a).
217 btmp := b[i*ldb : i*ldb+n]
// j runs downward; entry j is scaled by alpha and reduced by the dot
// product of a's row j right of the diagonal with the entries after j.
218 for j := n - 1; j >= 0; j-- {
219 tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
228 for i := 0; i < m; i++ {
// Row i of b starts at i*ldb (stride of b, not of a).
229 btmp := b[i*ldb : i*ldb+n]
// j runs upward; the dot product uses a's row j left of the diagonal
// against the leading entries of the row.
230 for j := 0; j < n; j++ {
231 tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp)
240 // Ssymm performs one of
241 // C = alpha * A * B + beta * C, if side == blas.Left,
242 // C = alpha * B * A + beta * C, if side == blas.Right,
243 // where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
246 // Float32 implementations are autogenerated and not directly tested.
247 func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
// Argument validation; an invalid side panics with "goblas: bad side".
248 if s != blas.Right && s != blas.Left {
249 panic("goblas: bad side")
251 if ul != blas.Lower && ul != blas.Upper {
// a must hold a k×k matrix with row stride lda.
// NOTE(review): k is assigned before this check; presumably m for
// blas.Left and n for blas.Right — confirm.
266 if lda*(k-1)+k > len(a) || lda < max(1, k) {
// b and c must each hold an m×n matrix with row strides ldb and ldc.
269 if ldb*(m-1)+n > len(b) || ldb < max(1, n) {
272 if ldc*(m-1)+n > len(c) || ldc < max(1, n) {
// Degenerate inputs: an empty result, or alpha == 0 with beta == 1.
275 if m == 0 || n == 0 {
278 if alpha == 0 && beta == 1 {
// Two row-wise passes over c.
// NOTE(review): presumably these zero c (beta == 0) and scale c by beta;
// the loop bodies are not shown here — confirm.
283 for i := 0; i < m; i++ {
284 ctmp := c[i*ldc : i*ldc+n]
285 for j := range ctmp {
291 for i := 0; i < m; i++ {
292 ctmp := c[i*ldc : i*ldc+n]
293 for j := 0; j < n; j++ {
300 isUpper := ul == blas.Upper
// Row i of c receives the diagonal term alpha*a[i][i] times row i of b,
// then contributions from rows k < i and k > i of b.
302 for i := 0; i < m; i++ {
303 atmp := alpha * a[i*lda+i]
304 btmp := b[i*ldb : i*ldb+n]
305 ctmp := c[i*ldc : i*ldc+n]
306 for j, v := range btmp {
311 for k := 0; k < i; k++ {
319 ctmp := c[i*ldc : i*ldc+n]
320 f32.AxpyUnitaryTo(ctmp, atmp, b[k*ldb:k*ldb+n], ctmp)
322 for k := i + 1; k < m; k++ {
330 ctmp := c[i*ldc : i*ldc+n]
331 f32.AxpyUnitaryTo(ctmp, atmp, b[k*ldb:k*ldb+n], ctmp)
// For each entry (i, j), walk the part of row j of a to the right of the
// diagonal together with the matching tails of row i of b and c.
337 for i := 0; i < m; i++ {
338 for j := n - 1; j >= 0; j-- {
339 tmp := alpha * b[i*ldb+j]
341 atmp := a[j*lda+j+1 : j*lda+n]
342 btmp := b[i*ldb+j+1 : i*ldb+n]
343 ctmp := c[i*ldc+j+1 : i*ldc+n]
344 for k, v := range atmp {
// Diagonal contribution plus the accumulated off-diagonal sum tmp2.
349 c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
// Same scheme using the part of row j of a to the left of the diagonal
// and the leading j entries of row i of b and c.
354 for i := 0; i < m; i++ {
355 for j := 0; j < n; j++ {
356 tmp := alpha * b[i*ldb+j]
358 atmp := a[j*lda : j*lda+j]
359 btmp := b[i*ldb : i*ldb+j]
360 ctmp := c[i*ldc : i*ldc+j]
361 for k, v := range atmp {
366 c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
371 // Ssyrk performs the symmetric rank-k operation
372 // C = alpha * A * A^T + beta * C, if tA == blas.NoTrans, or C = alpha * A^T * A + beta * C otherwise,
373 // where C is an n×n symmetric matrix, A is an n×k matrix if tA == blas.NoTrans, and
374 // a k×n matrix otherwise. alpha and beta are scalars.
376 // Float32 implementations are autogenerated and not directly tested.
377 func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
// Argument validation: ul and tA must each be one of their defined values.
378 if ul != blas.Lower && ul != blas.Upper {
381 if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
// NOTE(review): row and col are assigned around this branch; presumably
// (n, k) for blas.NoTrans and (k, n) otherwise — confirm.
394 if tA == blas.NoTrans {
// a must hold a row×col matrix with stride lda; c an n×n matrix with
// stride ldc.
399 if lda*(row-1)+col > len(a) || lda < max(1, col) {
402 if ldc*(n-1)+n > len(c) || ldc < max(1, n) {
// Passes over one triangle of c only: row i from the diagonal to the end
// for blas.Upper, from the start through the diagonal otherwise.
// NOTE(review): presumably the alpha == 0 handling (zero or scale c by
// beta); the guarding conditions are not shown here — confirm.
407 if ul == blas.Upper {
408 for i := 0; i < n; i++ {
409 ctmp := c[i*ldc+i : i*ldc+n]
410 for j := range ctmp {
416 for i := 0; i < n; i++ {
417 ctmp := c[i*ldc : i*ldc+i+1]
418 for j := range ctmp {
424 if ul == blas.Upper {
425 for i := 0; i < n; i++ {
426 ctmp := c[i*ldc+i : i*ldc+n]
427 for j := range ctmp {
433 for i := 0; i < n; i++ {
434 ctmp := c[i*ldc : i*ldc+i+1]
435 for j := range ctmp {
441 if tA == blas.NoTrans {
442 if ul == blas.Upper {
// Each stored entry of c becomes beta times its old value plus alpha times
// the dot product of two length-k rows of a.
443 for i := 0; i < n; i++ {
444 ctmp := c[i*ldc+i : i*ldc+n]
445 atmp := a[i*lda : i*lda+k]
446 for jc, vc := range ctmp {
448 ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
453 for i := 0; i < n; i++ {
454 atmp := a[i*lda : i*lda+k]
455 for j, vc := range c[i*ldc : i*ldc+i+1] {
456 c[i*ldc+j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
461 // Cases where a is transposed.
462 if ul == blas.Upper {
// For each of the k rows l of a, add alpha*a[l][i] times the matching
// segment of row l of a into the stored part of row i of c.
463 for i := 0; i < n; i++ {
464 ctmp := c[i*ldc+i : i*ldc+n]
466 for j := range ctmp {
470 for l := 0; l < k; l++ {
471 tmp := alpha * a[l*lda+i]
473 f32.AxpyUnitaryTo(ctmp, tmp, a[l*lda+i:l*lda+n], ctmp)
479 for i := 0; i < n; i++ {
480 ctmp := c[i*ldc : i*ldc+i+1]
482 for j := range ctmp {
486 for l := 0; l < k; l++ {
487 tmp := alpha * a[l*lda+i]
489 f32.AxpyUnitaryTo(ctmp, tmp, a[l*lda:l*lda+i+1], ctmp)
495 // Ssyr2k performs the symmetric rank 2k operation
496 // C = alpha * A * B^T + alpha * B * A^T + beta * C, if tA == blas.NoTrans, or C = alpha * A^T * B + alpha * B^T * A + beta * C otherwise,
497 // where C is an n×n symmetric matrix. A and B are n×k matrices if
498 // tA == blas.NoTrans and k×n otherwise. alpha and beta are scalars.
500 // Float32 implementations are autogenerated and not directly tested.
501 func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
// Argument validation: ul and tA must each be one of their defined values.
502 if ul != blas.Lower && ul != blas.Upper {
505 if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
// NOTE(review): row and col are assigned around this branch; presumably
// (n, k) for blas.NoTrans and (k, n) otherwise — confirm.
518 if tA == blas.NoTrans {
// a and b must each hold a row×col matrix, with strides lda and ldb
// respectively; c an n×n matrix with stride ldc.
523 if lda*(row-1)+col > len(a) || lda < max(1, col) {
526 if ldb*(row-1)+col > len(b) || ldb < max(1, col) {
529 if ldc*(n-1)+n > len(c) || ldc < max(1, n) {
// Passes over one triangle of c only: row i from the diagonal to the end
// for blas.Upper, from the start through the diagonal otherwise.
// NOTE(review): presumably the alpha == 0 handling (zero or scale c by
// beta); the guarding conditions are not shown here — confirm.
534 if ul == blas.Upper {
535 for i := 0; i < n; i++ {
536 ctmp := c[i*ldc+i : i*ldc+n]
537 for j := range ctmp {
543 for i := 0; i < n; i++ {
544 ctmp := c[i*ldc : i*ldc+i+1]
545 for j := range ctmp {
551 if ul == blas.Upper {
552 for i := 0; i < n; i++ {
553 ctmp := c[i*ldc+i : i*ldc+n]
554 for j := range ctmp {
560 for i := 0; i < n; i++ {
561 ctmp := c[i*ldc : i*ldc+i+1]
562 for j := range ctmp {
568 if tA == blas.NoTrans {
569 if ul == blas.Upper {
// Each stored entry of c gains alpha times the sum of two length-k dot
// products pairing rows of a with rows of b.
570 for i := 0; i < n; i++ {
571 atmp := a[i*lda : i*lda+k]
572 btmp := b[i*ldb : i*ldb+k]
573 ctmp := c[i*ldc+i : i*ldc+n]
574 for jc := range ctmp {
576 var tmp1, tmp2 float32
577 binner := b[j*ldb : j*ldb+k]
578 for l, v := range a[j*lda : j*lda+k] {
580 tmp2 += atmp[l] * binner[l]
583 ctmp[jc] += alpha * (tmp1 + tmp2)
588 for i := 0; i < n; i++ {
589 atmp := a[i*lda : i*lda+k]
590 btmp := b[i*ldb : i*ldb+k]
591 ctmp := c[i*ldc : i*ldc+i+1]
592 for j := 0; j <= i; j++ {
593 var tmp1, tmp2 float32
594 binner := b[j*ldb : j*ldb+k]
595 for l, v := range a[j*lda : j*lda+k] {
597 tmp2 += atmp[l] * binner[l]
600 ctmp[j] += alpha * (tmp1 + tmp2)
// Transposed cases: for each row l, the stored part of row i of c gains
// a[l][j]*tmp1 + b[l][j]*tmp2.
605 if ul == blas.Upper {
606 for i := 0; i < n; i++ {
607 ctmp := c[i*ldc+i : i*ldc+n]
609 for j := range ctmp {
613 for l := 0; l < k; l++ {
// Element (l, i) of b lives at l*ldb+i. Indexing with lda read the wrong
// element (or ran out of range) whenever lda != ldb.
614 tmp1 := alpha * b[l*ldb+i]
615 tmp2 := alpha * a[l*lda+i]
616 btmp := b[l*ldb+i : l*ldb+n]
// Skip the inner loop when both scale factors are zero.
617 if tmp1 != 0 || tmp2 != 0 {
618 for j, v := range a[l*lda+i : l*lda+n] {
619 ctmp[j] += v*tmp1 + btmp[j]*tmp2
626 for i := 0; i < n; i++ {
627 ctmp := c[i*ldc : i*ldc+i+1]
629 for j := range ctmp {
633 for l := 0; l < k; l++ {
// Element (l, i) of b lives at l*ldb+i (stride of b, not of a).
634 tmp1 := alpha * b[l*ldb+i]
635 tmp2 := alpha * a[l*lda+i]
636 btmp := b[l*ldb : l*ldb+i+1]
// Skip the inner loop when both scale factors are zero.
637 if tmp1 != 0 || tmp2 != 0 {
638 for j, v := range a[l*lda : l*lda+i+1] {
639 ctmp[j] += v*tmp1 + btmp[j]*tmp2
647 // B = alpha * A * B, if tA == blas.NoTrans and side == blas.Left,
648 // B = alpha * A^T * B, if tA == blas.Trans or blas.ConjTrans, and side == blas.Left,
649 // B = alpha * B * A, if tA == blas.NoTrans and side == blas.Right,
650 // B = alpha * B * A^T, if tA == blas.Trans or blas.ConjTrans, and side == blas.Right,
651 // where A is an n×n or m×m triangular matrix, and B is an m×n matrix.
653 // Float32 implementations are autogenerated and not directly tested.
654 func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
// Argument validation: each enum argument must be one of its defined values.
655 if s != blas.Left && s != blas.Right {
658 if ul != blas.Lower && ul != blas.Upper {
661 if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
664 if d != blas.NonUnit && d != blas.Unit {
// a must hold a k×k matrix with row stride lda.
// NOTE(review): k is assigned before this check; presumably m for
// blas.Left and n for blas.Right — confirm.
679 if lda*(k-1)+k > len(a) || lda < max(1, k) {
// b must hold an m×n matrix with row stride ldb.
682 if ldb*(m-1)+n > len(b) || ldb < max(1, n) {
// Row-wise pass over all of b.
// NOTE(review): presumably the alpha == 0 case, zeroing b — confirm.
686 for i := 0; i < m; i++ {
687 btmp := b[i*ldb : i*ldb+n]
688 for j := range btmp {
// nonUnit selects whether the diagonal of a is read or treated as
// implicit ones.
695 nonUnit := d == blas.NonUnit
697 if tA == blas.NoTrans {
698 if ul == blas.Upper {
// Rows run first to last. Row i of b is combined with the rows k > i of b,
// weighted by the entries right of the diagonal in row i of a.
699 for i := 0; i < m; i++ {
704 btmp := b[i*ldb : i*ldb+n]
705 for j := range btmp {
708 for ka, va := range a[i*lda+i+1 : i*lda+m] {
712 f32.AxpyUnitaryTo(btmp, tmp, b[k*ldb:k*ldb+n], btmp)
// Rows run last to first. Row i of b is combined with the rows k < i of b,
// weighted by the entries left of the diagonal in row i of a.
718 for i := m - 1; i >= 0; i-- {
723 btmp := b[i*ldb : i*ldb+n]
724 for j := range btmp {
727 for k, va := range a[i*lda : i*lda+i] {
730 f32.AxpyUnitaryTo(btmp, tmp, b[k*ldb:k*ldb+n], btmp)
736 // Cases where a is transposed.
737 if ul == blas.Upper {
// Row k of b is added into the later rows i of b, weighted by the entries
// of row k of a right of the diagonal; k runs downward.
738 for k := m - 1; k >= 0; k-- {
739 btmpk := b[k*ldb : k*ldb+n]
740 for ia, va := range a[k*lda+k+1 : k*lda+m] {
742 btmp := b[i*ldb : i*ldb+n]
745 f32.AxpyUnitaryTo(btmp, tmp, btmpk, btmp)
753 for j := 0; j < n; j++ {
// Mirror of the loop above: k runs upward and row k of b is added into the
// earlier rows i < k.
760 for k := 0; k < m; k++ {
761 btmpk := b[k*ldb : k*ldb+n]
762 for i, va := range a[k*lda : k*lda+k] {
763 btmp := b[i*ldb : i*ldb+n]
766 f32.AxpyUnitaryTo(btmp, tmp, btmpk, btmp)
774 for j := 0; j < n; j++ {
781 // Cases where a is on the right
782 if tA == blas.NoTrans {
783 if ul == blas.Upper {
// Each row of b is handled independently; k walks the row from the last
// column to the first.
784 for i := 0; i < m; i++ {
785 btmp := b[i*ldb : i*ldb+n]
786 for k := n - 1; k >= 0; k-- {
787 tmp := alpha * btmp[k]
791 btmp[k] *= a[k*lda+k]
793 for ja, v := range a[k*lda+k+1 : k*lda+n] {
// Same per-row scheme with k walking forward and the part of row k of a
// left of the diagonal.
802 for i := 0; i < m; i++ {
803 btmp := b[i*ldb : i*ldb+n]
804 for k := 0; k < n; k++ {
805 tmp := alpha * btmp[k]
809 btmp[k] *= a[k*lda+k]
811 f32.AxpyUnitaryTo(btmp, tmp, a[k*lda:k*lda+k], btmp)
817 // Cases where a is transposed.
818 if ul == blas.Upper {
// Entry j is overwritten with alpha times a sum that includes the dot
// product of a's row j right of the diagonal with the entries of the row
// after j.
819 for i := 0; i < m; i++ {
820 btmp := b[i*ldb : i*ldb+n]
821 for j, vb := range btmp {
826 tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
827 btmp[j] = alpha * tmp
// j runs downward; the dot product uses a's row j left of the diagonal
// with the entries of the row before j.
832 for i := 0; i < m; i++ {
833 btmp := b[i*ldb : i*ldb+n]
834 for j := n - 1; j >= 0; j-- {
839 tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
840 btmp[j] = alpha * tmp