1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
|
/* generated code, do not edit. */
#include "ode/matrix.h"
/* Solve the transposed unit-lower-triangular system L^T * x = b for a
 * single right-hand side, in place.
 *
 *   L       n*n lower triangular matrix stored row by row. Only entries
 *           strictly below the diagonal are read; the diagonal is taken
 *           to be all ones.
 *   B       right-hand side of n elements; overwritten with x.
 *   n       dimension of the system.
 *   lskip1  leading dimension of L, i.e. the distance in elements
 *           between the starts of two consecutive rows.
 *
 * The unknowns are produced from the last one towards the first: four
 * per pass while at least four remain, then one per pass for the rest.
 */
void dSolveL1T (const dReal *L, dReal *B, int n, int lskip1)
{
  /* Because the transpose is being solved, everything is walked
   * backwards: start from the last diagonal entry of L and the last
   * element of B, and move one row up (a negative stride) per step. */
  const dReal *const lastDiag = L + (n-1)*(lskip1+1);
  dReal *const lastRhs = B + (n-1);
  const int rowUp = -lskip1;
  int solved;   /* how many trailing unknowns are already final */

  /* ---- passes that produce four unknowns at once ---- */
  for (solved = 0; solved <= n-4; solved += 4) {
    dReal acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    const dReal *col = lastDiag - solved;   /* bottom row, first column of this block */
    dReal *x = lastRhs;                     /* last (already solved) unknown */
    int k;

    /* Accumulate, for each of the four columns, the dot product of the
     * column entries below the block with the unknowns solved so far.
     * Each product is formed in its own statement before it is added,
     * so the multiply and the add are never a single expression. */
    for (k = 0; k < solved; k++) {
      const dReal xk = x[0];
      const dReal t0 = col[0] * xk;
      const dReal t1 = col[-1] * xk;
      const dReal t2 = col[-2] * xk;
      const dReal t3 = col[-3] * xk;
      acc0 += t0;
      acc1 += t1;
      acc2 += t2;
      acc3 += t3;
      col += rowUp;
      x -= 1;
    }

    /* Here col sits on the diagonal entry belonging to the first unknown
     * of this block and x on its right-hand side. Resolve the four
     * unknowns in turn; each one also needs the contributions of the
     * unknowns of the same block that were just computed. */
    acc0 = x[0] - acc0;
    x[0] = acc0;

    acc1 = x[-1] - acc1 - col[-1]*acc0;
    x[-1] = acc1;

    acc2 = x[-2] - acc2 - col[-2]*acc0 - col[-2+rowUp]*acc1;
    x[-2] = acc2;

    acc3 = x[-3] - acc3 - col[-3]*acc0 - col[-3+rowUp]*acc1
                        - col[-3+2*rowUp]*acc2;
    x[-3] = acc3;
  }

  /* ---- remaining unknowns (fewer than four), one per pass ---- */
  for (; solved < n; solved++) {
    dReal acc = 0;
    const dReal *col = lastDiag - solved;
    dReal *x = lastRhs;
    int k;

    /* dot product of this column's sub-diagonal part with the unknowns
     * that are already final */
    for (k = 0; k < solved; k++) {
      const dReal t = col[0] * x[0];
      acc += t;
      col += rowUp;
      x -= 1;
    }

    /* unit diagonal: no division needed */
    acc = x[0] - acc;
    x[0] = acc;
  }
}
|