WPILibC++ 2023.4.3-108-ge5452e3
GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

enum GEMVPacketSizeType {
  GEMVPacketFull = 0,
  GEMVPacketHalf,
  GEMVPacketQuarter
};

template <int N, typename T1, typename T2, typename T3>
struct gemv_packet_cond { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
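
// Selection sketch (follows directly from the specializations above): the first
// template parameter picks which of the three candidate packet types is exposed.
//
//   gemv_packet_cond<GEMVPacketFull,    T1, T2, T3>::type  ->  T1  (full-width packet)
//   gemv_packet_cond<GEMVPacketHalf,    T1, T2, T3>::type  ->  T2  (half-width packet)
//   gemv_packet_cond<GEMVPacketQuarter, T1, T2, T3>::type  ->  T3  (primary template, quarter-width packet)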

template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
class gemv_traits
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
  typedef typename gemv_packet_cond<packet_size,                                  \
                                    typename packet_traits<name ## Scalar>::type, \
                                    typename packet_traits<name ## Scalar>::half, \
                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
#undef PACKET_DECL_COND_PREFIX
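
  // Expansion sketch (illustrative, derived from the macro above): for the Lhs side,
  // PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize) declares a typedef equivalent to
  //
  //   typedef typename gemv_packet_cond<_PacketSize,
  //       typename packet_traits<LhsScalar>::type,                                 // full packet
  //       typename packet_traits<LhsScalar>::half,                                 // half packet
  //       typename unpacket_traits<typename packet_traits<LhsScalar>::half>::half  // quarter packet
  //     >::type _LhsPacket;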

public:
  enum {
    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
                   unpacket_traits<_RhsPacket>::vectorizable &&
                   int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
  };

  typedef typename conditional<Vectorizable, _LhsPacket, LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable, _RhsPacket, RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable, _ResPacket, ResScalar>::type ResPacket;
};


/* Optimized col-major matrix * vector product:
 * This algorithm processes the matrix per vertical panels,
 * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
 *
 * Mixing type logic: C += alpha * A * B
 * | A   | B   |alpha| comments
 * |real |cplx |cplx | no vectorization
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * The same reasoning applies to the transposed case.
 */
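
// Illustrative usage sketch (not part of Eigen): a product that is expected to be
// dispatched to the ColMajor specialization below. The names A, x, y and alpha are
// hypothetical.
//
//   Eigen::MatrixXf A = Eigen::MatrixXf::Random(256, 64);  // MatrixXf is column-major by default
//   Eigen::VectorXf x = Eigen::VectorXf::Random(64);
//   Eigen::VectorXf y = Eigen::VectorXf::Zero(256);
//   float alpha = 2.0f;
//   y.noalias() += alpha * A * x;  // matrix-vector product handled by this kernel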
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
    Index rows, Index cols,
    const LhsMapper& alhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);

  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC to generate proper code.
  LhsMapper lhs(alhs);

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  const Index lhsStride = lhs.stride();
  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  const Index n8 = rows-8*ResPacketSize+1;
  const Index n4 = rows-4*ResPacketSize+1;
  const Index n3 = rows-3*ResPacketSize+1;
  const Index n2 = rows-2*ResPacketSize+1;
  const Index n1 = rows-1*ResPacketSize+1;
  const Index n_half = rows-1*ResPacketSizeHalf+1;
  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
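  // Worked example (assuming ResPacketSize == 8, e.g. float with AVX, and rows == 100):
  // n8 = 100-64+1 = 37, so the 8-packet loop below runs once and covers rows 0..63;
  // n4 = 100-32+1 = 69, so the 4-packet block then covers rows 64..95; the remaining
  // rows 96..99 fall through to the half/quarter/scalar tails.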

  // TODO: improve the following heuristic:
  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
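  // Worked example (hypothetical sizes): for LhsScalar == float and lhsStride == 4096,
  // 4096*4 = 16384 < 32000, so block_cols == 16 columns per panel; for lhsStride == 10000,
  // 40000 >= 32000 and block_cols drops to 4 (and when cols < 128 the whole width is one panel).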
  ResPacket palpha = pset1<ResPacket>(alpha);
  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);

  for(Index j2=0; j2<cols; j2+=block_cols)
  {
    Index jend = numext::mini(j2+block_cols,cols);
    Index i=0;
    for(; i<n8; i+=ResPacketSize*8)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0)),
                c4 = pset1<ResPacket>(ResScalar(0)),
                c5 = pset1<ResPacket>(ResScalar(0)),
                c6 = pset1<ResPacket>(ResScalar(0)),
                c7 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
    }
    if(i<n4)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));

      i+=ResPacketSize*4;
    }
    if(i<n3)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));

      i+=ResPacketSize*3;
    }
    if(i<n2)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      i+=ResPacketSize*2;
    }
    if(i<n1)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      i+=ResPacketSize;
    }
    if(HasHalf && i<n_half)
    {
      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
      i+=ResPacketSizeHalf;
    }
    if(HasQuarter && i<n_quarter)
    {
      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
      i+=ResPacketSizeQuarter;
    }
    for(;i<rows;++i)
    {
      ResScalar c0(0);
      for(Index j=j2; j<jend; j+=1)
        c0 += cj.pmul(lhs(i,j), rhs(j,0));
      res[i] += alpha*c0;
    }
  }
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 8 rows at once (with 4-, 2- and 1-row fallbacks), which
 * both reduces the number of loads/stores of the result and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
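
// Illustrative usage sketch (not part of Eigen): with a row-major lhs the same kind of
// expression is expected to be dispatched to the RowMajor specialization below. The
// names A, x, y are hypothetical.
//
//   using RowMajorMatrixXf = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
//   RowMajorMatrixXf A = RowMajorMatrixXf::Random(256, 64);
//   Eigen::VectorXf x = Eigen::VectorXf::Random(64);
//   Eigen::VectorXf y = Eigen::VectorXf::Zero(256);
//   y.noalias() += 2.0f * A * x;  // each row of A is reduced against x (a dot product per row)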
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
    Index rows, Index cols,
    const LhsMapper& alhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha)
{
  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC to generate proper code.
  LhsMapper lhs(alhs);

  eigen_internal_assert(rhs.stride()==1);
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
  // processing 8 rows at once might be counterproductive wrt cache.
  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
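  // Worked example (hypothetical sizes): for LhsScalar == float and lhs.stride() == 10000,
  // 10000*4 = 40000 > 32000, so n8 == 0 and the 8-rows-at-once loop below is skipped;
  // for lhs.stride() == 1000, 1000*4 = 4000 <= 32000 and n8 == rows-7, enabling the 8-row blocks.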
  const Index n4 = rows-3;
  const Index n2 = rows-1;

  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  Index i=0;
  for(; i<n8; i+=8)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0)),
              c4 = pset1<ResPacket>(ResScalar(0)),
              c5 = pset1<ResPacket>(ResScalar(0)),
              c6 = pset1<ResPacket>(ResScalar(0)),
              c7 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    ResScalar cc4 = predux(c4);
    ResScalar cc5 = predux(c5);
    ResScalar cc6 = predux(c6);
    ResScalar cc7 = predux(c7);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
      cc4 += cj.pmul(lhs(i+4,j), b0);
      cc5 += cj.pmul(lhs(i+5,j), b0);
      cc6 += cj.pmul(lhs(i+6,j), b0);
      cc7 += cj.pmul(lhs(i+7,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
    res[(i+4)*resIncr] += alpha*cc4;
    res[(i+5)*resIncr] += alpha*cc5;
    res[(i+6)*resIncr] += alpha*cc6;
    res[(i+7)*resIncr] += alpha*cc7;
  }
  for(; i<n4; i+=4)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
  }
  for(; i<n2; i+=2)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
  }
  for(; i<rows; ++i)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
    }
    ResScalar cc0 = predux(c0);
    if (HasHalf) {
      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
      {
        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
      }
      cc0 += predux(c0_h);
    }
    if (HasQuarter) {
      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
      {
        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
      }
      cc0 += predux(c0_q);
    }
    for(; j<cols; ++j)
    {
      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
    }
    res[i*resIncr] += alpha*cc0;
  }
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H