10 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11 #define EIGEN_GENERAL_MATRIX_VECTOR_H
30 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
31 struct general_matrix_vector_product<Index,LhsScalar,
ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
33 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
36 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
37 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
38 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
39 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
40 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
43 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
44 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
45 typedef typename packet_traits<ResScalar>::type _ResPacket;
47 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
48 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
49 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
51 EIGEN_DONT_INLINE
static void run(
52 Index rows, Index cols,
53 const LhsScalar* lhs, Index lhsStride,
54 const RhsScalar* rhs, Index rhsIncr,
56 #ifdef EIGEN_INTERNAL_DEBUGGING
62 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
63 EIGEN_DONT_INLINE
void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
64 Index rows, Index cols,
65 const LhsScalar* lhs, Index lhsStride,
66 const RhsScalar* rhs, Index rhsIncr,
68 #ifdef EIGEN_INTERNAL_DEBUGGING
73 eigen_internal_assert(resIncr==1);
74 #ifdef _EIGEN_ACCUMULATE_PACKETS
75 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
77 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
79 padd(pload<ResPacket>(&res[j]), \
81 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
82 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
83 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
84 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
86 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
87 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
89 alpha = numext::conj(alpha);
91 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
92 const Index columnsAtOnce = 4;
93 const Index peels = 2;
94 const Index LhsPacketAlignedMask = LhsPacketSize-1;
95 const Index ResPacketAlignedMask = ResPacketSize-1;
97 const Index size = rows;
101 Index alignedStart = internal::first_aligned(res,size);
102 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
103 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
105 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
106 Index alignmentPattern = alignmentStep==0 ? AllAligned
107 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
111 const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
114 Index skipColumns = 0;
116 if( (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(res)%
sizeof(ResScalar)) )
121 else if (LhsPacketSize>1)
123 eigen_internal_assert(
size_t(lhs+lhsAlignmentOffset)%
sizeof(LhsPacket)==0 || size<LhsPacketSize);
125 while (skipColumns<LhsPacketSize &&
126 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
128 if (skipColumns==LhsPacketSize)
131 alignmentPattern = NoneAligned;
136 skipColumns = (std::min)(skipColumns,cols);
140 eigen_internal_assert( (alignmentPattern==NoneAligned)
141 || (skipColumns + columnsAtOnce >= cols)
142 || LhsPacketSize > size
143 || (
size_t(lhs+alignedStart+lhsStride*skipColumns)%
sizeof(LhsPacket))==0);
145 else if(Vectorizable)
149 alignmentPattern = AllAligned;
152 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
153 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
155 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
156 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
158 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
159 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
160 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
161 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
164 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
165 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
171 for (Index j=0; j<alignedStart; ++j)
173 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
174 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
175 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
176 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
179 if (alignedSize>alignedStart)
181 switch(alignmentPattern)
184 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
185 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
188 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
189 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
193 Index j = alignedStart;
196 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
199 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
200 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
201 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
203 for (; j<peeledSize; j+=peels*ResPacketSize)
205 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
206 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
207 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
209 A00 = pload<LhsPacket>(&lhs0[j]);
210 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
211 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
212 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
214 T0 = pcj.pmadd(A01, ptmp1, T0);
215 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
216 T0 = pcj.pmadd(A02, ptmp2, T0);
217 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
218 T0 = pcj.pmadd(A03, ptmp3, T0);
220 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
221 T1 = pcj.pmadd(A11, ptmp1, T1);
222 T1 = pcj.pmadd(A12, ptmp2, T1);
223 T1 = pcj.pmadd(A13, ptmp3, T1);
224 pstore(&res[j+ResPacketSize],T1);
227 for (; j<alignedSize; j+=ResPacketSize)
228 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
232 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
233 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
240 for (Index j=alignedSize; j<size; ++j)
242 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
243 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
244 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
245 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
251 Index start = columnBound;
254 for (Index k=start; k<end; ++k)
256 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
257 const LhsScalar* lhs0 = lhs + k*lhsStride;
263 for (Index j=0; j<alignedStart; ++j)
264 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
266 if ((
size_t(lhs0+alignedStart)%
sizeof(LhsPacket))==0)
267 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
268 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
270 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
271 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
275 for (Index i=alignedSize; i<size; ++i)
276 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
286 }
while(Vectorizable);
287 #undef _EIGEN_ACCUMULATE_PACKETS
300 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
301 struct general_matrix_vector_product<Index,LhsScalar,
RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
303 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
306 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
307 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
308 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
309 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
310 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
313 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
314 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
315 typedef typename packet_traits<ResScalar>::type _ResPacket;
317 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
318 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
319 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
321 EIGEN_DONT_INLINE
static void run(
322 Index rows, Index cols,
323 const LhsScalar* lhs, Index lhsStride,
324 const RhsScalar* rhs, Index rhsIncr,
325 ResScalar* res, Index resIncr,
329 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
330 EIGEN_DONT_INLINE
void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
331 Index rows, Index cols,
332 const LhsScalar* lhs, Index lhsStride,
333 const RhsScalar* rhs, Index rhsIncr,
334 ResScalar* res, Index resIncr,
337 EIGEN_UNUSED_VARIABLE(rhsIncr);
338 eigen_internal_assert(rhsIncr==1);
339 #ifdef _EIGEN_ACCUMULATE_PACKETS
340 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
343 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
344 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
345 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
346 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
347 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
348 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
350 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
351 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
353 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
354 const Index rowsAtOnce = 4;
355 const Index peels = 2;
356 const Index RhsPacketAlignedMask = RhsPacketSize-1;
357 const Index LhsPacketAlignedMask = LhsPacketSize-1;
359 const Index depth = cols;
364 Index alignedStart = internal::first_aligned(rhs, depth);
365 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
366 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
368 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
369 Index alignmentPattern = alignmentStep==0 ? AllAligned
370 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
374 const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
379 if( (
sizeof(LhsScalar)!=
sizeof(RhsScalar)) || (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(rhs)%
sizeof(RhsScalar)) )
384 else if (LhsPacketSize>1)
386 eigen_internal_assert(
size_t(lhs+lhsAlignmentOffset)%
sizeof(LhsPacket)==0 || depth<LhsPacketSize);
388 while (skipRows<LhsPacketSize &&
389 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
391 if (skipRows==LhsPacketSize)
394 alignmentPattern = NoneAligned;
399 skipRows = (std::min)(skipRows,Index(rows));
402 eigen_internal_assert( alignmentPattern==NoneAligned
404 || (skipRows + rowsAtOnce >= rows)
405 || LhsPacketSize > depth
406 || (
size_t(lhs+alignedStart+lhsStride*skipRows)%
sizeof(LhsPacket))==0);
408 else if(Vectorizable)
412 alignmentPattern = AllAligned;
415 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
416 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
418 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
419 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
421 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
422 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
425 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
426 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
431 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
432 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
436 for (Index j=0; j<alignedStart; ++j)
438 RhsScalar b = rhs[j];
439 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
440 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
443 if (alignedSize>alignedStart)
445 switch(alignmentPattern)
448 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
449 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
452 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
453 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
457 Index j = alignedStart;
466 LhsPacket A01, A02, A03, A11, A12, A13;
467 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
468 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
469 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
471 for (; j<peeledSize; j+=peels*RhsPacketSize)
473 RhsPacket b = pload<RhsPacket>(&rhs[j]);
474 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
475 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
476 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
478 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
479 ptmp1 = pcj.pmadd(A01, b, ptmp1);
480 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
481 ptmp2 = pcj.pmadd(A02, b, ptmp2);
482 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
483 ptmp3 = pcj.pmadd(A03, b, ptmp3);
484 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
486 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
487 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
488 ptmp1 = pcj.pmadd(A11, b, ptmp1);
489 ptmp2 = pcj.pmadd(A12, b, ptmp2);
490 ptmp3 = pcj.pmadd(A13, b, ptmp3);
493 for (; j<alignedSize; j+=RhsPacketSize)
494 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
498 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
499 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
502 tmp0 += predux(ptmp0);
503 tmp1 += predux(ptmp1);
504 tmp2 += predux(ptmp2);
505 tmp3 += predux(ptmp3);
511 for (Index j=alignedSize; j<depth; ++j)
513 RhsScalar b = rhs[j];
514 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
515 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
517 res[i*resIncr] += alpha*tmp0;
518 res[(i+offset1)*resIncr] += alpha*tmp1;
519 res[(i+2)*resIncr] += alpha*tmp2;
520 res[(i+offset3)*resIncr] += alpha*tmp3;
525 Index start = rowBound;
528 for (Index i=start; i<end; ++i)
530 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
531 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
532 const LhsScalar* lhs0 = lhs + i*lhsStride;
535 for (Index j=0; j<alignedStart; ++j)
536 tmp0 += cj.pmul(lhs0[j], rhs[j]);
538 if (alignedSize>alignedStart)
541 if ((
size_t(lhs0+alignedStart)%
sizeof(LhsPacket))==0)
542 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
543 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
545 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
546 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
547 tmp0 += predux(ptmp0);
552 for (Index j=alignedSize; j<depth; ++j)
553 tmp0 += cj.pmul(lhs0[j], rhs[j]);
554 res[i*resIncr] += alpha*tmp0;
564 }
while(Vectorizable);
566 #undef _EIGEN_ACCUMULATE_PACKETS
573 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H
Definition: Constants.h:266
Definition: Constants.h:264