libflame
revision_anchor
|
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

/*
   MAC_Apply_G_mx3_op[sdcz]

   Apply two successive 2x2 transforms to three strided vectors, one
   element triple per iteration. For each of the m_A iterations:

     step 1 (gamma12, sigma12), acting on the a1/a2 elements:
       a1 <-  gamma12 * a1 + sigma12 * a2
       a2 <- -sigma12 * a1 + gamma12 * a2      (a1 here is the old value)

     step 2 (gamma23, sigma23), acting on the a2/a3 elements:
       a2 <-  gamma23 * a2 + sigma23 * a3      (a2 is the result of step 1)
       a3 <- -sigma23 * a2 + gamma23 * a3      (same a2 as the line above)

   NOTE(review): the "G" in the name suggests these are Givens rotations;
   nothing in the macros enforces gamma^2 + sigma^2 == 1.

   Parameters (identical for all four variants):
     m_A               number of element triples to process; a value <= 0
                       results in no work.
     gamma12, sigma12  pointers to the coefficients of the first transform.
     gamma23, sigma23  pointers to the coefficients of the second transform.
                       All four are dereferenced exactly once, before the
                       loop, into float (ops, opc) or double (opd, opz)
                       locals; the coefficients are real even for the
                       complex variants.
     a1, a2, a3        pointers to the first element of each vector:
                       float (ops), double (opd), scomplex (opc) or
                       dcomplex (opz). The vectors are updated in place.
     inc_a1..inc_a3    element stride of each vector, added to the
                       corresponding pointer after every iteration.

   Usage notes:
     - Every parameter is parenthesized where it is expanded, so arguments
       such as "g + 1" or "flag ? 2 : 3" behave as a function call would.
     - m_A and the inc_* arguments are evaluated once per iteration and
       must therefore be free of side effects.
     - The working pointers are restrict-qualified, so the elements reached
       through a1, a2 and a3 must not overlap.
     - The expansion is a brace-enclosed block that declares the locals
       ga12, si12, ga23, si23, alpha1, alpha2, alpha3, temp1, temp2, temp3
       and i. Arguments must not refer to caller variables with those names,
       because the macro's own locals would shadow them.
     - The expansion is deliberately kept as a plain "{ ... }" block rather
       than "do { ... } while ( 0 )" so that existing call sites keep
       compiling whether or not they add a trailing semicolon.
*/

/* Single-precision real variant. */
#define MAC_Apply_G_mx3_ops( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    float           ga12   = *(gamma12); \
    float           si12   = *(sigma12); \
    float           ga23   = *(gamma23); \
    float           si23   = *(sigma23); \
    float* restrict alpha1 = (a1); \
    float* restrict alpha2 = (a2); \
    float* restrict alpha3 = (a3); \
    float           temp1; \
    float           temp2; \
    float           temp3; \
    int             i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Step 1: combine the a1/a2 elements. */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        /* Step 2: combine the updated a2 element with a3. */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
    } \
}

/* Double-precision real variant. */
#define MAC_Apply_G_mx3_opd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    double           ga12   = *(gamma12); \
    double           si12   = *(sigma12); \
    double           ga23   = *(gamma23); \
    double           si23   = *(sigma23); \
    double* restrict alpha1 = (a1); \
    double* restrict alpha2 = (a2); \
    double* restrict alpha3 = (a3); \
    double           temp1; \
    double           temp2; \
    double           temp3; \
    int              i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Step 1: combine the a1/a2 elements. */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        /* Step 2: combine the updated a2 element with a3. */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
    } \
}

/* Single-precision complex variant: the real coefficients are applied to
   the real and imaginary parts independently. */
#define MAC_Apply_G_mx3_opc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    float              ga12   = *(gamma12); \
    float              si12   = *(sigma12); \
    float              ga23   = *(gamma23); \
    float              si23   = *(sigma23); \
    scomplex* restrict alpha1 = (a1); \
    scomplex* restrict alpha2 = (a2); \
    scomplex* restrict alpha3 = (a3); \
    scomplex           temp1; \
    scomplex           temp2; \
    scomplex           temp3; \
    int                i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Step 1: combine the a1/a2 elements. */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
        alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
        alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
        alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
        /* Step 2: combine the updated a2 element with a3. */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
        alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
        alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
        alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
    } \
}

/* Double-precision complex variant: the real coefficients are applied to
   the real and imaginary parts independently. */
#define MAC_Apply_G_mx3_opz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    double             ga12   = *(gamma12); \
    double             si12   = *(sigma12); \
    double             ga23   = *(gamma23); \
    double             si23   = *(sigma23); \
    dcomplex* restrict alpha1 = (a1); \
    dcomplex* restrict alpha2 = (a2); \
    dcomplex* restrict alpha3 = (a3); \
    dcomplex           temp1; \
    dcomplex           temp2; \
    dcomplex           temp3; \
    int                i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Step 1: combine the a1/a2 elements. */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
        alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
        alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
        alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
        /* Step 2: combine the updated a2 element with a3. */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
        alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
        alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
        alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
    } \
}