libflame
revision_anchor
|
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

/*
   MAC_Apply_G_mx3b_{ops,opc,opd,opz}

   Each macro walks m_A elements of three strided vectors a1, a2, a3 and,
   for every element index, applies two plane transforms back to back:

     1. (a2, a3) <- ( ga23*a2 + si23*a3,  ga23*a3 - si23*a2 )
     2. (a1, a2) <- ( ga12*a1 + si12*a2,  ga12*a2 - si12*a1 )

   Step 2 consumes the a2 value produced by step 1, so the statement order
   inside the loop is significant.
   NOTE(review): the "G" in the name suggests these are Givens rotations
   (gamma = cosine, sigma = sine) -- confirm against the callers.

   Arguments (identical for all four variants):
     m_A                number of elements to process in each vector
     gamma12, sigma12   pointers to the real scalars used in step 2
     gamma23, sigma23   pointers to the real scalars used in step 1
     a1, a2, a3         pointers to the first element of each vector; they
                        are bound to restrict-qualified locals, so the three
                        vectors must not overlap
     inc_a1..inc_a3     element stride added to each pointer per iteration

   Variants:
     ops  float  scalars, float    vectors
     opc  float  scalars, scomplex vectors (real and imag parts are each
                 scaled by the same real scalars)
     opd  double scalars, double   vectors
     opz  double scalars, dcomplex vectors (same scheme as opc)

   Usage notes:
     - Every argument is parenthesized at its point of use, so expressions
       such as "g + 1" or "cond ? m : n" are safe to pass.
     - m_A and the increments are still evaluated once per loop iteration;
       do not pass expressions with side effects for them.
     - The expansion is a brace-enclosed block that declares locals named
       ga12, si12, ga23, si23, alpha1..alpha3, temp1..temp3 and i; argument
       expressions must not refer to caller variables with those names.
*/

#define MAC_Apply_G_mx3b_ops( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	float           ga12   = *(gamma12); \
	float           si12   = *(sigma12); \
	float           ga23   = *(gamma23); \
	float           si23   = *(sigma23); \
	float* restrict alpha1 = (a1); \
	float* restrict alpha2 = (a2); \
	float* restrict alpha3 = (a3); \
	float           temp1; \
	float           temp2; \
	float           temp3; \
	int             i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
	} \
}

#define MAC_Apply_G_mx3b_opc( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	float              ga12   = *(gamma12); \
	float              si12   = *(sigma12); \
	float              ga23   = *(gamma23); \
	float              si23   = *(sigma23); \
	scomplex* restrict alpha1 = (a1); \
	scomplex* restrict alpha2 = (a2); \
	scomplex* restrict alpha3 = (a3); \
	scomplex           temp1; \
	scomplex           temp2; \
	scomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
	} \
}

#define MAC_Apply_G_mx3b_opd( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	double           ga12   = *(gamma12); \
	double           si12   = *(sigma12); \
	double           ga23   = *(gamma23); \
	double           si23   = *(sigma23); \
	double* restrict alpha1 = (a1); \
	double* restrict alpha2 = (a2); \
	double* restrict alpha3 = (a3); \
	double           temp1; \
	double           temp2; \
	double           temp3; \
	int              i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
	} \
}

#define MAC_Apply_G_mx3b_opz( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	double             ga12   = *(gamma12); \
	double             si12   = *(sigma12); \
	double             ga23   = *(gamma23); \
	double             si23   = *(sigma23); \
	dcomplex* restrict alpha1 = (a1); \
	dcomplex* restrict alpha2 = (a2); \
	dcomplex* restrict alpha3 = (a3); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	dcomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
	} \
}