KMIME Library
kmime_charfreq.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00034 #include "kmime_charfreq.h"
00035
00036 using namespace KMime;
00037
00042
00043
00044
00045
00046
00047
00048
00049 CharFreq::CharFreq( const QByteArray &buf )
00050 : mNUL( 0 ),
00051 mCTL( 0 ),
00052 mCR( 0 ), mLF( 0 ),
00053 mCRLF( 0 ),
00054 mPrintable( 0 ),
00055 mEightBit( 0 ),
00056 mTotal( 0 ),
00057 mLineMin( 0xffffffff ),
00058 mLineMax( 0 ),
00059 mTrailingWS( false ),
00060 mLeadingFrom( false )
00061 {
00062 if ( !buf.isEmpty() ) {
00063 count( buf.data(), buf.size() );
00064 }
00065 }
00066
00067 CharFreq::CharFreq( const char *buf, size_t len )
00068 : mNUL( 0 ),
00069 mCTL( 0 ),
00070 mCR( 0 ), mLF( 0 ),
00071 mCRLF( 0 ),
00072 mPrintable( 0 ),
00073 mEightBit( 0 ),
00074 mTotal( 0 ),
00075 mLineMin( 0xffffffff ),
00076 mLineMax( 0 ),
00077 mTrailingWS( false ),
00078 mLeadingFrom( false )
00079 {
00080 if ( buf && len > 0 ) {
00081 count( buf, len );
00082 }
00083 }
00084
00085
/**
 * Tells whether @p ch is linear whitespace, i.e. a horizontal tab or a
 * space character.
 */
static inline bool isWS( char ch )
{
  switch ( ch ) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
00090
00091
00092 void CharFreq::count( const char *it, size_t len )
00093 {
00094 const char *end = it + len;
00095 uint currentLineLength = 0;
00096
00097
00098 char prevChar = '\n';
00099 char prevPrevChar = 0;
00100
00101 for ( ; it != end ; ++it ) {
00102 ++currentLineLength;
00103 switch ( *it ) {
00104 case '\0': ++mNUL; break;
00105 case '\r': ++mCR; break;
00106 case '\n': ++mLF;
00107 if ( prevChar == '\r' ) {
00108 --currentLineLength; ++mCRLF;
00109 }
00110 if ( currentLineLength >= mLineMax ) {
00111 mLineMax = currentLineLength-1;
00112 }
00113 if ( currentLineLength <= mLineMin ) {
00114 mLineMin = currentLineLength-1;
00115 }
00116 if ( !mTrailingWS ) {
00117 if ( isWS( prevChar ) ||
00118 ( prevChar == '\r' && isWS( prevPrevChar ) ) ) {
00119 mTrailingWS = true;
00120 }
00121 }
00122 currentLineLength = 0;
00123 break;
00124 case 'F':
00125 if ( !mLeadingFrom ) {
00126 if ( prevChar == '\n' && end - it >= 5 &&
00127 !qstrncmp( "From ", it, 5 ) ) {
00128 mLeadingFrom = true;
00129 }
00130 }
00131 ++mPrintable;
00132 break;
00133 default:
00134 {
00135 uchar c = *it;
00136 if ( c == '\t' || ( c >= ' ' && c <= '~' ) ) {
00137 ++mPrintable;
00138 } else if ( c == 127 || c < ' ' ) {
00139 ++mCTL;
00140 } else {
00141 ++mEightBit;
00142 }
00143 }
00144 }
00145 prevPrevChar = prevChar;
00146 prevChar = *it;
00147 }
00148
00149
00150 if ( currentLineLength >= mLineMax ) {
00151 mLineMax = currentLineLength;
00152 }
00153 if ( currentLineLength <= mLineMin ) {
00154 mLineMin = currentLineLength;
00155 }
00156
00157
00158 if ( isWS( prevChar ) ) {
00159 mTrailingWS = true;
00160 }
00161
00162 mTotal = len;
00163 }
00164
00165 bool CharFreq::isEightBitData() const
00166 {
00167 return type() == EightBitData;
00168 }
00169
00170 bool CharFreq::isEightBitText() const
00171 {
00172 return type() == EightBitText;
00173 }
00174
00175 bool CharFreq::isSevenBitData() const
00176 {
00177 return type() == SevenBitData;
00178 }
00179
00180 bool CharFreq::isSevenBitText() const
00181 {
00182 return type() == SevenBitText;
00183 }
00184
00185 bool CharFreq::hasTrailingWhitespace() const
00186 {
00187 return mTrailingWS;
00188 }
00189
00190 bool CharFreq::hasLeadingFrom() const
00191 {
00192 return mLeadingFrom;
00193 }
00194
00195 CharFreq::Type CharFreq::type() const
00196 {
00197 #if 0
00198 qDebug( "Total: %d; NUL: %d; CTL: %d;\n"
00199 "CR: %d; LF: %d; CRLF: %d;\n"
00200 "lineMin: %d; lineMax: %d;\n"
00201 "printable: %d; eightBit: %d;\n"
00202 "trailing whitespace: %s;\n"
00203 "leading 'From ': %s;\n",
00204 total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
00205 printable, eightBit,
00206 mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
00207 #endif
00208 if ( mNUL ) {
00209 return Binary;
00210 }
00211
00212
00213 if ( mEightBit ) {
00214 if ( mLineMax > 988 ) {
00215 return EightBitData;
00216 }
00217 if ( mCR != mCRLF || controlCodesRatio() > 0.2 ) {
00218 return EightBitData;
00219 }
00220 return EightBitText;
00221 }
00222
00223
00224 if ( mLineMax > 988 ) {
00225 return SevenBitData;
00226 }
00227 if ( mCR != mCRLF || controlCodesRatio() > 0.2 ) {
00228 return SevenBitData;
00229 }
00230
00231
00232 return SevenBitText;
00233 }
00234
00235 float CharFreq::printableRatio() const
00236 {
00237 if ( mTotal ) {
00238 return float(mPrintable) / float(mTotal);
00239 } else {
00240 return 0;
00241 }
00242 }
00243
00244 float CharFreq::controlCodesRatio() const
00245 {
00246 if ( mTotal ) {
00247 return float(mCTL) / float(mTotal);
00248 } else {
00249 return 0;
00250 }
00251 }
00252