|
26 | 26 | #include "util/rsthreads.h" |
27 | 27 | #include "util/rsdebuglevel0.h" |
28 | 28 |
|
namespace DeepSearch
{

/**
 * @brief Extract the visible text from a simple HTML document.
 *
 * The input is returned unchanged when it is empty, when it does not both
 * start with '<' and end with '>' (it is then considered plain text), or when
 * no complete `<body ...>` opening tag can be found.
 * Otherwise everything up to and including the `<body ...>` opening tag is
 * discarded, the first `<style ...>...</style>` span of the remainder is
 * removed together with the CSS it contains, and finally every `<...>` tag is
 * stripped.
 *
 * @param rsHtmlDoc document to process, either plain text or HTML
 * @return the text with markup removed, or a copy of the input when it is not
 *         recognised as an HTML document
 */
std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc)
{
	if(rsHtmlDoc.empty()) return rsHtmlDoc;

	// Anything not enclosed in '<' ... '>' is treated as already plain text
	if(rsHtmlDoc.front() != '<' || rsHtmlDoc.back() != '>') return rsHtmlDoc;

	const auto bodyTagBegin = rsHtmlDoc.find("<body");
	if(bodyTagBegin == std::string::npos) return rsHtmlDoc;

	const auto bodyTagEnd = rsHtmlDoc.find('>', bodyTagBegin);
	if(bodyTagEnd == std::string::npos) return rsHtmlDoc;

	std::string body(rsHtmlDoc.substr(bodyTagEnd + 1));

	// Strip also CSS inside <style></style> (only the first such block)
	const auto styleTagBegin = body.find("<style");
	if(styleTagBegin != std::string::npos)
	{
		const std::string styleClose("</style>");
		const auto styleEnd = body.find(styleClose, styleTagBegin);
		if(styleEnd != std::string::npos)
			body.erase( styleTagBegin,
			            styleEnd + styleClose.size() - styleTagBegin );
	}

	/* Single left-to-right pass copying only the text found between tags.
	 * Each '<' is paired with the first '>' located AFTER it, so a literal
	 * '>' appearing earlier in the text can no longer make the erased range
	 * wrap around and swallow the rest of the document.
	 * The pass is linear and always advances, hence no iteration cap is
	 * needed to protect against crafted input. */
	std::string retVal;
	retVal.reserve(body.size());

	std::string::size_type pos = 0;
	while(pos < body.size())
	{
		const auto oPos = body.find('<', pos);
		if(oPos == std::string::npos)
		{
			// No more tags, keep the remaining text
			retVal.append(body, pos, std::string::npos);
			break;
		}

		retVal.append(body, pos, oPos - pos);

		const auto cPos = body.find('>', oPos);
		if(cPos == std::string::npos)
		{
			// Unterminated tag, keep the remainder untouched
			retVal.append(body, oPos, std::string::npos);
			break;
		}

		pos = cPos + 1;
	}

	return retVal;
}

}
| 82 | + |
| 83 | +// Xapian-specific code (only for channels/files indexing) |
| 84 | +#if defined(RS_DEEP_CHANNEL_INDEX) || defined(RS_DEEP_FILES_INDEX) |
| 85 | + |
29 | 86 | #ifndef XAPIAN_AT_LEAST |
30 | 87 | /// Added in Xapian 1.4.2. |
31 | 88 | #define XAPIAN_AT_LEAST(A,B,C) \ |
@@ -168,53 +225,6 @@ std::error_condition StubbornWriteOpQueue::flush( |
168 | 225 | return std::error_condition(); |
169 | 226 | } |
170 | 227 |
|
// Previous location of simpleTextHtmlExtract, shown here as deleted ('-') diff lines.
// Returns the input unchanged when it is empty, when it does not both start
// with '<' and end with '>', or when no "<body" tag followed by '>' is found.
// Otherwise it keeps what follows the body opening tag, erases the first
// "<style" ... "</style>" span, then repeatedly erases "<" ... ">" spans,
// giving up with an RS_WARN after roughly a thousand erasures.
// NOTE(review): inside the loop find(">") searches from the beginning of
// retVal rather than from oPos; when a '>' precedes the first '<' the length
// 1+cPos-oPos wraps around and erase() removes everything from oPos to the end.
171 | | -std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc) |
172 | | -{ |
173 | | - if(rsHtmlDoc.empty()) return rsHtmlDoc; |
174 | | - |
175 | | - const bool isPlainMsg = |
176 | | - rsHtmlDoc[0] != '<' || rsHtmlDoc[rsHtmlDoc.size() - 1] != '>'; |
177 | | - if(isPlainMsg) return rsHtmlDoc; |
178 | | - |
179 | | - auto oSize = rsHtmlDoc.size(); |
180 | | - auto bodyTagBegin(rsHtmlDoc.find("<body")); |
181 | | - if(bodyTagBegin >= oSize) return rsHtmlDoc; |
182 | | - |
183 | | - auto bodyTagEnd(rsHtmlDoc.find(">", bodyTagBegin)); |
184 | | - if(bodyTagEnd >= oSize) return rsHtmlDoc; |
185 | | - |
186 | | - std::string retVal(rsHtmlDoc.substr(bodyTagEnd+1)); |
187 | | - |
188 | | - // strip also CSS inside <style></style> |
189 | | - oSize = retVal.size(); |
190 | | - auto styleTagBegin(retVal.find("<style")); |
191 | | - if(styleTagBegin < oSize) |
192 | | - { |
193 | | - auto styleEnd(retVal.find("</style>", styleTagBegin)); |
194 | | - if(styleEnd < oSize) |
195 | | - retVal.erase(styleTagBegin, 8+styleEnd-styleTagBegin); |
196 | | - } |
197 | | - |
198 | | - std::string::size_type oPos; |
199 | | - std::string::size_type cPos; |
200 | | - int itCount = 0; |
201 | | - while((oPos = retVal.find("<")) < retVal.size()) |
202 | | - { |
203 | | - if((cPos = retVal.find(">")) <= retVal.size()) |
204 | | - retVal.erase(oPos, 1+cPos-oPos); |
205 | | - else break; |
206 | | - |
207 | | - // Avoid infinite loop with crafty input |
208 | | - if(itCount > 1000) |
209 | | - { |
210 | | - RS_WARN( "Breaking stripping loop due to max allowed iterations ", |
211 | | - "rsHtmlDoc: ", rsHtmlDoc, " retVal: ", retVal ); |
212 | | - break; |
213 | | - } |
214 | | - ++itCount; |
215 | | - } |
216 | | - |
217 | | - return retVal; |
218 | 228 | } |
219 | 229 |
|
220 | | -} |
| 230 | +#endif // RS_DEEP_CHANNEL_INDEX || RS_DEEP_FILES_INDEX |
0 commit comments