kodi
CharsetDetection.h
1 /*
2  * Copyright (C) 2013-2018 Team Kodi
3  * This file is part of Kodi - https://kodi.tv
4  *
5  * SPDX-License-Identifier: GPL-2.0-or-later
6  * See LICENSES/README.md for more information.
7  */
8 
9 #pragma once
10 
11 #include <string>
12 
13 
15 {
16 public:
/**
 * Detect text encoding by Byte Order Mark.
 * Declared here only; the implementation lives in CharsetDetection.cpp.
 * @param content       pointer to the text to analyze
 * @param contentLength length of the text (presumably counted in bytes -- TODO confirm)
 * @return a std::string describing the detection result; the exact values are
 *         chosen by the implementation and are not visible in this header
 */
24  static std::string GetBomEncoding(const char* const content, const size_t contentLength);
31  static inline std::string GetBomEncoding(const std::string& content)
32  { return GetBomEncoding(content.c_str(), content.length()); }
33 
34  static inline bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
35  { return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding); }
36 
/**
 * Pointer/length form of DetectXmlEncoding; the std::string overload forwards to it.
 * Declared here only -- the implementation is not part of this header.
 * @param xmlContent            pointer to the XML text
 * @param contentLength         length of the buffer at xmlContent
 * @param[out] detectedEncoding non-const reference; presumably receives the detected
 *                              encoding name -- TODO confirm against the implementation
 * @return bool status; presumably true when an encoding was detected -- TODO confirm
 */
37  static bool DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding);
38 
46  static inline bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset = "")
47  {
48  std::string usedHtmlCharset;
49  return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, usedHtmlCharset);
50  }
/**
 * Four-argument form of ConvertHtmlToUtf8; the three-argument inline overload
 * delegates to it. Declared here only -- the implementation is not part of this header.
 * @param htmlContent           HTML text to convert
 * @param[out] converted        non-const reference; presumably receives the UTF-8 result -- TODO confirm
 * @param serverReportedCharset charset hint supplied by the caller
 * @param[out] usedHtmlCharset  non-const reference; presumably receives the charset that was
 *                              actually used for the conversion -- TODO confirm
 * @return bool status; presumably true on successful conversion -- TODO confirm
 */
59  static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset);
60 
/**
 * Try to convert plain text to UTF-8 using the best suitable charset.
 * Declared here only; the implementation lives in CharsetDetection.cpp.
 * @param textContent           text to convert
 * @param[out] converted        non-const reference; presumably receives the UTF-8 result -- TODO confirm
 * @param serverReportedCharset charset hint supplied by the caller
 * @param[out] usedCharset      non-const reference; presumably receives the charset that was
 *                              actually used -- TODO confirm
 * @return bool status; presumably true on successful conversion -- TODO confirm
 */
69  static bool ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset);
70 
71 private:
// Private XML helpers. Both take a raw character buffer plus its length and
// report through a std::string output reference; both return bool.
// NOTE(review): judging by the names only, the first reads the encoding from the
// XML declaration and the second guesses it from the content -- confirm against
// the implementation, which is not visible in this header.
72  static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding);
81  static bool GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding);
82 
// Private HTML helpers operating on std::string content.
// GetHtmlAttribute takes a start position and returns a size_t, filling the
// attribute name and value output references; ExtractEncodingFromHtmlMeta
// starts at position 0 unless the caller passes another offset.
// NOTE(review): the meaning of the returned size_t and of the returned strings
// is not visible in this header -- confirm against the implementation.
83  static std::string GetHtmlEncodingFromHead(const std::string& htmlContent);
84  static size_t GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& atrName, std::string& strValue);
85  static std::string ExtractEncodingFromHtmlMeta(const std::string& metaContent, size_t pos = 0);
86 
// Private conversion helpers.
// checkConversion takes a source charset name and source text and writes to dst;
// appendCharAsAsciiUpperCase modifies str in place using the single character chr.
// NOTE(review): presumably checkConversion returns true when src converts cleanly
// from srcCharset, and the second helper appends an upper-cased chr -- both
// inferred from names only, confirm against the implementation.
87  static bool checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst);
88  static void appendCharAsAsciiUpperCase(std::string& str, const char chr);
89 
// Static constant data members. They are only declared here (no in-class
// initializer), so their values come from an out-of-line definition that is
// not visible in this header.
90  static const size_t m_XmlDeclarationMaxLength;
91  static const size_t m_HtmlCharsetEndSearchPos;
92 
93  static const std::string m_HtmlWhitespaceChars;
94 };
static bool ConvertPlainTextToUtf8(const std::string &textContent, std::string &converted, const std::string &serverReportedCharset, std::string &usedCharset)
Try to convert plain text to UTF-8 using the most suitable charset.
Definition: CharsetDetection.cpp:349
static std::string GetBomEncoding(const std::string &content)
Detect text encoding by Byte Order Mark. Multibyte encodings (UTF-16/32) always end with explicit end...
Definition: CharsetDetection.h:31
static std::string GetBomEncoding(const char *const content, const size_t contentLength)
Detect text encoding by Byte Order Mark. Multibyte encodings (UTF-16/32) always end with explicit end...
Definition: CharsetDetection.cpp:32
Definition: LibInputPointer.h:13
static bool ConvertHtmlToUtf8(const std::string &htmlContent, std::string &converted, const std::string &serverReportedCharset="")
Detect the HTML charset and convert the HTML to UTF-8.
Definition: CharsetDetection.h:46
Definition: CharsetDetection.h:14