liblcf
reader_util.cpp
Go to the documentation of this file.
1 /*
2  * This file is part of liblcf. Copyright (c) 2021 liblcf authors.
3  * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4  *
5  * liblcf is Free/Libre Open Source Software, released under the MIT License.
6  * For the full copyright and license information, please view the COPYING
7  * file that was distributed with this source code.
8  */
9 
10 #include "lcf/config.h"
11 #include "lcf/scope_guard.h"
12 
13 #if LCF_SUPPORT_ICU
14 # include <unicode/ucsdet.h>
15 # include <unicode/ucnv.h>
16 # include <unicode/normalizer2.h>
17 # include <unicode/unistr.h>
18 #else
19 # ifdef _MSC_VER
20 # error MSVC builds require ICU
21 # endif
22 #endif
23 
24 #ifdef _WIN32
25 # include <windows.h>
26 #else
27 # if !LCF_SUPPORT_ICU
28 # include <iconv.h>
29 # endif
30 # include <locale>
31 #endif
32 
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
38 
39 #include "lcf/inireader.h"
40 #include "lcf/ldb/reader.h"
41 #include "lcf/reader_util.h"
42 
43 namespace lcf {
44 
45 namespace ReaderUtil {
46 }
47 
48 std::string ReaderUtil::CodepageToEncoding(int codepage) {
49  if (codepage == 0)
50  return std::string();
51 
52  if (codepage == 932) {
53 #if LCF_SUPPORT_ICU
54  return "ibm-943_P15A-2003";
55 #else
56  return "SHIFT_JIS";
57 #endif
58  }
59  if (codepage == 949) {
60 #if LCF_SUPPORT_ICU
61  return "windows-949-2000";
62 #else
63  return "cp949";
64 #endif
65  }
66  std::ostringstream out;
67 #if LCF_SUPPORT_ICU
68  out << "windows-" << codepage;
69 #else
70  out << "CP" << codepage;
71 #endif
72 
73  // Looks like a valid codepage
74  std::string outs = out.str();
75  return outs;
76 }
77 
78 std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
79  std::vector<std::string> encodings = DetectEncodings(db);
80 
81  if (encodings.empty()) {
82  return "";
83  }
84 
85  return encodings.front();
86 }
87 
88 std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
89 #if LCF_SUPPORT_ICU
90  std::ostringstream text;
91 
92  auto append = [](const auto& s) {
93  return ToString(s) + " ";
94  };
95 
96  lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto& ctx) {
97  text << append(val);
98  });
99 
100  // Cannot use ForEachString here for Terms:
101  // Too much untranslated garbage data in there, even in default database
102  for (const auto& s: {
103  db.terms.menu_save,
104  db.terms.menu_quit,
105  db.terms.new_game,
106  db.terms.load_game,
107  db.terms.exit_game,
108  db.terms.status,
109  db.terms.row,
110  db.terms.order,
111  db.terms.wait_on,
112  db.terms.wait_off,
113  db.terms.level,
114  db.terms.health_points,
115  db.terms.spirit_points,
116  db.terms.normal_status,
117  db.terms.sp_cost,
118  db.terms.attack,
119  db.terms.defense,
120  db.terms.spirit,
121  db.terms.agility,
122  db.terms.weapon,
123  db.terms.shield,
124  db.terms.armor,
125  db.terms.helmet,
126  db.terms.accessory,
127  db.terms.save_game_message,
128  db.terms.load_game_message,
129  db.terms.exit_game_message,
130  db.terms.file,
131  db.terms.yes,
132  db.terms.no
133  }) {
134  text << append(s);
135  }
136 
137  return ReaderUtil::DetectEncodings(text.str());
138 #else
139  return std::vector<std::string>();
140 #endif
141 }
142 
143 std::string ReaderUtil::DetectEncoding(StringView string) {
144  std::vector<std::string> encodings = DetectEncodings(string);
145 
146  if (encodings.empty()) {
147  return "";
148  }
149 
150  return encodings.front();
151 }
152 
153 std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
154 std::vector<std::string> encodings;
155 #if LCF_SUPPORT_ICU
156  if (!string.empty()) {
157  UErrorCode status = U_ZERO_ERROR;
158  UCharsetDetector* detector = ucsdet_open(&status);
159 
160  auto s = std::string(string);
161  ucsdet_setText(detector, s.c_str(), s.length(), &status);
162 
163  int32_t matches_count;
164  const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
165 
166  if (matches != nullptr) {
167  // Collect all candidates, most confident comes first
168  for (int i = 0; i < matches_count; ++i) {
169  std::string encoding = ucsdet_getName(matches[i], &status);
170 
171  // Fixes to ensure proper Windows encodings
172  if (encoding == "Shift_JIS") {
173  encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
174  } else if (encoding == "EUC-KR") {
175  encodings.emplace_back("windows-949-2000"); // Korean with \ as backlash
176  } else if (encoding == "GB18030") {
177  encodings.emplace_back("windows-936-2000"); // Simplified Chinese
178  } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
179  encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
180  } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
181  encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
182  } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
183  encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
184  } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
185  encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
186  } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
187  encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
188  } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
189  encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
190  } else {
191  encodings.push_back(encoding);
192  }
193  }
194  }
195  ucsdet_close(detector);
196  }
197 #endif
198 
199  return encodings;
200 }
201 
202 std::string ReaderUtil::GetEncoding(StringView ini_file) {
203  INIReader ini(ToString(ini_file));
204  if (ini.ParseError() != -1) {
205  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
206  if (!encoding.empty()) {
207  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
208  }
209  }
210  return std::string();
211 }
212 
213 std::string ReaderUtil::GetEncoding(std::istream& filestream) {
214  INIReader ini(filestream);
215  if (ini.ParseError() != -1) {
216  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
217  if (!encoding.empty()) {
218  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
219  }
220  }
221  return std::string();
222 }
223 
224 std::string ReaderUtil::GetLocaleEncoding() {
225 #ifdef _WIN32
226  int codepage = GetACP();
227 #elif __ANDROID__
228  // No std::locale support in NDK
229  // Doesn't really matter because the Android version auto-detects via ICU
230  int codepage = 1252;
231 #else
232  int codepage = 1252;
233 
234  std::locale loc = std::locale("");
235  // Gets the language and culture part only
236  std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
237  // Gets the language part only
238  std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
239 
240  if (loc_lang == "th") codepage = 874;
241  else if (loc_lang == "ja") codepage = 932;
242  else if (loc_full == "zh_CN" ||
243  loc_full == "zh_SG") codepage = 936;
244  else if (loc_lang == "ko") codepage = 949;
245  else if (loc_full == "zh_TW" ||
246  loc_full == "zh_HK") codepage = 950;
247  else if (loc_lang == "cs" ||
248  loc_lang == "hu" ||
249  loc_lang == "pl" ||
250  loc_lang == "ro" ||
251  loc_lang == "hr" ||
252  loc_lang == "sk" ||
253  loc_lang == "sl") codepage = 1250;
254  else if (loc_lang == "ru") codepage = 1251;
255  else if (loc_lang == "ca" ||
256  loc_lang == "da" ||
257  loc_lang == "de" ||
258  loc_lang == "en" ||
259  loc_lang == "es" ||
260  loc_lang == "fi" ||
261  loc_lang == "fr" ||
262  loc_lang == "it" ||
263  loc_lang == "nl" ||
264  loc_lang == "nb" ||
265  loc_lang == "pt" ||
266  loc_lang == "sv" ||
267  loc_lang == "eu") codepage = 1252;
268  else if (loc_lang == "el") codepage = 1253;
269  else if (loc_lang == "tr") codepage = 1254;
270  else if (loc_lang == "he") codepage = 1255;
271  else if (loc_lang == "ar") codepage = 1256;
272  else if (loc_lang == "et" ||
273  loc_lang == "lt" ||
274  loc_lang == "lv") codepage = 1257;
275  else if (loc_lang == "vi") codepage = 1258;
276 #endif
277 
278  return CodepageToEncoding(codepage);
279 }
280 
281 std::string ReaderUtil::Recode(StringView str_to_encode, StringView source_encoding) {
282  return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
283 }
284 
285 std::string ReaderUtil::Recode(StringView str_to_encode,
286  StringView src_enc,
287  StringView dst_enc) {
288 
289  if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
290  return ToString(str_to_encode);
291  }
292 
293  auto src_cp = SvAtoi(src_enc);
294  const auto& src_enc_str = src_cp > 0
295  ? ReaderUtil::CodepageToEncoding(src_cp)
296  : ToString(src_enc);
297 
298  auto dst_cp = SvAtoi(dst_enc);
299  const auto& dst_enc_str = dst_cp > 0
300  ? ReaderUtil::CodepageToEncoding(dst_cp)
301  : ToString(dst_enc);
302 
303 #if LCF_SUPPORT_ICU
304  auto status = U_ZERO_ERROR;
305  auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
306 
307  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
308  fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
309  return std::string();
310  }
311  status = U_ZERO_ERROR;
312  auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
313 
314  auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
315 
316  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
317  fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
318  return std::string();
319  }
320  auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
321  status = U_ZERO_ERROR;
322 
323  std::string result(str_to_encode.size() * 4, '\0');
324  auto* src = str_to_encode.data();
325  auto* dst = &result.front();
326 
327  ucnv_convertEx(conv_to, conv_from,
328  &dst, dst + result.size(),
329  &src, src + str_to_encode.size(),
330  nullptr, nullptr, nullptr, nullptr,
331  true, true,
332  &status);
333 
334  if (U_FAILURE(status)) {
335  fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%.*s\": %s\n", (int)str_to_encode.length(), str_to_encode.data(), u_errorName(status));
336  return std::string();
337  }
338 
339  result.resize(dst - result.c_str());
340  result.shrink_to_fit();
341 
342  return result;
343 #else
344  iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
345  if (cd == (iconv_t)-1)
346  return ToString(str_to_encode);
347  char *src = const_cast<char *>(str_to_encode.data());
348  size_t src_left = str_to_encode.size();
349  size_t dst_size = str_to_encode.size() * 5 + 10;
350  char *dst = new char[dst_size];
351  size_t dst_left = dst_size;
352 # ifdef ICONV_CONST
353  char ICONV_CONST *p = src;
354 # else
355  char *p = src;
356 # endif
357  char *q = dst;
358  size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
359  iconv_close(cd);
360  if (status == (size_t) -1 || src_left > 0) {
361  delete[] dst;
362  return std::string();
363  }
364  *q++ = '\0';
365  std::string result(dst);
366  delete[] dst;
367  return result;
368 #endif
369 }
370 
371 std::string ReaderUtil::Normalize(StringView str) {
372 #if LCF_SUPPORT_ICU
373  icu::UnicodeString uni = icu::UnicodeString(str.data(), str.length(), "utf-8").toLower();
374  UErrorCode err = U_ZERO_ERROR;
375  std::string res;
376  const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
377  if (U_FAILURE(err)) {
378  static bool err_reported = false;
379  if (!err_reported) {
380  fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
381  err_reported = true;
382  }
383  uni.toUTF8String(res);
384  return res;
385  }
386  icu::UnicodeString f = norm->normalize(uni, err);
387  if (U_FAILURE(err)) {
388  uni.toUTF8String(res);
389  } else {
390  f.toUTF8String(res);
391  }
392  return res;
393 #else
394  auto result = std::string(str);
395  std::transform(result.begin(), result.end(), result.begin(), tolower);
396  return result;
397 #endif
398 }
399 
400 } //namespace lcf
Definition: dbarray.cpp:13