% \iffalse meta-comment % %% File: l3str-convert.dtx % % Copyright (C) 2013-2024 The LaTeX Project % % It may be distributed and/or modified under the conditions of the % LaTeX Project Public License (LPPL), either version 1.3c of this % license or (at your option) any later version. The latest version % of this license is in the file % % https://www.latex-project.org/lppl.txt % % This file is part of the "l3kernel bundle" (The Work in LPPL) % and all files in that bundle must be distributed together. % % ----------------------------------------------------------------------- % % The development version of the bundle can be found at % % https://github.com/latex3/latex3 % % for those people who are interested. % %<*driver> \documentclass[full,kernel]{l3doc} \begin{document} \DocInput{\jobname.dtx} \end{document} % % \fi % % % \title{^^A % The \pkg{l3str-convert} module\\ String encoding conversions^^A % } % % \author{^^A % The \LaTeX{} Project\thanks % {^^A % E-mail: % \href{mailto:latex-team@latex-project.org} % {latex-team@latex-project.org}^^A % }^^A % } % % \date{Released 2024-12-09} % % \maketitle % % \begin{documentation} % % \section{Encoding and escaping schemes} % % Traditionally, string encodings only specify how strings of characters % should be stored as bytes. However, the resulting lists of bytes are % often to be used in contexts where only a restricted subset of bytes % are permitted (\emph{e.g.}, \textsc{pdf} string objects, % \textsc{url}s). Hence, storing a string of characters is done in two % steps. % \begin{itemize} % \item The code points (\enquote{character codes}) are expressed as % bytes following a given \enquote{encoding}. This can be % \textsc{utf-16}, \textsc{iso 8859-1}, \emph{etc.} See % Table~\ref{tab:encodings} for a list of supported % encodings.\footnote{Encodings and escapings will be added as they % are requested.} % \item Bytes are translated to \TeX{} tokens through a given % \enquote{escaping}. 
Those are defined for the most part by the % \texttt{pdf} file format. See Table~\ref{tab:escapings} for a % list of escaping methods supported.\footnotemark[\csname c@footnote\endcsname] % \end{itemize} % % \begin{table}\centering % \caption{\label{tab:encodings}Supported encodings. % Non-alphanumeric characters are ignored, % and capital letters are lower-cased % before searching for the encoding in this list.} % \begin{tabular}{cc} % \toprule % \meta{Encoding} & description \\ % \midrule % \texttt{utf8} & \textsc{utf-8} \\ % \texttt{utf16} & \textsc{utf-16}, with byte-order mark \\ % \texttt{utf16be} & \textsc{utf-16}, big-endian \\ % \texttt{utf16le} & \textsc{utf-16}, little-endian \\ % \texttt{utf32} & \textsc{utf-32}, with byte-order mark \\ % \texttt{utf32be} & \textsc{utf-32}, big-endian \\ % \texttt{utf32le} & \textsc{utf-32}, little-endian \\ % \midrule % \texttt{iso88591}, \texttt{latin1} & \textsc{iso 8859-1} \\ % \texttt{iso88592}, \texttt{latin2} & \textsc{iso 8859-2} \\ % \texttt{iso88593}, \texttt{latin3} & \textsc{iso 8859-3} \\ % \texttt{iso88594}, \texttt{latin4} & \textsc{iso 8859-4} \\ % \texttt{iso88595} & \textsc{iso 8859-5} \\ % \texttt{iso88596} & \textsc{iso 8859-6} \\ % \texttt{iso88597} & \textsc{iso 8859-7} \\ % \texttt{iso88598} & \textsc{iso 8859-8} \\ % \texttt{iso88599}, \texttt{latin5} & \textsc{iso 8859-9} \\ % \texttt{iso885910}, \texttt{latin6} & \textsc{iso 8859-10} \\ % \texttt{iso885911} & \textsc{iso 8859-11} \\ % \texttt{iso885913}, \texttt{latin7} & \textsc{iso 8859-13} \\ % \texttt{iso885914}, \texttt{latin8} & \textsc{iso 8859-14} \\ % \texttt{iso885915}, \texttt{latin9} & \textsc{iso 8859-15} \\ % \texttt{iso885916}, \texttt{latin10} & \textsc{iso 8859-16} \\ % \midrule % \texttt{clist} & comma-list of integers \\ % \meta{empty} & native (Unicode) string \\ % \texttt{default} & like \texttt{utf8} with 8-bit engines, % and like native with unicode-engines \\ % \bottomrule % \end{tabular} % \end{table} % % 
\begin{table}\centering % \caption{\label{tab:escapings}Supported escapings. % Non-alphanumeric characters are ignored, % and capital letters are lower-cased % before searching for the escaping in this list.} % \begin{tabular}{cc} % \toprule % \meta{Escaping} & description \\ % \midrule % \texttt{bytes}, or empty % & arbitrary bytes \\ % \texttt{hex}, \texttt{hexadecimal} % & byte $=$ two hexadecimal digits \\ % \texttt{name} % & see \tn{pdfescapename} \\ % \texttt{string} % & see \tn{pdfescapestring} \\ % \texttt{url} % & encoding used in \textsc{url}s \\ % \bottomrule % \end{tabular} % \end{table} % % \section{Conversion functions} % % \begin{function}{\str_set_convert:Nnnn, \str_gset_convert:Nnnn} % \begin{syntax} % \cs{str_set_convert:Nnnn} \meta{str~var} \Arg{string} \Arg{name_1} \Arg{name_2} % \end{syntax} % This function converts the \meta{string} from the encoding given by % \meta{name_1} to the encoding given by \meta{name_2}, and stores the % result in the \meta{str~var}. Each \meta{name} can have the form % \meta{encoding} or \meta{encoding}\texttt{/}\meta{escaping}, where % the possible values of \meta{encoding} and \meta{escaping} are given % in Tables~\ref{tab:encodings} and~\ref{tab:escapings}, respectively. % The default escaping is to input and output bytes directly. The % special case of an empty \meta{name} indicates the use of % \enquote{native} strings, 8-bit for \pdfTeX{}, and Unicode strings % for the other two engines. % % For example, % \begin{verbatim} % \str_set_convert:Nnnn \l_foo_str { Hello! } { } { utf16/hex } % \end{verbatim} % results in the variable \cs[no-index]{l_foo_str} holding the string % \texttt{FEFF00480065006C006C006F0021}. This is obtained by % converting each character in the (native) string \texttt{Hello!} to % the \textsc{utf-16} encoding, and expressing each byte as a pair of % hexadecimal digits. 
%   Note the presence of a (big-endian) byte order
%   mark \hexnum{FEFF}, which can be avoided by specifying the encoding
%   \texttt{utf16be/hex}.
%
%   An error is raised if the \meta{string} is not valid according to
%   the \meta{escaping} and \meta{encoding} given by \meta{name_1}, or
%   if it cannot be reencoded in the \meta{encoding} and \meta{escaping}
%   given by \meta{name_2} (for instance, if a character does not exist
%   in the target encoding).
%   Erroneous input is replaced by the Unicode replacement character
%   \hexnum{FFFD}, and characters which cannot be reencoded are replaced
%   by either the replacement character \hexnum{FFFD} if it exists in
%   the target encoding, or an encoding-specific replacement
%   character, or the question mark character.
% \end{function}
%
% \begin{function}[TF]{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
%   \begin{syntax}
%     \cs{str_set_convert:NnnnTF} \meta{str~var} \Arg{string} \Arg{name_1} \Arg{name_2} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   As \cs{str_set_convert:Nnnn}, converts the \meta{string} from the
%   encoding given by \meta{name_1} to the encoding given by
%   \meta{name_2}, and assigns the result to \meta{str~var}. Unlike
%   \cs{str_set_convert:Nnnn}, the conditional variant does not raise
%   errors in case the \meta{string} is not valid according to the
%   \meta{name_1} encoding, or cannot be expressed in the \meta{name_2}
%   encoding. Instead, the \meta{false code} is performed.
% \end{function}
%
% \section{Conversion by expansion (for PDF contexts)}
%
% A small number of expandable functions are provided for use in PDF string/name
% contexts. These \emph{assume UTF-8} and \emph{no escaping} in the input.
%
% \begin{function}[EXP]{\str_convert_pdfname:n}
%   \begin{syntax}
%     \cs{str_convert_pdfname:n} \Arg{string}
%   \end{syntax}
%   As \cs{str_set_convert:Nnnn}, converts the \meta{string} on a byte-by-byte
%   basis with non-ASCII codepoints escaped using hashes.
% \end{function} % % \section{Possibilities, and things to do} % % Encoding/escaping-related tasks. % \begin{itemize} % \item In \XeTeX{}/\LuaTeX{}, would it be better to use the % |^^^^....| approach to build a string from a given list of % character codes? Namely, within a group, assign |0-9a-f| and all % characters we want to category ``other'', then assign~|^| the % category superscript, and use \tn{scantokens}. % \item Change \cs{str_set_convert:Nnnn} to expand its last two % arguments. % \item Describe the internal format in the code comments. Refuse code % points in $[\hexnum{D800}, \hexnum{DFFF}]$ in the internal % representation? % \item Add documentation about each encoding and escaping method, and % add examples. % \item The \texttt{hex} unescaping should raise an error for % odd-token count strings. % \item Decide what bytes should be escaped in the \texttt{url} % escaping. Perhaps the characters |!'()*-./0123456789_| are safe, % and all other characters should be escaped? % \item Automate generation of 8-bit mapping files. % \item Change the framework for 8-bit encodings: for decoding from % 8-bit to Unicode, use $256$ integer registers; for encoding, use a % tree-box. % \item More encodings (see Heiko's \pkg{stringenc}). CESU? % \item More escapings: \textsc{ascii85}, shell escapes, lua escapes, % \emph{etc.}? % \end{itemize} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3str-convert} implementation} % % \begin{macrocode} %<*package> % \end{macrocode} % % \begin{macrocode} %<@@=str> % \end{macrocode} % % \subsection{Helpers} % % \subsubsection{Variables and constants} % % \begin{macro}{\@@_tmp:w} % \begin{variable}{\l_@@_internal_tl} % Internal scratch space for some functions. 
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tmp:w { }
\tl_new:N \l_@@_internal_tl
%    \end{macrocode}
% \end{variable}
% \end{macro}
%
% \begin{variable}{\g_@@_result_tl}
%   The \cs{g_@@_result_tl} variable is used to hold the result of
%   various internal string operations (mostly conversions) which are
%   typically performed in a group. The variable is global so that it
%   remains defined outside the group, to be assigned to a user-provided
%   variable.
%    \begin{macrocode}
\tl_new:N \g_@@_result_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_replacement_char_int}
%   When converting, invalid bytes are replaced by the Unicode
%   replacement character \hexnum{FFFD}.
%    \begin{macrocode}
\int_const:Nn \c_@@_replacement_char_int { "FFFD }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_max_byte_int}
%   The maximal byte number, $255$. The native encoder used with 8-bit
%   engines, \cs{@@_encode_native_char:n}, compares each code point
%   against this constant to detect characters that do not fit in one
%   byte.
%    \begin{macrocode}
\int_const:Nn \c_@@_max_byte_int { 255 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\s_@@}
%   Internal scan mark. It delimits the two parts of each item of the
%   internal representation used between the decoding and encoding
%   steps, namely
%   \meta{bytes} \cs{s_@@} \meta{Unicode code point} \cs{s_@@}.
%    \begin{macrocode}
\scan_new:N \s_@@
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\q_@@_nil}
%   Internal quarks.
%    \begin{macrocode}
\quark_new:N \q_@@_nil
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_alias_prop}
%   To avoid needing one file per encoding/escaping alias, we keep track
%   of those in a property list.
%   The engine-independent aliases are stored in one go from a
%   key--value list: each key is an alias and each value is the name of
%   the encoding or escaping whose definition file provides it.
%    \begin{macrocode}
\prop_new:N \g_@@_alias_prop
\prop_gset_from_keyval:Nn \g_@@_alias_prop
  {
    latin1      = iso88591  ,
    latin2      = iso88592  ,
    latin3      = iso88593  ,
    latin4      = iso88594  ,
    latin5      = iso88599  ,
    latin6      = iso885910 ,
    latin7      = iso885913 ,
    latin8      = iso885914 ,
    latin9      = iso885915 ,
    latin10     = iso885916 ,
    utf16le     = utf16     ,
    utf16be     = utf16     ,
    utf32le     = utf32     ,
    utf32be     = utf32     ,
    hexadecimal = hex
  }
%    \end{macrocode}
%   The \texttt{default} alias depends on the engine: it is the empty
%   (native) encoding with \LuaTeX{} and \XeTeX{}, and \texttt{utf8}
%   with all other engines.
%    \begin{macrocode}
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  { \prop_gput:Nnn \g_@@_alias_prop { default } { } }
  { \prop_gput:Nnn \g_@@_alias_prop { default } { utf8 } }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_error_bool}
%   The conditional conversion functions do not report errors to the
%   user. Instead, \cs{@@_if_flag_no_error:Nne} sets this boolean
%   whenever a flag was raised, and the boolean is tested once the
%   conversion is over to choose which branch of the conditional to
%   take.
%    \begin{macrocode}
\bool_new:N \g_@@_error_bool
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_byte_flag, \l_@@_error_flag}
%   Each step of a conversion from one \meta{encoding}/\meta{escaping}
%   pair to another is carried out inside an expanding assignment, where
%   no message can be issued. Problems met there are recorded by
%   raising the relevant flag, which is examined after the assignment.
%    \begin{macrocode}
\flag_new:N \l_@@_byte_flag
\flag_new:N \l_@@_error_flag
%    \end{macrocode}
% \end{variable}
%
% \subsection{String conditionals}
%
% \begin{macro}[EXP]{\@@_if_contains_char:NnT, \@@_if_contains_char:NnTF}
% \begin{macro}[EXP]{\@@_if_contains_char:nnTF}
% \begin{macro}[EXP]{\@@_if_contains_char_aux:nn,\@@_if_contains_char_auxi:nN}
% \begin{macro}[EXP]{\@@_if_contains_char_true:}
%   \begin{syntax}
%     \cs{@@_if_contains_char:nnTF} \Arg{token list} \meta{char}
%   \end{syntax}
%   Expects the \meta{token list} to be an \meta{other string}: the
%   caller is responsible for ensuring that no (too-)special catcodes
%   remain.
%   Loop over the characters of the string, comparing character codes.
%   The loop is broken if character codes match. Otherwise we return
%   \enquote{false}.
%
%   The \texttt{N}-type variant expands its first argument once before
%   handing over, through \cs{@@_if_contains_char_aux:nn}, to the same
%   loop as the \texttt{n}-type variant. In both cases the list is
%   followed by the item |{ \prg_break:n { ? \fi: } }|. When the loop
%   grabs that item, \cs{prg_break:n} discards the rest of the loop body
%   up to \cs{prg_break_point:} and leaves |? \fi:|, which completes the
%   pending \cs{if_charcode:w} test; \cs{prg_return_false:} is then
%   reached. On a match, \cs{@@_if_contains_char_true:} is inserted
%   after the \cs{fi:}; it breaks out of the loop leaving
%   \cs{prg_return_true:} followed by \cs{use_none:n}, which removes the
%   \cs{prg_return_false:} that follows \cs{prg_break_point:}.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_if_contains_char:Nn #1#2 { T , TF }
  {
    \exp_after:wN \@@_if_contains_char_aux:nn \exp_after:wN {#1} {#2}
      { \prg_break:n { ? \fi: } }
    \prg_break_point:
    \prg_return_false:
  }
\cs_new:Npn \@@_if_contains_char_aux:nn #1#2
  { \@@_if_contains_char_auxi:nN {#2} #1 }
\prg_new_conditional:Npnn \@@_if_contains_char:nn #1#2 { TF }
  {
    \@@_if_contains_char_auxi:nN {#2} #1 { \prg_break:n { ? \fi: } }
    \prg_break_point:
    \prg_return_false:
  }
\cs_new:Npn \@@_if_contains_char_auxi:nN #1#2
  {
    \if_charcode:w #1 #2
      \exp_after:wN \@@_if_contains_char_true:
    \fi:
    \@@_if_contains_char_auxi:nN {#1}
  }
\cs_new:Npn \@@_if_contains_char_true:
  { \prg_break:n { \prg_return_true: \use_none:n } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_octal_use:NTF}
%   \begin{syntax}
%     \cs{@@_octal_use:NTF} \meta{token} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   If the \meta{token} is an octal digit, it is left in the input
%   stream, \emph{followed} by the \meta{true code}. Otherwise, the
%   \meta{false code} is left in the input stream.
%   \begin{texnote}
%     This function will fail if the escape character is an octal
%     digit.
%     We are thus careful to set the escape character to a known
%     value before using it.
%   \end{texnote}
%   \TeX{} dutifully detects octal digits for us: if |#1| is an octal
%   digit, then the right-hand side of the comparison is |'1#1|, greater
%   than $1$. Otherwise, the right-hand side stops as |'1|, and the
%   conditional takes the \texttt{false} branch.
%   In the \texttt{true} branch the digit |#1| is put back in the input
%   stream before \cs{prg_return_true:}, hence ahead of the
%   \meta{true code}.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_octal_use:N #1 { TF }
  {
    \if_int_compare:w 1 < '1 \token_to_str:N #1 \exp_stop_f:
      #1 \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_hexadecimal_use:NTF}
%   \TeX{} detects uppercase hexadecimal digits for us (see
%   \cs{@@_octal_use:NTF}), but not the lowercase letters, which we
%   need to detect and replace by their uppercase counterpart.
%   When the first test fails, the \cs{if_case:w} construction computes
%   the offset of the character code of |#1| from that of the
%   letter~|a|: offsets $0$ to~$5$ leave the matching uppercase letter
%   |A| to~|F| in the input stream, followed by \cs{prg_return_true:}.
%   For any other offset, the \cs{else:} branch performs
%   \cs{prg_return_false:} and uses \cs{use_none:n} to remove the
%   \cs{prg_return_true:} placed after the inner \cs{fi:}.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_hexadecimal_use:N #1 { TF }
  {
    \if_int_compare:w 1 < "1 \token_to_str:N #1 \exp_stop_f:
      #1 \prg_return_true:
    \else:
      \if_case:w \int_eval:n { \exp_after:wN ` \token_to_str:N #1 - `a }
           A
      \or: B
      \or: C
      \or: D
      \or: E
      \or: F
      \else:
        \prg_return_false:
        \exp_after:wN \use_none:n
      \fi:
      \prg_return_true:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Conversions}
%
% \subsubsection{Producing one byte or character}
%
% \begin{variable}{\c_@@_byte_0_tl, \c_@@_byte_1_tl, \c_@@_byte_255_tl}
% \begin{variable}{\c_@@_byte_-1_tl}
%   For each integer $N$ in the range $[0,255]$, we create a constant
%   token list which holds three character tokens with category code
%   other: the character with character code $N$, followed by the
%   representation of $N$ as two hexadecimal digits. The value $-1$ is
%   given a default token list which ensures that later functions give
%   an empty result for the input $-1$.
%   The two nested maps below run over the sixteen uppercase
%   hexadecimal digits, so that |"#1##1| takes every value from $0$
%   to~$255$; the digits themselves are stored after the generated
%   character. For $-1$, the token list consists of an empty brace
%   group followed by |\use_none:n { }|: picking the first item gives
%   nothing, and dropping the first item leaves a construction which
%   expands to nothing.
%    \begin{macrocode}
\group_begin:
  \__kernel_tl_set:Nx \l_@@_internal_tl { \tl_to_str:n { 0123456789ABCDEF } }
  \tl_map_inline:Nn \l_@@_internal_tl
    {
      \tl_map_inline:Nn \l_@@_internal_tl
        {
          \tl_const:ce { c_@@_byte_ \int_eval:n {"#1##1} _tl }
            { \char_generate:nn { "#1##1 } { 12 } #1 ##1 }
        }
    }
\group_end:
\tl_const:cn { c_@@_byte_-1_tl } { { } \use_none:n { } }
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{macro}[EXP]{\@@_output_byte:n}
% \begin{macro}[EXP]{\@@_output_byte:w}
% \begin{macro}[EXP]{\@@_output_hexadecimal:n}
% \begin{macro}[EXP]{\@@_output_end:}
%   Those functions must be used carefully: feeding them a value outside
%   the range $[-1,255]$ will attempt to use the undefined token list
%   variable \cs{c_@@_byte_\meta{number}_tl}. Assuming that the
%   argument is in the right range, we expand the corresponding token
%   list, and pick either the byte (first token) or the hexadecimal
%   representations (second and third tokens). The value $-1$ produces
%   an empty result in both cases.
%
%   The \texttt{w}-type function \cs{@@_output_byte:w} opens both the
%   \cs{cs:w} construction and an integer expression and leaves them
%   unfinished: the expression given after it must be followed by
%   \cs{@@_output_end:}, which ends the integer expression and supplies
%   the trailing |_tl| and \cs{cs_end:}. \cs{@@_output_byte:n} is simply
%   that combination applied to its argument.
%    \begin{macrocode}
\cs_new:Npn \@@_output_byte:n #1
  { \@@_output_byte:w #1 \@@_output_end: }
\cs_new:Npn \@@_output_byte:w
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \use_i:nnn
    \cs:w c_@@_byte_ \int_eval:w
  }
\cs_new:Npn \@@_output_hexadecimal:n #1
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \use_none:n
    \cs:w c_@@_byte_ \int_eval:n {#1} _tl \cs_end:
  }
\cs_new:Npn \@@_output_end:
  { \scan_stop: _tl \cs_end: }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_output_byte_pair_be:n}
% \begin{macro}[rEXP]{\@@_output_byte_pair_le:n}
% \begin{macro}[rEXP]{\@@_output_byte_pair:nnN}
%   Convert a number in the range $[0,65535]$ to a pair of bytes, either
%   big-endian or little-endian.
%   The high byte is the truncated quotient of |#1| by \hexnum{100},
%   computed once by \texttt{f}-expansion; the low byte is what remains
%   of |#1| after subtracting \hexnum{100} times the high byte. The
%   last argument of \cs{@@_output_byte_pair:nnN} receives the two
%   byte-producing calls (high byte first) and decides in which order
%   they are left in the input stream: \cs{use:nn} for big-endian and
%   \cs{use_ii_i:nn} for little-endian.
%    \begin{macrocode}
\cs_new:Npn \@@_output_byte_pair_be:n #1
  {
    \exp_args:Nf \@@_output_byte_pair:nnN
      { \int_div_truncate:nn { #1 } { "100 } } {#1} \use:nn
  }
\cs_new:Npn \@@_output_byte_pair_le:n #1
  {
    \exp_args:Nf \@@_output_byte_pair:nnN
      { \int_div_truncate:nn { #1 } { "100 } } {#1} \use_ii_i:nn
  }
\cs_new:Npn \@@_output_byte_pair:nnN #1#2#3
  {
    #3
      { \@@_output_byte:n { #1 } }
      { \@@_output_byte:n { #2 - #1 * "100 } }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Mapping functions for conversions}
%
% \begin{macro}{\@@_convert_gmap:N}
% \begin{macro}[rEXP]{\@@_convert_gmap_loop:NN}
%   This maps the function |#1| over all characters in
%   \cs{g_@@_result_tl}, which should be a byte string in most cases,
%   sometimes a native string.
%   The result of the expansion is stored back into
%   \cs{g_@@_result_tl}. The string is followed by the end marker
%   |{ ? \prg_break: }|: for an ordinary character, the leading
%   \cs{use_none:n} in the loop simply removes one copy of that
%   character, while for the end marker it removes the~|?| and lets
%   \cs{prg_break:} end the loop.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_gmap:N #1
  {
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_convert_gmap_loop:NN
        \exp_after:wN #1
          \g_@@_result_tl { ? \prg_break: }
        \prg_break_point:
      }
  }
\cs_new:Npn \@@_convert_gmap_loop:NN #1#2
  {
    \use_none:n #2
    #1#2
    \@@_convert_gmap_loop:NN #1
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_gmap_internal:N}
% \begin{macro}[rEXP]{\@@_convert_gmap_internal_loop:Nww}
%   This maps the function |#1| over all character codes in
%   \cs{g_@@_result_tl}, which must be in the internal representation.
%   Each item of the internal representation is read by the loop as
%   |#2| \cs{s_@@} |#3| \cs{s_@@}, and only the second part~|#3| is
%   passed, braced, to the function~|#1|. The list is followed by
%   |\s_@@ \s_@@_stop \prg_break: \s_@@| so that at the end of the list
%   |#3| is |\s_@@_stop \prg_break:| and the loop is interrupted by
%   \cs{prg_break:}. For ordinary items,
%   \cs{@@_use_none_delimit_by_s_stop:w} (defined outside this part of
%   the file; presumably it discards everything up to the next
%   \cs{s_@@_stop}) removes the copy of~|#3| placed in front of
%   \cs{s_@@_stop}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_gmap_internal:N #1
  {
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_convert_gmap_internal_loop:Nww
        \exp_after:wN #1
          \g_@@_result_tl \s_@@ \s_@@_stop \prg_break: \s_@@
        \prg_break_point:
      }
  }
\cs_new:Npn \@@_convert_gmap_internal_loop:Nww #1 #2 \s_@@ #3 \s_@@
  {
    \@@_use_none_delimit_by_s_stop:w #3 \s_@@_stop
    #1 {#3}
    \@@_convert_gmap_internal_loop:Nww #1
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Error-reporting during conversion}
%
% \begin{macro}{\@@_if_flag_error:Nne}
% \begin{macro}{\@@_if_flag_no_error:Nne}
%   When converting using the function \cs{str_set_convert:Nnnn}, errors
%   should be reported to the user after each step in the
%   conversion. Errors are signalled by raising some flag (typically
%   \texttt{@@_error}), so here we test that flag: if it is raised,
%   give the user an error, otherwise remove the arguments. On the other
%   hand, in the conditional functions \cs{str_set_convert:NnnnTF},
%   errors should be suppressed. This is done by changing
%   \cs{@@_if_flag_error:Nne} into \cs{@@_if_flag_no_error:Nne}
%   locally.
%
%   \cs{@@_if_flag_error:Nne} only grabs the flag~|#1| itself: the
%   message name and the message argument which follow it are either
%   picked up by |\msg_error:nne { str }| or discarded by
%   \cs{use_none:nn}. \cs{@@_if_flag_no_error:Nne} grabs all three
%   arguments, ignores the last two, and sets \cs{g_@@_error_bool} when
%   the flag is raised.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_if_flag_error:Nne #1
  {
    \flag_if_raised:NTF #1
      { \msg_error:nne { str } }
      { \use_none:nn }
  }
\cs_new_protected:Npn \@@_if_flag_no_error:Nne #1#2#3
  { \flag_if_raised:NT #1 { \bool_gset_true:N \g_@@_error_bool } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_if_flag_times:NT}
%   At the end of each conversion step, we raise all relevant errors as
%   one error message, built on the fly. The height of each flag
%   indicates how many times a given error was encountered. This
%   function prints |#2| followed by the number of occurrences of an
%   error if it occurred, nothing otherwise.
%   The count is the height of the flag~|#1|, printed in parentheses
%   after the letter~|x|.
%    \begin{macrocode}
\cs_new:Npn \@@_if_flag_times:NT #1#2
  { \flag_if_raised:NT #1 { #2~(x \flag_height:N #1 ) } }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework for conversions}
%
% Most functions in this module expect to be working with
% \enquote{native} strings. Strings can also be stored as bytes, in one
% of many encodings, for instance \textsc{utf8}. The bytes themselves
% can be expressed in various ways in terms of \TeX{} tokens, for
% instance as pairs of hexadecimal digits. The questions of going from
% arbitrary Unicode code points to bytes, and from bytes to tokens are
% mostly independent.
%
% Conversions are done in four steps:
% \begin{itemize}
%   \item \enquote{unescape} produces a string of bytes;
%   \item \enquote{decode} takes in a string of bytes, and converts it
%     to a list of Unicode characters in an internal representation,
%     with items of the form
%     \begin{quote}
%       \meta{bytes} \cs{s_@@} \meta{Unicode code point} \cs{s_@@}
%     \end{quote}
%     where we have collected the \meta{bytes} which combined to form
%     this particular Unicode character, and the \meta{Unicode code
%     point} is in the range $[0,\hexnum{10FFFF}]$.
%   \item \enquote{encode} encodes the internal list of code points as a
%     byte string in the new encoding;
%   \item \enquote{escape} escapes bytes as requested.
% \end{itemize}
% The process is modified in case one of the encodings is empty (or the
% conversion function has been set equal to the empty encoding because
% it was not found): then the unescape or escape step is ignored, and
% the decode or encode steps work on tokens instead of bytes. Otherwise,
% each step must ensure that it passes a correct byte string or internal
% string to the next step.
%
% \begin{macro}{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
% \begin{macro}[TF]{\str_set_convert:Nnnn, \str_gset_convert:Nnnn}
% \begin{macro}{\@@_convert:nNNnnn}
%   All user-level functions are thin wrappers around
%   \cs{@@_convert:nNNnnn}, whose arguments are: some set-up code run
%   inside the group, the assignment function used at the very end
%   (\cs{tl_set_eq:NN} or \cs{tl_gset_eq:NN}), the user's variable, the
%   string, and the two \meta{encoding}/\meta{escaping} names.
%   The string is first stored in \cs{g_@@_result_tl}, on which every
%   conversion function acts. It is unescaped and decoded according to
%   the first name, then encoded and escaped according to the second
%   name. After the group ends the result is copied to the user's
%   variable.
%
%   The non-conditional functions pass empty set-up code. The
%   conditional ones first reset \cs{g_@@_error_bool}, then use the
%   set-up code to make \cs{@@_if_flag_error:Nne} locally equal to
%   \cs{@@_if_flag_no_error:Nne}, which silences the errors, and finally
%   return \texttt{false} if the boolean was set during the conversion.
%    \begin{macrocode}
\cs_new_protected:Npn \str_set_convert:Nnnn
  { \@@_convert:nNNnnn { } \tl_set_eq:NN }
\cs_new_protected:Npn \str_gset_convert:Nnnn
  { \@@_convert:nNNnnn { } \tl_gset_eq:NN }
\prg_new_protected_conditional:Nnn \str_set_convert:Nnnn { T , F , TF }
  {
    \bool_gset_false:N \g_@@_error_bool
    \@@_convert:nNNnnn
      { \cs_set_eq:NN \@@_if_flag_error:Nne \@@_if_flag_no_error:Nne }
      \tl_set_eq:NN #1 {#2} {#3} {#4}
    \bool_if:NTF \g_@@_error_bool
      { \prg_return_false: }
      { \prg_return_true: }
  }
\prg_new_protected_conditional:Nnn \str_gset_convert:Nnnn { T , F , TF }
  {
    \bool_gset_false:N \g_@@_error_bool
    \@@_convert:nNNnnn
      { \cs_set_eq:NN \@@_if_flag_error:Nne \@@_if_flag_no_error:Nne }
      \tl_gset_eq:NN #1 {#2} {#3} {#4}
    \bool_if:NTF \g_@@_error_bool
      { \prg_return_false: }
      { \prg_return_true: }
  }
\cs_new_protected:Nn \@@_convert:nNNnnn
  {
    \group_begin:
      #1
      \__kernel_tl_gset:Nx \g_@@_result_tl
        { \__kernel_str_to_other_fast:n {#4} }
      \exp_after:wN \@@_convert:wwwnn
        \tl_to_str:n {#5} /// \s_@@_stop
        { decode } { unescape }
        \prg_do_nothing:
        \@@_convert_decode_:
      \exp_after:wN \@@_convert:wwwnn
        \tl_to_str:n {#6} /// \s_@@_stop
        { encode } { escape }
        \use_ii_i:nn
        \@@_convert_encode_:
      \__kernel_tl_gset:Nx \g_@@_result_tl
        { \tl_to_str:N \g_@@_result_tl }
    \group_end:
    #2 #3 \g_@@_result_tl
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert:wwwnn}
% \begin{macro}{\@@_convert:NNnNN}
%   The task of \cs{@@_convert:wwwnn} is to split
%   \meta{encoding}/\meta{escaping} pairs into their components, |#1|
%   and |#2|. Calls to \cs{@@_convert:nnn} ensure that the
%   corresponding conversion functions are defined. The third auxiliary
%   does the main work.
%   \begin{itemize}
%     \item |#1| is the encoding conversion function;
%     \item |#2| is the escaping function;
%     \item |#3| is the escaping name for use in an error message;
%     \item |#4| is \cs{prg_do_nothing:} for unescaping/decoding, and
%       \cs{use_ii_i:nn} for encoding/escaping;
%     \item |#5| is the default encoding function (either
%       \enquote{decode} or \enquote{encode}), for which there should be
%       no escaping.
%   \end{itemize}
%   Let us ignore the native encoding for a second. In the
%   unescaping/decoding phase, we want to do |#2#1| in this order, and
%   in the encoding/escaping phase, the order should be reversed:
%   |#4#2#1| does exactly that. If one of the encodings is the default
%   (native), then the escaping should be ignored, with an error if any
%   was given, and only the encoding, |#1|, should be performed.
%
%   The splitting relies on the three slashes which the caller appends
%   to the name: the encoding name is everything before the first
%   slash, the escaping name is what lies between that slash and the
%   next double slash (hence empty if the name contains no slash), and
%   the leftover slashes up to \cs{s_@@_stop} are discarded as~|#3|.
%   The last two arguments of \cs{@@_convert:wwwnn} are the conversion
%   types, \texttt{decode} and \texttt{unescape}, or \texttt{encode} and
%   \texttt{escape}. The two tokens which the caller places after these
%   arguments become the last two arguments of \cs{@@_convert:NNnNN}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert:wwwnn
    #1 / #2 // #3 \s_@@_stop #4#5
  {
    \@@_convert:nnn {enc} {#4} {#1}
    \@@_convert:nnn {esc} {#5} {#2}
    \exp_args:Ncc \@@_convert:NNnNN
      { @@_convert_#4_#1: } { @@_convert_#5_#2: } {#2}
  }
\cs_new_protected:Npn \@@_convert:NNnNN #1#2#3#4#5
  {
    \if_meaning:w #1 #5
      \tl_if_empty:nF {#3}
        { \msg_error:nne { str } { native-escaping } {#3} }
      #1
    \else:
      #4 #2 #1
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert:nnn}
% \begin{macro}{\@@_convert:nnnn}
%   The arguments of \cs{@@_convert:nnn} are: \texttt{enc} or
%   \texttt{esc}, used to build filenames, the type of the conversion
%   (unescape, decode, encode, escape), and the encoding or escaping
%   name. If the function is already defined, no need to do anything.
%   Otherwise, filter out all non-alphanumerics in the name, and
%   lowercase it. Feed that, and the same three arguments, to
%   \cs{@@_convert:nnnn}.
%   The task is then to make sure that the
%   conversion function |#3_#1| corresponding to the type |#3| and
%   filtered name |#1| is defined, then set our initial conversion
%   function |#3_#4| equal to that.
%
%   How do we get the |#3_#1| conversion to be defined if it isn't?
%   Two main cases.
%
%   First, if |#1| is a key in \cs{g_@@_alias_prop}, then the value
%   \cs{l_@@_internal_tl} tells us what file to load. Loading is
%   skipped if the file was already read, \emph{i.e.}, if the conversion
%   command based on \cs{l_@@_internal_tl} already exists. Otherwise,
%   try to load the file; if that fails, there is an error, use the
%   default empty name instead.
%
%   Second, |#1| may be absent from the property list; in that case
%   \cs{l_@@_internal_tl} is set to |#1| itself. The
%   \cs{cs_if_exist:cF} test is automatically false, and we search for a
%   file defining the encoding or escaping |#1| (this should allow
%   third-party \texttt{.def} files). If the file is not found, there is
%   an error, use the default empty name instead.
%
%   The file searched for is named
%   \texttt{l3str-}|#2|\texttt{-}\meta{name}\texttt{.def}, where |#2| is
%   \texttt{enc} or \texttt{esc} and \meta{name} is the contents of
%   \cs{l_@@_internal_tl}. It is input inside a group in which the
%   category code table \cs{c_code_cctab} is selected.
%
%   In all cases, the conversion based on \cs{l_@@_internal_tl} is
%   defined, so we can set the |#3_#1| function equal to that. In some
%   cases (\emph{e.g.}, \texttt{utf16be}), the |#3_#1| function is
%   actually defined within the file we just loaded, and it is different
%   from the \cs{l_@@_internal_tl}-based function: we mustn't clobber
%   that different definition.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert:nnn #1#2#3
  {
    \cs_if_exist:cF { @@_convert_#2_#3: }
      {
        \exp_args:Ne \@@_convert:nnnn
          { \@@_convert_lowercase_alphanum:n {#3} }
          {#1} {#2} {#3}
      }
  }
\cs_new_protected:Npn \@@_convert:nnnn #1#2#3#4
  {
    \cs_if_exist:cF { @@_convert_#3_#1: }
      {
        \prop_get:NnNF \g_@@_alias_prop {#1} \l_@@_internal_tl
          { \tl_set:Nn \l_@@_internal_tl {#1} }
        \cs_if_exist:cF { @@_convert_#3_ \l_@@_internal_tl : }
          {
            \file_if_exist:nTF { l3str-#2- \l_@@_internal_tl .def }
              {
                \group_begin:
                  \cctab_select:N \c_code_cctab
                  \file_input:n { l3str-#2- \l_@@_internal_tl .def }
                \group_end:
              }
              {
                \tl_clear:N \l_@@_internal_tl
                \msg_error:nnee { str } { unknown-#2 } {#4} {#1}
              }
          }
        \cs_if_exist:cF { @@_convert_#3_#1: }
          {
            \cs_gset_eq:cc { @@_convert_#3_#1: }
              { @@_convert_#3_ \l_@@_internal_tl : }
          }
      }
    \cs_gset_eq:cc { @@_convert_#3_#4: } { @@_convert_#3_#1: }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_convert_lowercase_alphanum:n}
% \begin{macro}[rEXP]{\@@_convert_lowercase_alphanum_loop:N}
%   This function keeps only letters and digits, with upper case letters
%   converted to lower case.
%   The argument is first turned into a string, and the loop then looks
%   at one character at a time. Characters whose code is above that
%   of~|Z| are kept only if they lie between |a| and~|z|. Among the
%   others, those below~|A| are kept only if they are digits (the test
%   |1 < 1#1| can only be true when |#1| continues the number), and the
%   remaining ones, from |A| to~|Z|, are replaced by the character whose
%   code is shifted by the distance between |A| and~|a|, produced by
%   \cs{@@_output_byte:n}. The end marker |{ ? \prg_break: }| stops the
%   loop in the same way as in \cs{@@_convert_gmap_loop:NN}.
%    \begin{macrocode}
\cs_new:Npn \@@_convert_lowercase_alphanum:n #1
  {
    \exp_after:wN \@@_convert_lowercase_alphanum_loop:N
      \tl_to_str:n {#1} { ? \prg_break: }
    \prg_break_point:
  }
\cs_new:Npn \@@_convert_lowercase_alphanum_loop:N #1
  {
    \use_none:n #1
    \if_int_compare:w `#1 > `Z \exp_stop_f:
      \if_int_compare:w `#1 > `z \exp_stop_f: \else:
        \if_int_compare:w `#1 < `a \exp_stop_f: \else:
          #1
        \fi:
      \fi:
    \else:
      \if_int_compare:w `#1 < `A \exp_stop_f:
        \if_int_compare:w 1 < 1#1 \exp_stop_f:
          #1
        \fi:
      \else:
        \@@_output_byte:n { `#1 + `a - `A }
      \fi:
    \fi:
    \@@_convert_lowercase_alphanum_loop:N
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Byte unescape and escape}
%
% Strings of bytes may need to be stored in auxiliary files in safe
% \enquote{escaping} formats. Each such escaping is only loaded as
% needed.
% By default, on input any non-byte is filtered out, while the
% output simply consists of letting bytes through.
%
% \begin{macro}[rEXP]{\@@_filter_bytes:n}
% \begin{macro}[rEXP]{\@@_filter_bytes_aux:N}
%   In the case of 8-bit engines, every character is a byte. For
%   Unicode-aware engines, test the character code; non-bytes cause us
%   to raise the flag \cs{l_@@_byte_flag}. Spaces have already been given
%   the correct category code when this function is called.
%   With \LuaTeX{} and \XeTeX{} the loop keeps the characters whose
%   code is less than~$256$ and drops the others, raising the flag once
%   per dropped character; with other engines \cs{@@_filter_bytes:n} is
%   simply \cs{use:n}.
%    \begin{macrocode}
\bool_lazy_any:nTF
  {
    \sys_if_engine_luatex_p:
    \sys_if_engine_xetex_p:
  }
  {
    \cs_new:Npn \@@_filter_bytes:n #1
      {
        \@@_filter_bytes_aux:N #1
          { ? \prg_break: }
        \prg_break_point:
      }
    \cs_new:Npn \@@_filter_bytes_aux:N #1
      {
        \use_none:n #1
        \if_int_compare:w `#1 < 256 \exp_stop_f:
          #1
        \else:
          \flag_raise:N \l_@@_byte_flag
        \fi:
        \@@_filter_bytes_aux:N
      }
  }
  { \cs_new_eq:NN \@@_filter_bytes:n \use:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_:}
% \begin{macro}{\@@_convert_unescape_bytes:}
%   The simplest unescaping method removes non-bytes from
%   \cs{g_@@_result_tl}.
%   With \LuaTeX{} and \XeTeX{}, the flag \cs{l_@@_byte_flag} is
%   cleared, the string is filtered inside an expanding assignment, and
%   the flag is then tested to report a \texttt{non-byte} error. With
%   other engines there is nothing to do. The \texttt{bytes} escaping
%   is the same function as the empty escaping.
%    \begin{macrocode}
\bool_lazy_any:nTF
  {
    \sys_if_engine_luatex_p:
    \sys_if_engine_xetex_p:
  }
  {
    \cs_new_protected:Npn \@@_convert_unescape_:
      {
        \flag_clear:N \l_@@_byte_flag
        \__kernel_tl_gset:Nx \g_@@_result_tl
          { \exp_args:No \@@_filter_bytes:n \g_@@_result_tl }
        \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { bytes }
      }
  }
  { \cs_new_protected:Npn \@@_convert_unescape_: { } }
\cs_new_eq:NN \@@_convert_unescape_bytes: \@@_convert_unescape_:
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_:}
% \begin{macro}{\@@_convert_escape_bytes:}
%   The simplest form of escape leaves the bytes from the previous step
%   of the conversion unchanged.
%    \begin{macrocode}
\cs_new_protected:Nn \@@_convert_escape_: { }
\cs_new_eq:NN \@@_convert_escape_bytes: \@@_convert_escape_:
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Native strings}
%
% \begin{macro}{\@@_convert_decode_:}
% \begin{macro}[rEXP]{\@@_decode_native_char:N}
%   Decoding a native string maps over its characters: each character
%   becomes one item of the internal representation, made of the
%   character itself, \cs{s_@@}, its character code, and \cs{s_@@}.
%    \begin{macrocode}
\cs_new_protected:Nn \@@_convert_decode_:
  { \@@_convert_gmap:N \@@_decode_native_char:N }
\cs_new:Nn \@@_decode_native_char:N
  { #1 \s_@@ \int_value:w `#1 \s_@@ }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_:}
% \begin{macro}[EXP]{\@@_encode_native_char:n}
%   Encoding to a native string maps over the code points of the
%   internal representation and generates, for each of them, a
%   character of category code~$12$. With \LuaTeX{} and \XeTeX{} this
%   is all there is to do. With other engines, a code point larger than
%   \cs{c_@@_max_byte_int} is replaced by the fall-back character~|?|
%   and \cs{l_@@_error_flag} is raised; the flag is tested after the
%   expanding assignment so that a single \texttt{native-overflow} error
%   is given.
%    \begin{macrocode}
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \cs_new_protected:Nn \@@_convert_encode_:
      { \@@_convert_gmap_internal:N \@@_encode_native_char:n }
    \cs_new:Nn \@@_encode_native_char:n
      { \char_generate:nn {#1} { 12 } }
  }
  {
    \cs_new_protected:Nn \@@_convert_encode_:
      {
        \flag_clear:N \l_@@_error_flag
        \@@_convert_gmap_internal:N \@@_encode_native_char:n
        \@@_if_flag_error:Nne \l_@@_error_flag { native-overflow } { }
      }
    \cs_new:Nn \@@_encode_native_char:n
      {
        \int_compare:nNnTF {#1} > { \c_@@_max_byte_int }
          { \flag_raise:N \l_@@_error_flag ? }
          { \char_generate:nn {#1} { 12 } }
      }
    \msg_new:nnnn { str } { native-overflow }
      { Character~code~too~large~for~this~engine. }
      {
        This~engine~only~support~8-bit~characters:~
        valid~character~codes~are~in~the~range~[0,255].~
        To~manipulate~arbitrary~Unicode,~use~LuaTeX~or~XeTeX.
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{\texttt{clist}}
%
% \begin{macro}{\@@_convert_decode_clist:}
% \begin{macro}[rEXP]{\@@_decode_clist_char:n}
%   The string is first stored as a comma list, which takes care of
%   leading or trailing commas. Each entry is then turned into an item
%   of the internal representation: the entry as given, \cs{s_@@}, the
%   entry evaluated as an integer expression, and \cs{s_@@}.
%    \begin{macrocode}
\cs_new_protected:Nn \@@_convert_decode_clist:
  {
    \clist_gset:No \g_@@_result_tl \g_@@_result_tl
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_args:No \clist_map_function:nN
          \g_@@_result_tl \@@_decode_clist_char:n
      }
  }
\cs_new:Nn \@@_decode_clist_char:n
  { #1 \s_@@ \int_eval:n {#1} \s_@@ }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_clist:}
% \begin{macro}[rEXP]{\@@_encode_clist_char:n}
%   Each code point of the internal representation is output preceded
%   by a comma. The comma list obtained thus starts with a spurious
%   comma, which is removed in a second step with \cs{tl_tail:N} (this
%   also works in the empty case, since \cs{tl_tail:N} does not trigger
%   an error in this case).
%    \begin{macrocode}
\cs_new_protected:Nn \@@_convert_encode_clist:
  {
    \@@_convert_gmap_internal:N \@@_encode_clist_char:n
    \__kernel_tl_gset:Nx \g_@@_result_tl { \tl_tail:N \g_@@_result_tl }
  }
\cs_new:Nn \@@_encode_clist_char:n { , #1 }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{8-bit encodings}
%
% It is not clear in what situations 8-bit encodings are used, hence it
% is not clear what should be optimized. The current approach is
% reasonably efficient to convert long strings, and it scales well when
% using many different encodings.
% % The data needed to support a given 8-bit encoding is stored in a file % that consists of a single function call % \begin{quote}\ttfamily % \cs{@@_declare_eight_bit_encoding:nnnn} \Arg{name} \Arg{modulo} % \Arg{mapping} \Arg{missing} % \end{quote} % This declares the encoding \meta{name} to map bytes to Unicode % characters according to the \meta{mapping}, and map those bytes which % are not mentioned in the \meta{mapping} either to the replacement % character (if they appear in \meta{missing}), or to themselves. The % \meta{mapping} argument is a token list of pairs \Arg{byte} % \Arg{Unicode} expressed in uppercase hexadecimal notation. The % \meta{missing} argument is a token list of \Arg{byte}. Every % \meta{byte} which does not appear in the \meta{mapping} nor the % \meta{missing} lists maps to itself in Unicode, so for instance the % \texttt{latin1} encoding has empty \meta{mapping} and \meta{missing} % lists. The \meta{modulo} is a (decimal) integer between $256$ and % $558$ inclusive, modulo which all Unicode code points supported by the % encodings must be different. % % We use two integer arrays per encoding. When decoding we only use the % \texttt{decode} integer array, with entry $n+1$ (offset needed because % integer array indices start at~$1$) equal to the Unicode code point % that corresponds to the $n$-th byte in the encoding under % consideration, or $-1$ if the given byte is invalid in this encoding. % When encoding we use both arrays: upon seeing a code point~$n$, we % look up the entry ($1$~plus) $n$ modulo some number $M$ in the % \texttt{encode} array, which tells us the byte that might encode the % given Unicode code point, then we check in the \texttt{decode} array % that indeed this byte encodes the Unicode code point we want. 
% Here, $M$ is an encoding-dependent integer between $256$ and $558$ (it
% turns out), chosen so that among the Unicode code points that can be
% validly represented in the given encoding, no pair of code points have
% the same value modulo~$M$.
%
% \begin{macro}
%   {
%     \@@_declare_eight_bit_encoding:nnnn,
%     \@@_declare_eight_bit_aux:NNnnn,
%     \@@_declare_eight_bit_loop:Nnn,
%     \@@_declare_eight_bit_loop:Nn
%   }
%   Loop through both lists of bytes to fill in the \texttt{decode}
%   integer array, then fill the \texttt{encode} array accordingly.
%   For bytes that are invalid in the given encoding, store $-1$ in the
%   \texttt{decode} array.  The \texttt{decode} array is first
%   initialised so that every byte maps to itself; the two loops then
%   overwrite the entries listed in the \meta{mapping} and
%   \meta{missing} arguments.  The \texttt{encode} array has
%   \meta{modulo} entries, and only bytes whose \texttt{decode} entry
%   is not $-1$ are recorded in it.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_declare_eight_bit_encoding:nnnn #1
  {
    \tl_set:Nn \l_@@_internal_tl {#1}
    \cs_new_protected:cpn { @@_convert_decode_#1: }
      { \@@_convert_decode_eight_bit:n {#1} }
    \cs_new_protected:cpn { @@_convert_encode_#1: }
      { \@@_convert_encode_eight_bit:n {#1} }
    \exp_args:Ncc \@@_declare_eight_bit_aux:NNnnn
      { g_@@_decode_#1_intarray } { g_@@_encode_#1_intarray }
  }
\cs_new_protected:Npn \@@_declare_eight_bit_aux:NNnnn #1#2#3#4#5
  {
    \intarray_new:Nn #1 { 256 }
    \int_step_inline:nnn { 0 } { 255 }
      { \intarray_gset:Nnn #1 { 1 + ##1 } {##1} }
    \@@_declare_eight_bit_loop:Nnn #1
      #4 { \s_@@_stop \prg_break: } { }
    \prg_break_point:
    \@@_declare_eight_bit_loop:Nn #1
      #5 { \s_@@_stop \prg_break: }
    \prg_break_point:
    \intarray_new:Nn #2 {#3}
    \int_step_inline:nnn { 0 } { 255 }
      {
        \int_compare:nNnF { \intarray_item:Nn #1 { 1 + ##1 } } = { -1 }
          {
            \intarray_gset:Nnn #2
              {
                1 +
                \int_mod:nn { \intarray_item:Nn #1 { 1 + ##1 } }
                  { \intarray_count:N #2 }
              }
              {##1}
          }
      }
  }
\cs_new_protected:Npn \@@_declare_eight_bit_loop:Nnn #1#2#3
  {
    \@@_use_none_delimit_by_s_stop:w #2 \s_@@_stop
    \intarray_gset:Nnn #1 { 1 + "#2 } { "#3 }
    \@@_declare_eight_bit_loop:Nnn #1
  }
\cs_new_protected:Npn \@@_declare_eight_bit_loop:Nn #1#2
  {
    \@@_use_none_delimit_by_s_stop:w #2 \s_@@_stop
    \intarray_gset:Nnn #1 { 1 + "#2 } { -1 }
    \@@_declare_eight_bit_loop:Nn #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_convert_decode_eight_bit:n}
% \begin{macro}[rEXP]{\@@_decode_eight_bit_aux:n, \@@_decode_eight_bit_aux:Nn}
%   The map from bytes to Unicode code points is in the \texttt{decode}
%   array corresponding to the given encoding.  Define \cs{@@_tmp:w} and
%   pass it successively all bytes in the string.  It produces an
%   internal representation with suitable \cs{s_@@} inserted, and the
%   corresponding code point is obtained by looking it up in the integer
%   array.  If the entry is $-1$ then issue a replacement character and
%   raise the flag indicating that there was an error.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_decode_eight_bit:n #1
  {
    \cs_set:Npe \@@_tmp:w
      {
        \exp_not:N \@@_decode_eight_bit_aux:Nn
        \exp_not:c { g_@@_decode_#1_intarray }
      }
    \flag_clear:N \l_@@_error_flag
    \@@_convert_gmap:N \@@_tmp:w
    \@@_if_flag_error:Nne \l_@@_error_flag { decode-8-bit } {#1}
  }
\cs_new:Npn \@@_decode_eight_bit_aux:Nn #1#2
  {
    #2 \s_@@
    \exp_args:Nf \@@_decode_eight_bit_aux:n
      { \intarray_item:Nn #1 { 1 + `#2 } }
    \s_@@
  }
\cs_new:Npn \@@_decode_eight_bit_aux:n #1
  {
    \if_int_compare:w #1 < \c_zero_int
      \flag_raise:N \l_@@_error_flag
      \int_value:w \c_@@_replacement_char_int
    \else:
      #1
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_encode_eight_bit:n}
% \begin{macro}[rEXP]{\@@_encode_eight_bit_aux:nnN, \@@_encode_eight_bit_aux:NNn}
%   It is not practical to make an integer array with indices in the
%   full Unicode range, so we work modulo some number, which is simply
%   the size of the \texttt{encode} integer array for the given
%   encoding.  This gives us a candidate byte for representing a given
%   Unicode code point.  Of course taking the modulo leads to collisions
%   so we check in the \texttt{decode} array that the byte we got is
%   indeed correct.  Otherwise the Unicode code point we started from is
%   simply not representable in the given encoding.
%    \begin{macrocode}
\int_new:N \l_@@_modulo_int
\cs_new_protected:Npn \@@_convert_encode_eight_bit:n #1
  {
    \cs_set:Npe \@@_tmp:w
      {
        \exp_not:N \@@_encode_eight_bit_aux:NNn
        \exp_not:c { g_@@_encode_#1_intarray }
        \exp_not:c { g_@@_decode_#1_intarray }
      }
    \flag_clear:N \l_@@_error_flag
    \@@_convert_gmap_internal:N \@@_tmp:w
    \@@_if_flag_error:Nne \l_@@_error_flag { encode-8-bit } {#1}
  }
\cs_new:Npn \@@_encode_eight_bit_aux:NNn #1#2#3
  {
    \exp_args:Nf \@@_encode_eight_bit_aux:nnN
      {
        \intarray_item:Nn #1
          { 1 + \int_mod:nn {#3} { \intarray_count:N #1 } }
      }
      {#3}
      #2
  }
\cs_new:Npn \@@_encode_eight_bit_aux:nnN #1#2#3
  {
    \int_compare:nNnTF { \intarray_item:Nn #3 { 1 + #1 } } = {#2}
      { \@@_output_byte:n {#1} }
      { \flag_raise:N \l_@@_error_flag }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsection{Messages}
%
% General messages, and messages for the encodings and escapings loaded
% by default (\enquote{native}, and \enquote{bytes}).
%    \begin{macrocode}
\msg_new:nnn { str } { unknown-esc }
  { Escaping~scheme~'#1'~(filtered:~'#2')~unknown. }
\msg_new:nnn { str } { unknown-enc }
  { Encoding~scheme~'#1'~(filtered:~'#2')~unknown. }
\msg_new:nnnn { str } { native-escaping }
  { The~'native'~encoding~scheme~does~not~support~any~escaping. }
  {
    Since~native~strings~do~not~consist~in~bytes,~
    none~of~the~escaping~methods~make~sense.~
    The~specified~escaping,~'#1',~will~be~ignored.
  }
\msg_new:nnn { str } { file-not-found }
  { File~'l3str-#1.def'~not~found. }
%    \end{macrocode}
%
% Message used when the \enquote{bytes} unescaping fails because the
% string given to \cs{str_set_convert:Nnnn} contains a non-byte.  This
% cannot happen for the 8-bit engines, so the message is only defined
% for Unicode-aware engines.  The help text shows a template call of
% \cs{str_set_convert:Nnnn} whose placeholders are written between
% angle brackets.
% Messages used for other escapings and
% encodings are defined in each definition file.
%    \begin{macrocode}
\bool_lazy_any:nT
  {
    \sys_if_engine_luatex_p:
    \sys_if_engine_xetex_p:
  }
  {
    \msg_new:nnnn { str } { non-byte }
      { String~invalid~in~escaping~'#1':~it~may~only~contain~bytes. }
      {
        Some~characters~in~the~string~you~asked~to~convert~are~not~
        8-bit~characters.~Perhaps~the~string~is~a~'native'~Unicode~string?~
        If~it~is,~try~using\\
        \\
        \iow_indent:n
          {
            \iow_char:N\\str_set_convert:Nnnn \\
            \ \ <string~variable>~\{~<string>~\}~\{~native~\}~\{~<target~encoding>~\}
          }
      }
  }
%    \end{macrocode}
%
% Those messages are used when converting to and from 8-bit encodings.
%    \begin{macrocode}
\msg_new:nnnn { str } { decode-8-bit }
  { Invalid~string~in~encoding~'#1'. }
  {
    LaTeX~came~across~a~byte~which~is~not~defined~to~represent~
    any~character~in~the~encoding~'#1'.
  }
\msg_new:nnnn { str } { encode-8-bit }
  { Unicode~string~cannot~be~converted~to~encoding~'#1'. }
  {
    The~encoding~'#1'~only~contains~a~subset~of~all~Unicode~characters.~
    LaTeX~was~asked~to~convert~a~string~to~that~encoding,~but~that~
    string~contains~a~character~that~'#1'~does~not~support.
  }
%    \end{macrocode}
%
% \subsection{Escaping definitions}
%
% Several of those escapings are defined by the pdf file format.  The
% following byte storage methods are defined:
% \begin{itemize}
%   \item \texttt{bytes} (default), non-bytes are filtered out, and
%     bytes are left untouched (this is defined by default);
%   \item \texttt{hex} or \texttt{hexadecimal}, as per the \pdfTeX{}
%     primitive \tn{pdfescapehex}
%   \item \texttt{name}, as per the \pdfTeX{} primitive
%     \tn{pdfescapename}
%   \item \texttt{string}, as per the \pdfTeX{} primitive
%     \tn{pdfescapestring}
%   \item \texttt{url}, as per the percent encoding of urls.
% \end{itemize}
%
% \subsubsection{Unescape methods}
%
% \begin{macro}{\@@_convert_unescape_hex:}
% \begin{macro}[rEXP]{\@@_unescape_hex_auxi:N}
% \begin{macro}[rEXP]{\@@_unescape_hex_auxii:N}
%   Take chars two by two, and interpret each pair as the hexadecimal
%   code for a byte.  Anything else than hexadecimal digits is ignored,
%   raising the flag.  A string which contains an odd number of
%   hexadecimal digits gets |0| appended to it: this is equivalent to
%   appending a |0| in all cases, and dropping it if it is alone.
%   The string is first converted with \cs{tl_to_str:N}, and the whole
%   operation takes place in a group in which \cs{tex_escapechar:D} is
%   set to~$92$.  The first auxiliary looks for the first digit of a
%   pair and the second auxiliary for the second digit; each of them
%   loops onto itself (raising the flag) when given a character which
%   is not a hexadecimal digit.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_unescape_hex:
  {
    \group_begin:
      \flag_clear:N \l_@@_error_flag
      \int_set:Nn \tex_escapechar:D { 92 }
      \__kernel_tl_gset:Nx \g_@@_result_tl
        {
          \@@_output_byte:w "
            \exp_last_unbraced:Nf \@@_unescape_hex_auxi:N
              { \tl_to_str:N \g_@@_result_tl }
            0 { ? 0 - 1 \prg_break: }
            \prg_break_point:
          \@@_output_end:
        }
      \@@_if_flag_error:Nne \l_@@_error_flag { unescape-hex } { }
    \group_end:
  }
\cs_new:Npn \@@_unescape_hex_auxi:N #1
  {
    \use_none:n #1
    \@@_hexadecimal_use:NTF #1
      { \@@_unescape_hex_auxii:N }
      {
        \flag_raise:N \l_@@_error_flag
        \@@_unescape_hex_auxi:N
      }
  }
\cs_new:Npn \@@_unescape_hex_auxii:N #1
  {
    \use_none:n #1
    \@@_hexadecimal_use:NTF #1
      {
        \@@_output_end:
        \@@_output_byte:w " \@@_unescape_hex_auxi:N
      }
      {
        \flag_raise:N \l_@@_error_flag
        \@@_unescape_hex_auxii:N
      }
  }
\msg_new:nnnn { str } { unescape-hex }
  { String~invalid~in~escaping~'hex':~only~hexadecimal~digits~allowed. }
  {
    Some~characters~in~the~string~you~asked~to~convert~are~not~
    hexadecimal~digits~(0-9,~A-F,~a-f)~nor~spaces.
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_name:}
% \begin{macro}[rEXP]{\@@_unescape_name_loop:wNN}
% \begin{macro}{\@@_convert_unescape_url:}
% \begin{macro}[rEXP]{\@@_unescape_url_loop:wNN}
%   The \cs{@@_convert_unescape_name:} function replaces each
%   occurrence of |#| followed by two hexadecimal digits in
%   \cs{g_@@_result_tl} by the corresponding byte.  The \texttt{url}
%   function is identical, with escape character |%| instead of |#|.
%   Thus we define the two together.  The arguments of \cs{@@_tmp:w} are
%   the character code of |#| or |%| in hexadecimal, the name of the
%   main function to define, and the name of the auxiliary which
%   performs the loop.
%
%   The looping auxiliary |#3| finds the next escape character, reads
%   the following two characters, and tests them.
%   The test
%   \cs{@@_hexadecimal_use:NTF} leaves the upper-case digit in the
%   input stream, hence we surround the test with
%   \cs{@@_output_byte:w}~|"| and \cs{@@_output_end:}.  If both
%   characters are hexadecimal digits, they should be removed before
%   looping: this is done by \cs{use_i:nnn}.  If one of the characters
%   is not a hexadecimal digit, then feed |"#1| to
%   \cs{@@_output_byte:w} to produce the escape character, raise the
%   flag, and call the looping function followed by the two characters
%   (remove \cs{use_i:nnn}).  The text before each escape character is
%   passed through \cs{@@_filter_bytes:n}, so two flags are checked at
%   the end: \cs{l_@@_byte_flag} and \cs{l_@@_error_flag}.
%    \begin{macrocode}
\cs_set_protected:Npn \@@_tmp:w #1#2#3
  {
    \cs_new_protected:cpn { @@_convert_unescape_#2: }
      {
        \group_begin:
          \flag_clear:N \l_@@_byte_flag
          \flag_clear:N \l_@@_error_flag
          \int_set:Nn \tex_escapechar:D { 92 }
          \__kernel_tl_gset:Nx \g_@@_result_tl
            {
              \exp_after:wN #3 \g_@@_result_tl
                #1 ? { ? \prg_break: }
              \prg_break_point:
            }
          \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { #2 }
          \@@_if_flag_error:Nne \l_@@_error_flag { unescape-#2 } { }
        \group_end:
      }
    \cs_new:Npn #3 ##1#1##2##3
      {
        \@@_filter_bytes:n {##1}
        \use_none:n ##3
        \@@_output_byte:w "
          \@@_hexadecimal_use:NTF ##2
            {
              \@@_hexadecimal_use:NTF ##3
                { }
                {
                  \flag_raise:N \l_@@_error_flag
                  * 0 + `#1 \use_i:nn
                }
            }
            {
              \flag_raise:N \l_@@_error_flag
              0 + `#1 \use_i:nn
            }
        \@@_output_end:
        \use_i:nnn #3 ##2##3
      }
    \msg_new:nnnn { str } { unescape-#2 }
      { String~invalid~in~escaping~'#2'. }
      {
        LaTeX~came~across~the~escape~character~'#1'~not~followed~by~
        two~hexadecimal~digits.~This~is~invalid~in~the~escaping~'#2'.
      }
  }
\exp_after:wN \@@_tmp:w \c_hash_str { name }
  \@@_unescape_name_loop:wNN
\exp_after:wN \@@_tmp:w \c_percent_str { url }
  \@@_unescape_url_loop:wNN
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_unescape_string:}
% \begin{macro}[rEXP]{\@@_unescape_string_newlines:wN}
% \begin{macro}[rEXP]{\@@_unescape_string_loop:wNNN}
% \begin{macro}[rEXP]{\@@_unescape_string_repeat:NNNNNN}
%   The \texttt{string} escaping is somewhat similar to the
%   \texttt{name} and \texttt{url} escapings, with escape character |\|.
%   The first step is to convert all three line endings, |^^J|, |^^M|,
%   and |^^M^^J| to the common |^^J|, as per the \textsc{pdf}
%   specification.  This step cannot raise the flag.
%
%   Then the following escape sequences are decoded.
%   \begin{itemize}\def\makelabel#1{\hss\llap{\ttfamily\string#1}}
%     \item[\n] Line feed ($10$)
%     \item[\r] Carriage return ($13$)
%     \item[\t] Horizontal tab ($9$)
%     \item[\b] Backspace ($8$)
%     \item[\f] Form feed ($12$)
%     \item[\(] Left parenthesis
%     \item[\)] Right parenthesis
%     \item[\\] Backslash
%     \item[\ddd] (backslash followed by $1$ to $3$ octal digits) Byte
%       \texttt{ddd} (octal), subtracting $256$ in case of overflow.
%   \end{itemize}
%   If followed by an end-of-line character, the backslash and the
%   end-of-line are ignored.  If followed by anything else, the backslash
%   is ignored, raising the error flag.
%
%   In the code, the byte values of the single-character escape
%   sequences are written in octal (for instance $15$ for carriage
%   return and $134$ for backslash), because they follow
%   \cs{@@_output_byte:w}~|'|.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_other:N \^^J
  \char_set_catcode_other:N \^^M
  \cs_set_protected:Npn \@@_tmp:w #1
    {
      \cs_new_protected:Npn \@@_convert_unescape_string:
        {
          \group_begin:
            \flag_clear:N \l_@@_byte_flag
            \flag_clear:N \l_@@_error_flag
            \int_set:Nn \tex_escapechar:D { 92 }
            \__kernel_tl_gset:Nx \g_@@_result_tl
              {
                \exp_after:wN \@@_unescape_string_newlines:wN
                  \g_@@_result_tl \prg_break: ^^M ?
                \prg_break_point:
              }
            \__kernel_tl_gset:Nx \g_@@_result_tl
              {
                \exp_after:wN \@@_unescape_string_loop:wNNN
                  \g_@@_result_tl #1 ?? { ? \prg_break: }
                \prg_break_point:
              }
            \@@_if_flag_error:Nne \l_@@_byte_flag { non-byte } { string }
            \@@_if_flag_error:Nne \l_@@_error_flag { unescape-string } { }
          \group_end:
        }
    }
  \exp_args:No \@@_tmp:w { \c_backslash_str }
  \exp_last_unbraced:NNNNo
    \cs_new:Npn \@@_unescape_string_loop:wNNN #1 \c_backslash_str #2#3#4
      {
        \@@_filter_bytes:n {#1}
        \use_none:n #4
        \@@_output_byte:w '
          \@@_octal_use:NTF #2
            {
              \@@_octal_use:NTF #3
                {
                  \@@_octal_use:NTF #4
                    {
                      \if_int_compare:w #2 > 3 \exp_stop_f:
                        - 256
                      \fi:
                      \@@_unescape_string_repeat:NNNNNN
                    }
                    { \@@_unescape_string_repeat:NNNNNN ? }
                }
                { \@@_unescape_string_repeat:NNNNNN ?? }
            }
            {
              \str_case_e:nnF {#2}
                {
                  { \c_backslash_str } { 134 }
                  { ( } { 50 }
                  { ) } { 51 }
                  { r } { 15 }
                  { f } { 14 }
                  { n } { 12 }
                  { t } { 11 }
                  { b } { 10 }
                  { ^^J } { 0 - 1 }
                }
                {
                  \flag_raise:N \l_@@_error_flag
                  0 - 1 \use_i:nn
                }
            }
        \@@_output_end:
        \use_i:nn \@@_unescape_string_loop:wNNN #2#3#4
      }
  \cs_new:Npn \@@_unescape_string_repeat:NNNNNN #1#2#3#4#5#6
    { \@@_output_end: \@@_unescape_string_loop:wNNN }
  \cs_new:Npn \@@_unescape_string_newlines:wN #1 ^^M #2
    {
      #1
      \if_charcode:w ^^J #2 \else: ^^J \fi:
      \@@_unescape_string_newlines:wN #2
    }
  \msg_new:nnnn { str } { unescape-string }
    { String~invalid~in~escaping~'string'. }
    {
      LaTeX~came~across~an~escape~character~'\c_backslash_str'~
      not~followed~by~any~of:~'n',~'r',~'t',~'b',~'f',~'(',~')',~
      '\c_backslash_str',~one~to~three~octal~digits,~or~the~end~
      of~a~line.
    }
\group_end:
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Escape methods}
%
% Currently, none of the escape methods can lead to errors, assuming
% that their input is made out of bytes.
%
% \begin{macro}{\@@_convert_escape_hex:}
% \begin{macro}[rEXP]{\@@_escape_hex_char:N}
%   Loop and convert each byte to hexadecimal.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_escape_hex:
  { \@@_convert_gmap:N \@@_escape_hex_char:N }
\cs_new:Npn \@@_escape_hex_char:N #1
  { \@@_output_hexadecimal:n { `#1 } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_name:}
% \begin{macro}[rEXP]{\@@_escape_name_char:n}
% \begin{macro}[rEXP]{\@@_if_escape_name:nTF}
% \begin{variable}{\c_@@_escape_name_str}
% \begin{variable}{\c_@@_escape_name_not_str}
%   For each byte, test whether it should be output as is, or be
%   \enquote{hash-encoded}.  Roughly, bytes outside the range
%   $[\hexnum{2A},\hexnum{7E}]$ are hash-encoded.  We keep two lists of
%   exceptions: characters in \cs{c_@@_escape_name_not_str} are not
%   hash-encoded, and characters in the \cs{c_@@_escape_name_str} are
%   encoded.  The conditional is \texttt{true} when the byte can be
%   output as is.
%    \begin{macrocode}
\str_const:Nn \c_@@_escape_name_not_str { ! " $ & ' } %$
\str_const:Nn \c_@@_escape_name_str { {}/<>[] }
\cs_new_protected:Npn \@@_convert_escape_name:
  { \@@_convert_gmap:N \@@_escape_name_char:n }
\cs_new:Npn \@@_escape_name_char:n #1
  {
    \@@_if_escape_name:nTF {#1} {#1}
      { \c_hash_str \@@_output_hexadecimal:n {`#1} }
  }
\prg_new_conditional:Npnn \@@_if_escape_name:n #1 { TF }
  {
    \if_int_compare:w `#1 < "2A \exp_stop_f:
      \@@_if_contains_char:NnTF \c_@@_escape_name_not_str {#1}
        \prg_return_true: \prg_return_false:
    \else:
      \if_int_compare:w `#1 > "7E \exp_stop_f:
        \prg_return_false:
      \else:
        \@@_if_contains_char:NnTF \c_@@_escape_name_str {#1}
          \prg_return_false: \prg_return_true:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{variable}
% \end{variable}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_string:}
% \begin{macro}[rEXP]{\@@_escape_string_char:N}
% \begin{macro}[rEXP]{\@@_if_escape_string:NTF}
% \begin{variable}{\c_@@_escape_string_str}
%   Any character with character code below \hexnum{27} or above
%   \hexnum{7A} is converted to a backslash followed by three octal
%   digits.  This covers every character up to and including space and
%   every character from \texttt{del} upwards, but also the printable
%   characters in the ranges $[\hexnum{21},\hexnum{26}]$ and
%   $[\hexnum{7B},\hexnum{7E}]$.  Among the characters that are output
%   as is, one backslash is added before each parenthesis and backslash.
%    \begin{macrocode}
\str_const:Ne \c_@@_escape_string_str
  { \c_backslash_str ( ) }
\cs_new_protected:Npn \@@_convert_escape_string:
  { \@@_convert_gmap:N \@@_escape_string_char:N }
\cs_new:Npn \@@_escape_string_char:N #1
  {
    \@@_if_escape_string:NTF #1
      {
        \@@_if_contains_char:NnT
          \c_@@_escape_string_str {#1}
          { \c_backslash_str }
        #1
      }
      {
        \c_backslash_str
        \int_div_truncate:nn {`#1} {64}
        \int_mod:nn
          { \int_div_truncate:nn {`#1} { 8 } } { 8 }
        \int_mod:nn {`#1} { 8 }
      }
  }
\prg_new_conditional:Npnn \@@_if_escape_string:N #1 { TF }
  {
    \if_int_compare:w `#1 < "27 \exp_stop_f:
      \prg_return_false:
    \else:
      \if_int_compare:w `#1 > "7A \exp_stop_f:
        \prg_return_false:
      \else:
        \prg_return_true:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{variable}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_convert_escape_url:}
% \begin{macro}[rEXP]{\@@_escape_url_char:n}
% \begin{macro}[rEXP]{\@@_if_escape_url:nTF}
%   This function is similar to \cs{@@_convert_escape_name:}, escaping
%   different characters.  Escaped bytes are output as a percent
%   character followed by their hexadecimal code.  Characters with
%   character code below \hexnum{30} are kept only when
%   \cs{@@_if_contains_char:nnTF} finds them in the list |"-.|,
%   characters above \hexnum{7E} are always escaped, and in between the
%   characters found in the list |:;=?@[]| are escaped.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_convert_escape_url:
  { \@@_convert_gmap:N \@@_escape_url_char:n }
\cs_new:Npn \@@_escape_url_char:n #1
  {
    \@@_if_escape_url:nTF {#1} {#1}
      { \c_percent_str \@@_output_hexadecimal:n { `#1 } }
  }
\prg_new_conditional:Npnn \@@_if_escape_url:n #1 { TF }
  {
    \if_int_compare:w `#1 < "30 \exp_stop_f:
      \@@_if_contains_char:nnTF { "-. } {#1}
        \prg_return_true: \prg_return_false:
    \else:
      \if_int_compare:w `#1 > "7E \exp_stop_f:
        \prg_return_false:
      \else:
        \@@_if_contains_char:nnTF { : ; = ? @ [ ] } {#1}
          \prg_return_false: \prg_return_true:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Encoding definitions}
%
% The \texttt{native} encoding is automatically defined.  Other encodings
% are loaded as needed.
The following encodings are supported: % \begin{itemize} % \item \textsc{utf-8}; % \item \textsc{utf-16}, big-, little-endian, or with byte order mark; % \item \textsc{utf-32}, big-, little-endian, or with byte order mark; % \item the \textsc{iso 8859} code pages, numbered from $1$ to $16$, % skipping the inexistent \textsc{iso 8859-12}. % \end{itemize} % % \subsubsection{\textsc{utf-8} support} % % \begin{macro}{\@@_convert_encode_utf8:} % \begin{macro}[rEXP]{\@@_encode_utf_viii_char:n} % \begin{macro}[rEXP]{\@@_encode_utf_viii_loop:wwnnw} % Loop through the internal string, and convert each character to its % \textsc{utf-8} representation. The representation is built from the % right-most (least significant) byte to the left-most (most % significant) byte. Continuation bytes are in the range $[128,191]$, % taking $64$ different values, hence we roughly want to express the % character code in base $64$, shifting the first digit in the % representation by some number depending on how many continuation % bytes there are. In the range $[0,127]$, output the corresponding % byte directly. In the range $[128,2047]$, output the remainder % modulo $64$, plus $128$ as a continuation byte, then output the % quotient (which is in the range $[0,31]$), shifted by $192$. In the % next range, $[2048,65535]$, split the character code into residue % and quotient modulo $64$, output the residue as a first continuation % byte, then repeat; this leaves us with a quotient in the range % $[0,15]$, which we output shifted by $224$. The last range, % $[65536,1114111]$, follows the same pattern: once we realize that % dividing twice by $64$ leaves us with a number larger than $15$, we % repeat, producing a last continuation byte, and offset the quotient % by $240$ for the leading byte. % % How is that implemented? 
%   \cs{@@_encode_utf_viii_loop:wwnnw} takes
%   successive quotients as its first argument, the quotient from the
%   previous step as its second argument (except in step~$1$), the bound
%   for quotients that trigger one more step or not, and finally the
%   offset used if this step should produce the leading byte.  Leading
%   bytes can be in the ranges $[0,127]$, $[192,223]$, $[224,239]$, and
%   $[240,247]$ (really, that last limit should be $244$ because Unicode
%   stops at the code point $1114111$).  At each step, if the quotient
%   |#1| is less than the limit |#3| for that range, output the leading
%   byte (|#1| shifted by |#4|) and stop.  Otherwise, we need one more
%   step: use the quotient of |#1| by $64$, and |#1| as arguments for
%   the looping auxiliary, and output the continuation byte
%   corresponding to the remainder $|#2|-64|#1|+128$.  The bizarre
%   construction |- 1 + 0 *| removes the spurious initial
%   continuation byte (better methods welcome).
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_encode_utf8: }
  { \@@_convert_gmap_internal:N \@@_encode_utf_viii_char:n }
\cs_new:Npn \@@_encode_utf_viii_char:n #1
  {
    \@@_encode_utf_viii_loop:wwnnw #1 ; - 1 + 0 * ;
      { 128 } {   0 }
      {  32 } { 192 }
      {  16 } { 224 }
      {   8 } { 240 }
    \s_@@_stop
  }
\cs_new:Npn \@@_encode_utf_viii_loop:wwnnw #1; #2; #3#4 #5 \s_@@_stop
  {
    \if_int_compare:w #1 < #3 \exp_stop_f:
      \@@_output_byte:n { #1 + #4 }
      \exp_after:wN \@@_use_none_delimit_by_s_stop:w
    \fi:
    \exp_after:wN \@@_encode_utf_viii_loop:wwnnw
      \int_value:w \int_div_truncate:nn {#1} {64} ; #1 ;
      #5 \s_@@_stop
    \@@_output_byte:n { #2 - 64 * ( #1 - 2 ) }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{variable}
%   {
%     @@_missing  ,
%     @@_extra    ,
%     @@_overlong ,
%     @@_overflow ,
%   }
%   When decoding a string that is purportedly in the \textsc{utf-8}
%   encoding, four different errors can occur, signalled by a specific
%   flag for each (we define those flags using \cs{flag_clear_new:N}
%   rather than \cs{flag_new:N}, because they are shared with other
%   encoding definition files).
%   \begin{itemize}
%     \item \enquote{Missing continuation byte}: a leading byte is not
%       followed by the right number of continuation bytes.
%     \item \enquote{Extra continuation byte}: a continuation byte
%       appears where it was not expected, \emph{i.e.}, not after an
%       appropriate leading byte.
%     \item \enquote{Overlong}: a Unicode character is expressed using
%       more bytes than necessary, for instance, \hexnum{C0}\hexnum{80}
%       for the code point $0$, instead of a single null byte.
%     \item \enquote{Overflow}: this occurs when decoding produces
%       Unicode code points greater than $1114111$.
%   \end{itemize}
%   We only raise one \LaTeX3 error message, combining all the errors
%   which occurred.  In the short message, the leading comma must be
%   removed to get a grammatically correct sentence.  In the long text,
%   first remind the user what a correct \textsc{utf-8} string should
%   look like, then add error-specific information.
%    \begin{macrocode}
\flag_clear_new:N \l_@@_missing_flag
\flag_clear_new:N \l_@@_extra_flag
\flag_clear_new:N \l_@@_overlong_flag
\flag_clear_new:N \l_@@_overflow_flag
\msg_new:nnnn { str } { utf8-decode }
  {
    Invalid~UTF-8~string:
    \exp_last_unbraced:Nf \use_none:n
      {
        \@@_if_flag_times:NT \l_@@_missing_flag  { ,~missing~continuation~byte }
        \@@_if_flag_times:NT \l_@@_extra_flag    { ,~extra~continuation~byte }
        \@@_if_flag_times:NT \l_@@_overlong_flag { ,~overlong~form }
        \@@_if_flag_times:NT \l_@@_overflow_flag { ,~code~point~too~large }
      }
    .
  }
  {
    In~the~UTF-8~encoding,~each~Unicode~character~consists~in~
    1~to~4~bytes,~with~the~following~bit~pattern: \\
    \iow_indent:n
      {
        Code~point~\ \ \ \ <~128:~0xxxxxxx \\
        Code~point~\ \ \ <~2048:~110xxxxx~10xxxxxx \\
        Code~point~\ \ <~65536:~1110xxxx~10xxxxxx~10xxxxxx \\
        Code~point~ <~1114112:~11110xxx~10xxxxxx~10xxxxxx~10xxxxxx \\
      }
    Bytes~of~the~form~10xxxxxx~are~called~continuation~bytes.
    \flag_if_raised:NT \l_@@_missing_flag
      {
        \\\\
        A~leading~byte~(in~the~range~[192,255])~was~not~followed~by~
        the~appropriate~number~of~continuation~bytes.
      }
    \flag_if_raised:NT \l_@@_extra_flag
      {
        \\\\
        LaTeX~came~across~a~continuation~byte~when~it~was~not~expected.
      }
    \flag_if_raised:NT \l_@@_overlong_flag
      {
        \\\\
        Every~Unicode~code~point~must~be~expressed~in~the~shortest~
        possible~form.~For~instance,~'0xC0'~'0x83'~is~not~a~valid~
        representation~for~the~code~point~3.
      }
    \flag_if_raised:NT \l_@@_overflow_flag
      {
        \\\\
        Unicode~limits~code~points~to~the~range~[0,1114111].
      }
  }
\prop_gput:Nnn \g_msg_module_name_prop { str } { LaTeX }
\prop_gput:Nnn \g_msg_module_type_prop { str } { }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_convert_decode_utf8:}
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_viii_start:N,
%     \@@_decode_utf_viii_continuation:wwN,
%     \@@_decode_utf_viii_aux:wNnnwN
%   }
% \begin{macro}[rEXP]
%   {\@@_decode_utf_viii_overflow:w, \@@_decode_utf_viii_end:}
%   Decoding is significantly harder than encoding.  As before, lower
%   some flags, which are tested at the end (in bulk, to trigger at most
%   one \LaTeX3 error, as explained above).  We expect successive
%   multi-byte sequences of the form \meta{start byte}
%   \meta{continuation bytes}.  The \texttt{_start} auxiliary tests the
%   first byte:
%   \begin{itemize}
%     \item $[0,\hexnum{7F}]$: the byte stands alone, and is converted
%       to its own character code;
%     \item $[\hexnum{80}, \hexnum{BF}]$: unexpected continuation byte,
%       raise the appropriate flag, and convert that byte to the
%       replacement character \hexnum{FFFD};
%     \item $[\hexnum{C0}, \hexnum{FF}]$: this byte should be followed
%       by some continuation byte(s).
%   \end{itemize}
%   In the first two cases, \cs{@@_use_none_delimit_by_s_stop:w} removes
%   data that only the third case requires, namely the limits of ranges
%   of Unicode characters which can be expressed with $1$, $2$, $3$, or
%   $4$ bytes.
% % We can now concentrate on the multi-byte case and the % \texttt{_continuation} auxiliary. We expect |#3| to be in the range % $[\hexnum{80}, \hexnum{BF}]$. The test for this goes as follows: if % the character code is less than \hexnum{80}, we compare it to % $-\hexnum{C0}$, yielding \texttt{false}; otherwise to \hexnum{C0}, % yielding \texttt{true} in the range $[\hexnum{80}, \hexnum{BF}]$ and % \texttt{false} otherwise. If we find that the byte is not a % continuation range, stop the current slew of bytes, output the % replacement character, and continue parsing with the \texttt{_start} % auxiliary, starting at the byte we just tested. Once we know that % the byte is a continuation byte, leave it behind us in the input % stream, compute what code point the bytes read so far would produce, % and feed that number to the \texttt{_aux} function. % % The \texttt{_aux} function tests whether we should look for more % continuation bytes or not. If the number it receives as |#1| is less % than the maximum |#4| for the current range, then we are done: check % for an overlong representation by comparing |#1| with the maximum % |#3| for the previous range. Otherwise, we call the % \texttt{_continuation} auxiliary again, after shifting the % \enquote{current code point} by |#4| (maximum from the range we just % checked). % % Two additional tests are needed: if we reach the end of the list of % range maxima and we are still not done, then we are faced with an % overflow. Clean up, and again insert the code point \hexnum{FFFD} % for the replacement character. Also, every time we read a byte, we % need to check whether we reached the end of the string. In a correct % \textsc{utf-8} string, this happens automatically when the % \texttt{_start} auxiliary leaves its first argument in the input % stream: the end-marker begins with \cs{prg_break:}, which ends % the loop. 
%   On the other hand, if the end is reached when looking for
%   a continuation byte, the \cs{use_none:n} |#3| construction removes
%   the first token from the end-marker, and leaves the \texttt{_end}
%   auxiliary, which raises the appropriate error flag before ending the
%   mapping.
%
%   Every time one of the four specific flags is raised, the generic
%   flag \cs{l_@@_error_flag} is raised as well: this is the flag
%   that the main function hands to \cs{@@_if_flag_error:Nne} with the
%   message name \texttt{utf8-decode}.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_decode_utf8: }
  {
    \flag_clear:N \l_@@_error_flag
    \flag_clear:N \l_@@_missing_flag
    \flag_clear:N \l_@@_extra_flag
    \flag_clear:N \l_@@_overlong_flag
    \flag_clear:N \l_@@_overflow_flag
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_decode_utf_viii_start:N \g_@@_result_tl
          { \prg_break: \@@_decode_utf_viii_end: }
        \prg_break_point:
      }
    \@@_if_flag_error:Nne \l_@@_error_flag { utf8-decode } { }
  }
\cs_new:Npn \@@_decode_utf_viii_start:N #1
  {
    #1
    \if_int_compare:w `#1 < "C0 \exp_stop_f:
      \s_@@
      \if_int_compare:w `#1 < "80 \exp_stop_f:
        \int_value:w `#1
      \else:
        \flag_raise:N \l_@@_extra_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \fi:
    \else:
      \exp_after:wN \@@_decode_utf_viii_continuation:wwN
      \int_value:w \int_eval:n { `#1 - "C0 } \exp_after:wN
    \fi:
    \s_@@
    \@@_use_none_delimit_by_s_stop:w {"80} {"800} {"10000} {"110000} \s_@@_stop
    \@@_decode_utf_viii_start:N
  }
\cs_new:Npn \@@_decode_utf_viii_continuation:wwN
    #1 \s_@@ #2 \@@_decode_utf_viii_start:N #3
  {
    \use_none:n #3
    \if_int_compare:w `#3 <
          \if_int_compare:w `#3 < "80 \exp_stop_f: - \fi:
          "C0 \exp_stop_f:
      #3
      \exp_after:wN \@@_decode_utf_viii_aux:wNnnwN
      \int_value:w \int_eval:n { #1 * "40 + `#3 - "80 } \exp_after:wN
    \else:
      \s_@@
      \flag_raise:N \l_@@_missing_flag
      \flag_raise:N \l_@@_error_flag
      \int_use:N \c_@@_replacement_char_int
    \fi:
    \s_@@
    #2
    \@@_decode_utf_viii_start:N #3
  }
\cs_new:Npn \@@_decode_utf_viii_aux:wNnnwN
    #1 \s_@@ #2#3#4 #5 \@@_decode_utf_viii_start:N #6
  {
    \if_int_compare:w #1 < #4 \exp_stop_f:
      \s_@@
      \if_int_compare:w #1 < #3 \exp_stop_f:
        \flag_raise:N \l_@@_overlong_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \else:
        #1
      \fi:
    \else:
      \if_meaning:w \s_@@_stop #5
        \@@_decode_utf_viii_overflow:w #1
      \fi:
      \exp_after:wN \@@_decode_utf_viii_continuation:wwN
      \int_value:w \int_eval:n { #1 - #4 } \exp_after:wN
    \fi:
    \s_@@
    #2 {#4} #5
    \@@_decode_utf_viii_start:N
  }
\cs_new:Npn \@@_decode_utf_viii_overflow:w #1 \fi: #2 \fi:
  {
    \fi: \fi:
    \flag_raise:N \l_@@_overflow_flag
    \flag_raise:N \l_@@_error_flag
    \int_use:N \c_@@_replacement_char_int
  }
\cs_new:Npn \@@_decode_utf_viii_end:
  {
    \s_@@
    \flag_raise:N \l_@@_missing_flag
    \flag_raise:N \l_@@_error_flag
    \int_use:N \c_@@_replacement_char_int \s_@@
    \prg_break:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{\textsc{utf-16} support}
%
% The definitions are done in a category code regime where the bytes
% $254$ and $255$ used by the byte order mark have catcode~$12$.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_other:N \^^fe
  \char_set_catcode_other:N \^^ff
%    \end{macrocode}
%
% \begin{macro}
%   {
%     \@@_convert_encode_utf16:   ,
%     \@@_convert_encode_utf16be: ,
%     \@@_convert_encode_utf16le: ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_encode_utf_xvi_aux:N  ,
%     \@@_encode_utf_xvi_char:n ,
%   }
%   When the endianness is not specified, it is big-endian by default,
%   and we add a byte-order mark.  Convert characters one by one in a
%   loop, with different behaviours depending on the character code.
%   \begin{itemize}
%     \item $[0, \hexnum{D7FF}]$: converted to two bytes;
%     \item $[\hexnum{D800}, \hexnum{DFFF}]$ are used as surrogates:
%       they cannot be converted and are replaced by the replacement
%       character;
%     \item $[\hexnum{E000}, \hexnum{FFFF}]$: converted to two bytes;
%     \item $[\hexnum{10000}, \hexnum{10FFFF}]$: converted to a pair of
%       surrogates, each two bytes.  The magic \hexnum{D7C0} is
%       $\hexnum{D800}-\hexnum{10000}/\hexnum{400}$.
%   \end{itemize}
%   For the duration of this operation, \cs{@@_tmp:w} is set equal to
%   the function received as argument by \cs{@@_encode_utf_xvi_aux:N},
%   namely \cs{@@_output_byte_pair_be:n} or its \texttt{le} analog
%   (defined elsewhere in this module).  It is used to convert a number
%   in the range $[0, \hexnum{FFFF}]$ to a pair of bytes (either big
%   endian or little endian).  When the endianness is not specified,
%   the byte-order mark is added to the left of the result once all
%   characters have been converted.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_encode_utf16: }
  {
    \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_be:n
    \tl_gput_left:Ne \g_@@_result_tl { ^^fe ^^ff }
  }
\cs_new_protected:cpn { @@_convert_encode_utf16be: }
  { \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_be:n }
\cs_new_protected:cpn { @@_convert_encode_utf16le: }
  { \@@_encode_utf_xvi_aux:N \@@_output_byte_pair_le:n }
\cs_new_protected:Npn \@@_encode_utf_xvi_aux:N #1
  {
    \flag_clear:N \l_@@_error_flag
    \cs_set_eq:NN \@@_tmp:w #1
    \@@_convert_gmap_internal:N \@@_encode_utf_xvi_char:n
    \@@_if_flag_error:Nne \l_@@_error_flag { utf16-encode } { }
  }
%    \end{macrocode}
%   The \texttt{_char} auxiliary receives one code point.  The three
%   nested tests separate the four ranges listed above; only the
%   surrogate range raises the error flag.  Above \hexnum{FFFF},
%   \cs{@@_tmp:w} is applied twice, to the lead surrogate then to the
%   trail surrogate.
%    \begin{macrocode}
\cs_new:Npn \@@_encode_utf_xvi_char:n #1
  {
    \if_int_compare:w #1 < "D800 \exp_stop_f:
      \@@_tmp:w {#1}
    \else:
      \if_int_compare:w #1 < "10000 \exp_stop_f:
        \if_int_compare:w #1 < "E000 \exp_stop_f:
          \flag_raise:N \l_@@_error_flag
          \@@_tmp:w { \c_@@_replacement_char_int }
        \else:
          \@@_tmp:w {#1}
        \fi:
      \else:
        \exp_args:Nf \@@_tmp:w { \int_div_truncate:nn {#1} {"400} + "D7C0 }
        \exp_args:Nf \@@_tmp:w { \int_mod:nn {#1} {"400} + "DC00 }
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{variable}
%   {
%     @@_missing ,
%     @@_extra ,
%     @@_end ,
%   }
%   When encoding a Unicode string to \textsc{utf-16}, only one error
%   can occur: code points in the range $[\hexnum{D800},
%   \hexnum{DFFF}]$, corresponding to surrogates, cannot be encoded. We
%   use the all-purpose flag \texttt{@@_error} to signal that error.
%
%   When decoding a Unicode string which is purportedly in
%   \textsc{utf-16}, three errors can occur: a missing trail surrogate,
%   an unexpected trail surrogate, and a string containing an odd number
%   of bytes.
%
%   In the short message text each reported problem starts with a
%   comma: the list is \texttt{f}-expanded and its first token (the
%   leading comma) is then removed by \cs{use_none:n}.
%    \begin{macrocode}
\flag_clear_new:N \l_@@_missing_flag
\flag_clear_new:N \l_@@_extra_flag
\flag_clear_new:N \l_@@_end_flag
\msg_new:nnnn { str } { utf16-encode }
  { Unicode~string~cannot~be~expressed~in~UTF-16:~surrogate. }
  {
    Surrogate~code~points~(in~the~range~[U+D800,~U+DFFF])~
    can~be~expressed~in~the~UTF-8~and~UTF-32~encodings,~
    but~not~in~the~UTF-16~encoding.
  }
\msg_new:nnnn { str } { utf16-decode }
  {
    Invalid~UTF-16~string:
    \exp_last_unbraced:Nf \use_none:n
      {
        \@@_if_flag_times:NT \l_@@_missing_flag { ,~missing~trail~surrogate }
        \@@_if_flag_times:NT \l_@@_extra_flag   { ,~extra~trail~surrogate }
        \@@_if_flag_times:NT \l_@@_end_flag     { ,~odd~number~of~bytes }
      }
    .
  }
  {
    In~the~UTF-16~encoding,~each~Unicode~character~is~encoded~as~
    2~or~4~bytes: \\
    \iow_indent:n
      {
        Code~point~in~[U+0000,~U+D7FF]:~two~bytes \\
        Code~point~in~[U+D800,~U+DFFF]:~illegal \\
        Code~point~in~[U+E000,~U+FFFF]:~two~bytes \\
        Code~point~in~[U+10000,~U+10FFFF]:~
          a~lead~surrogate~and~a~trail~surrogate \\
      }
    Lead~surrogates~are~pairs~of~bytes~in~the~range~[0xD800,~0xDBFF],~
    and~trail~surrogates~are~in~the~range~[0xDC00,~0xDFFF].
    \flag_if_raised:NT \l_@@_missing_flag
      {
        \\\\
        A~lead~surrogate~was~not~followed~by~a~trail~surrogate.
      }
    \flag_if_raised:NT \l_@@_extra_flag
      {
        \\\\
        LaTeX~came~across~a~trail~surrogate~when~it~was~not~expected.
      }
    \flag_if_raised:NT \l_@@_end_flag
      {
        \\\\
        The~string~contained~an~odd~number~of~bytes.~This~is~invalid:~
        the~basic~code~unit~for~UTF-16~is~16~bits~(2~bytes).
      }
  }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}
%   {
%     \@@_convert_decode_utf16: ,
%     \@@_convert_decode_utf16be: ,
%     \@@_convert_decode_utf16le: ,
%   }
% \begin{macro}{\@@_decode_utf_xvi_bom:NN, \@@_decode_utf_xvi:Nw}
%   As for \textsc{utf-8}, decoding \textsc{utf-16} is harder than
%   encoding it. If the endianness is unknown, check the first two
%   bytes: if those are \hexnum{FE} and \hexnum{FF} in either order,
%   remove them and use the corresponding endianness, otherwise assume
%   big-endianness. The three endianness cases are based on a common
%   auxiliary whose first argument is $1$ for big-endian and $2$ for
%   little-endian, and whose second argument, delimited by the scan mark
%   \cs{s_@@_stop}, is expanded once (the string may be long; passing
%   \cs{g_@@_result_tl} as an argument before expansion is cheaper).
%
%   When testing for the byte order mark, the string is followed by
%   three copies of \cs{s_@@_stop}, so that the \texttt{_bom} auxiliary
%   finds its two arguments even for an empty string, and a delimiter
%   remains for \cs{@@_decode_utf_xvi:Nw}.
%
%   The \cs{@@_decode_utf_xvi:Nw} function defines \cs{@@_tmp:w} to
%   take two arguments and return the character code of the first one if
%   the string is big-endian, and the second one if the string is
%   little-endian, then loops over the string using
%   \cs{@@_decode_utf_xvi_pair:NN} described below.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_decode_utf16be: }
  { \@@_decode_utf_xvi:Nw 1 \g_@@_result_tl \s_@@_stop }
\cs_new_protected:cpn { @@_convert_decode_utf16le: }
  { \@@_decode_utf_xvi:Nw 2 \g_@@_result_tl \s_@@_stop }
\cs_new_protected:cpn { @@_convert_decode_utf16: }
  {
    \exp_after:wN \@@_decode_utf_xvi_bom:NN
      \g_@@_result_tl \s_@@_stop \s_@@_stop \s_@@_stop
  }
\cs_new_protected:Npn \@@_decode_utf_xvi_bom:NN #1#2
  {
    \str_if_eq:nnTF { #1#2 } { ^^ff ^^fe }
      { \@@_decode_utf_xvi:Nw 2 }
      {
        \str_if_eq:nnTF { #1#2 } { ^^fe ^^ff }
          { \@@_decode_utf_xvi:Nw 1 }
          { \@@_decode_utf_xvi:Nw 1 #1#2 }
      }
  }
\cs_new_protected:Npn \@@_decode_utf_xvi:Nw #1#2 \s_@@_stop
  {
    \flag_clear:N \l_@@_error_flag
    \flag_clear:N \l_@@_missing_flag
    \flag_clear:N \l_@@_extra_flag
    \flag_clear:N \l_@@_end_flag
    \cs_set:Npn \@@_tmp:w ##1 ##2 { ` ## #1 }
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_decode_utf_xvi_pair:NN
          #2 \q_@@_nil \q_@@_nil
        \prg_break_point:
      }
    \@@_if_flag_error:Nne \l_@@_error_flag { utf16-decode } { }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_xvi_pair:NN ,
%     \@@_decode_utf_xvi_quad:NNwNN ,
%     \@@_decode_utf_xvi_pair_end:Nw ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_decode_utf_xvi_error:nNN ,
%     \@@_decode_utf_xvi_extra:NNw ,
%   }
%   Bytes are read two at a time. At this stage, |\@@_tmp:w #1#2|
%   expands to the character code of the most significant byte, and we
%   distinguish cases depending on which range it lies in:
%   \begin{itemize}
%     \item $[\hexnum{D8}, \hexnum{DB}]$ signals a lead surrogate, and
%       the integer expression yields $1$ (\eTeX{} rounds ties away from
%       zero);
%     \item $[\hexnum{DC}, \hexnum{DF}]$ signals a trail surrogate,
%       unexpected here, and the integer expression yields $2$;
%     \item any other value signals a code point in the Basic
%       Multilingual Plane, which stands for itself, and the
%       \cs{if_case:w} construction expands to nothing (cases other than
%       $1$ or $2$), leaving the relevant material in the input stream,
%       followed by another call to the \texttt{_pair} auxiliary.
%   \end{itemize}
%   The case of a lead surrogate is treated by the \texttt{_quad}
%   auxiliary, whose arguments |#1|, |#2|, |#4| and |#5| are the four
%   bytes. We expect the most significant byte of |#4#5| to be in the
%   range $[\hexnum{DC}, \hexnum{DF}]$ (trail surrogate). The test is
%   similar to the test used for continuation bytes in the
%   \textsc{utf-8} decoding functions. In the case where |#4#5| is
%   indeed a trail surrogate, leave |#1#2#4#5| \cs{s_@@}
%   \meta{code~point} \cs{s_@@}, and remove the pair |#4#5| before
%   looping with \cs{@@_decode_utf_xvi_pair:NN}. Otherwise, of course,
%   complain about the missing surrogate.
%
%   The magic number \hexnum{D7F7} is such that
%   $\hexnum{D7F7}*\hexnum{400} = \hexnum{D800}*\hexnum{400} +
%   \hexnum{DC00} - \hexnum{10000}$.
%
%   Every time we read a pair of bytes, we test for the end-marker
%   \cs{q_@@_nil}. When reaching the end, we additionally check that the
%   string had an even length. Also, if the end is reached when
%   expecting a trail surrogate, we treat that as a missing surrogate.
%    \begin{macrocode}
\cs_new:Npn \@@_decode_utf_xvi_pair:NN #1#2
  {
    \if_meaning:w \q_@@_nil #2
      \@@_decode_utf_xvi_pair_end:Nw #1
    \fi:
    \if_case:w
      \int_eval:n { ( \@@_tmp:w #1#2 - "D6 ) / 4 } \scan_stop:
    \or: \exp_after:wN \@@_decode_utf_xvi_quad:NNwNN
    \or: \exp_after:wN \@@_decode_utf_xvi_extra:NNw
    \fi:
    #1#2 \s_@@
    \int_eval:n { "100 * \@@_tmp:w #1#2 + \@@_tmp:w #2#1 } \s_@@
    \@@_decode_utf_xvi_pair:NN
  }
\cs_new:Npn \@@_decode_utf_xvi_quad:NNwNN
    #1#2 #3 \@@_decode_utf_xvi_pair:NN #4#5
  {
    \if_meaning:w \q_@@_nil #5
      \@@_decode_utf_xvi_error:nNN { missing } #1#2
      \@@_decode_utf_xvi_pair_end:Nw #4
    \fi:
    \if_int_compare:w
        \if_int_compare:w \@@_tmp:w #4#5 < "DC \exp_stop_f:
          0 = 1
        \else:
          \@@_tmp:w #4#5 < "E0
        \fi:
        \exp_stop_f:
      #1 #2 #4 #5 \s_@@
      \int_eval:n
        {
          ( "100 * \@@_tmp:w #1#2 + \@@_tmp:w #2#1 - "D7F7 ) * "400
          + "100 * \@@_tmp:w #4#5 + \@@_tmp:w #5#4
        }
      \s_@@
      \exp_after:wN \use_i:nnn
    \else:
      \@@_decode_utf_xvi_error:nNN { missing } #1#2
    \fi:
    \@@_decode_utf_xvi_pair:NN #4#5
  }
%    \end{macrocode}
%   The \texttt{_pair_end} auxiliary is called from within a
%   conditional, which it closes first.  Its argument |#1| is the first
%   of the two tokens just read: if it is not the end-marker, a single
%   byte was left over and the \texttt{end} error is reported, with
%   \cs{prg_do_nothing:} standing in for the absent second byte.  In
%   all cases the loop ends with \cs{prg_break:}.
%    \begin{macrocode}
\cs_new:Npn \@@_decode_utf_xvi_pair_end:Nw #1 \fi:
  {
    \fi:
    \if_meaning:w \q_@@_nil #1
    \else:
      \@@_decode_utf_xvi_error:nNN { end } #1 \prg_do_nothing:
    \fi:
    \prg_break:
  }
%    \end{macrocode}
%   The \texttt{_extra} auxiliary discards the item which the
%   \texttt{_pair} function was about to leave for an unexpected trail
%   surrogate, and reports the \texttt{extra} error instead.  The
%   \texttt{_error} auxiliary raises the all-purpose error flag and the
%   flag whose name is built from |#1|, then leaves the two offending
%   bytes |#2#3| with the replacement character as their code point.
%    \begin{macrocode}
\cs_new:Npn \@@_decode_utf_xvi_extra:NNw #1#2 \s_@@ #3 \s_@@
  { \@@_decode_utf_xvi_error:nNN { extra } #1#2 }
\cs_new:Npn \@@_decode_utf_xvi_error:nNN #1#2#3
  {
    \flag_raise:N \l_@@_error_flag
    \flag_raise:c { l_@@_#1_flag }
    #2 #3 \s_@@
    \int_use:N \c_@@_replacement_char_int \s_@@
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% Restore the original catcodes of bytes $254$ and $255$.
%    \begin{macrocode}
\group_end:
%    \end{macrocode}
%
% \subsubsection{\textsc{utf-32} support}
%
% The definitions are done in a category code regime where the bytes
% $0$, $254$ and $255$ used by the byte order mark have catcode
% \enquote{other}.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_other:N \^^00
  \char_set_catcode_other:N \^^fe
  \char_set_catcode_other:N \^^ff
%    \end{macrocode}
%
% \begin{macro}
%   {
%     \@@_convert_encode_utf32: ,
%     \@@_convert_encode_utf32be: ,
%     \@@_convert_encode_utf32le: ,
%   }
% \begin{macro}[rEXP]
%   {
%     \@@_encode_utf_xxxii_be:n ,
%     \@@_encode_utf_xxxii_be_aux:nn ,
%     \@@_encode_utf_xxxii_le:n ,
%     \@@_encode_utf_xxxii_le_aux:nn ,
%   }
%   Convert each integer in the comma-list \cs{g_@@_result_tl} to a
%   sequence of four bytes. The functions for big-endian and
%   little-endian encodings are very similar, but the
%   \cs{@@_output_byte:n} instructions are reversed.
%
%   The \texttt{_aux} functions receive as |#1| the truncated quotient
%   of the code point by \hexnum{100}, and as |#2| the code point
%   itself.  The most significant of the four bytes is always the null
%   byte, the two middle bytes are produced from |#1| as a byte pair,
%   and the least significant byte is the remainder, computed as |#2|
%   minus |#1| times \hexnum{100}.  When the endianness is not
%   specified, big-endian is used and the byte-order mark is added to
%   the left of the result.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_encode_utf32: }
  {
    \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_be:n
    \tl_gput_left:Ne \g_@@_result_tl { ^^00 ^^00 ^^fe ^^ff }
  }
\cs_new_protected:cpn { @@_convert_encode_utf32be: }
  { \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_be:n }
\cs_new_protected:cpn { @@_convert_encode_utf32le: }
  { \@@_convert_gmap_internal:N \@@_encode_utf_xxxii_le:n }
\cs_new:Npn \@@_encode_utf_xxxii_be:n #1
  {
    \exp_args:Nf \@@_encode_utf_xxxii_be_aux:nn
      { \int_div_truncate:nn {#1} { "100 } } {#1}
  }
\cs_new:Npn \@@_encode_utf_xxxii_be_aux:nn #1#2
  {
    ^^00
    \@@_output_byte_pair_be:n {#1}
    \@@_output_byte:n { #2 - #1 * "100 }
  }
\cs_new:Npn \@@_encode_utf_xxxii_le:n #1
  {
    \exp_args:Nf \@@_encode_utf_xxxii_le_aux:nn
      { \int_div_truncate:nn {#1} { "100 } } {#1}
  }
\cs_new:Npn \@@_encode_utf_xxxii_le_aux:nn #1#2
  {
    \@@_output_byte:n { #2 - #1 * "100 }
    \@@_output_byte_pair_le:n {#1}
    ^^00
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{variable}{@@_overflow, @@_end}
%   There can be no error when encoding in \textsc{utf-32}. When
%   decoding, the string may not have length $4n$, or it may contain
%   code points larger than \hexnum{10FFFF}. The latter case often
%   happens if the encoding was in fact not \textsc{utf-32}, because
%   most arbitrary strings are not valid in \textsc{utf-32}.
%    \begin{macrocode}
\flag_clear_new:N \l_@@_overflow_flag
\flag_clear_new:N \l_@@_end_flag
%    \end{macrocode}
%   In the short message text each reported problem starts with a
%   comma: the list is \texttt{f}-expanded and its first token (the
%   leading comma) is then removed by \cs{use_none:n}.
%    \begin{macrocode}
\msg_new:nnnn { str } { utf32-decode }
  {
    Invalid~UTF-32~string:
    \exp_last_unbraced:Nf \use_none:n
      {
        \@@_if_flag_times:NT \l_@@_overflow_flag { ,~code~point~too~large }
        \@@_if_flag_times:NT \l_@@_end_flag      { ,~truncated~string }
      }
    .
  }
  {
    In~the~UTF-32~encoding,~every~Unicode~character~
    (in~the~range~[U+0000,~U+10FFFF])~is~encoded~as~4~bytes.
    \flag_if_raised:NT \l_@@_overflow_flag
      {
        \\\\
        LaTeX~came~across~a~code~point~larger~than~1114111,~
        the~maximum~code~point~defined~by~Unicode.~
        Perhaps~the~string~was~not~encoded~in~the~UTF-32~encoding?
      }
    \flag_if_raised:NT \l_@@_end_flag
      {
        \\\\
        The~length~of~the~string~is~not~a~multiple~of~4.~
        Perhaps~the~string~was~truncated?
      }
  }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}
%   {
%     \@@_convert_decode_utf32: ,
%     \@@_convert_decode_utf32be: ,
%     \@@_convert_decode_utf32le: ,
%   }
% \begin{macro}
%   {\@@_decode_utf_xxxii_bom:NNNN, \@@_decode_utf_xxxii:Nw}
% \begin{macro}[rEXP]
%   {\@@_decode_utf_xxxii_loop:NNNN, \@@_decode_utf_xxxii_end:w}
%
%   The structure is similar to \textsc{utf-16} decoding functions. If
%   the endianness is not given, test the first $4$ bytes of the string
%   (possibly \cs{s_@@_stop} if the string is too short) for the presence
%   of a byte-order mark. If there is a byte-order mark, use that
%   endianness, and remove the $4$ bytes, otherwise default to
%   big-endian, and leave the $4$ bytes in place. The
%   \cs{@@_decode_utf_xxxii:Nw} auxiliary receives $1$ or $2$ as its
%   first argument indicating endianness, and the string to convert as
%   its second argument (expanded or not). It sets \cs{@@_tmp:w} to
%   expand to the character code of either of its two arguments
%   depending on endianness, then triggers the \texttt{_loop} auxiliary
%   inside an \texttt{x}-expanding assignment to \cs{g_@@_result_tl}.
%
%   The \texttt{_loop} auxiliary first checks for the end-of-string
%   marker \cs{s_@@_stop}, calling the \texttt{_end} auxiliary if
%   appropriate. Otherwise, leave the \meta{4~bytes} \cs{s_@@} behind,
%   then check that the code point is not overflowing: the leading byte
%   must be $0$, and the following byte at most $16$.  The code point is
%   then computed from the three remaining bytes; in both overflow cases
%   the replacement character is used instead, after raising the
%   \texttt{overflow} and \texttt{error} flags.
%
%   In the ending code, we check that there remains no byte: there
%   should be nothing left until the first \cs{s_@@_stop}. Break the map.
%   If some bytes remain, they are left in the result with the
%   replacement character as their code point, after raising the
%   \texttt{end} and \texttt{error} flags.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_convert_decode_utf32be: }
  { \@@_decode_utf_xxxii:Nw 1 \g_@@_result_tl \s_@@_stop }
\cs_new_protected:cpn { @@_convert_decode_utf32le: }
  { \@@_decode_utf_xxxii:Nw 2 \g_@@_result_tl \s_@@_stop }
\cs_new_protected:cpn { @@_convert_decode_utf32: }
  {
    \exp_after:wN \@@_decode_utf_xxxii_bom:NNNN \g_@@_result_tl
      \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop
  }
\cs_new_protected:Npn \@@_decode_utf_xxxii_bom:NNNN #1#2#3#4
  {
    \str_if_eq:nnTF { #1#2#3#4 } { ^^ff ^^fe ^^00 ^^00 }
      { \@@_decode_utf_xxxii:Nw 2 }
      {
        \str_if_eq:nnTF { #1#2#3#4 } { ^^00 ^^00 ^^fe ^^ff }
          { \@@_decode_utf_xxxii:Nw 1 }
          { \@@_decode_utf_xxxii:Nw 1 #1#2#3#4 }
      }
  }
\cs_new_protected:Npn \@@_decode_utf_xxxii:Nw #1#2 \s_@@_stop
  {
    \flag_clear:N \l_@@_overflow_flag
    \flag_clear:N \l_@@_end_flag
    \flag_clear:N \l_@@_error_flag
    \cs_set:Npn \@@_tmp:w ##1 ##2 { ` ## #1 }
    \__kernel_tl_gset:Nx \g_@@_result_tl
      {
        \exp_after:wN \@@_decode_utf_xxxii_loop:NNNN
          #2 \s_@@_stop \s_@@_stop \s_@@_stop \s_@@_stop
        \prg_break_point:
      }
    \@@_if_flag_error:Nne \l_@@_error_flag { utf32-decode } { }
  }
\cs_new:Npn \@@_decode_utf_xxxii_loop:NNNN #1#2#3#4
  {
    \if_meaning:w \s_@@_stop #4
      \exp_after:wN \@@_decode_utf_xxxii_end:w
    \fi:
    #1#2#3#4 \s_@@
    \if_int_compare:w \@@_tmp:w #1#4 > \c_zero_int
      \flag_raise:N \l_@@_overflow_flag
      \flag_raise:N \l_@@_error_flag
      \int_use:N \c_@@_replacement_char_int
    \else:
      \if_int_compare:w \@@_tmp:w #2#3 > 16 \exp_stop_f:
        \flag_raise:N \l_@@_overflow_flag
        \flag_raise:N \l_@@_error_flag
        \int_use:N \c_@@_replacement_char_int
      \else:
        \int_eval:n
          { \@@_tmp:w #2#3*"10000 + \@@_tmp:w #3#2*"100 + \@@_tmp:w #4#1 }
      \fi:
    \fi:
    \s_@@
    \@@_decode_utf_xxxii_loop:NNNN
  }
\cs_new:Npn \@@_decode_utf_xxxii_end:w #1 \s_@@_stop
  {
    \tl_if_empty:nF {#1}
      {
        \flag_raise:N \l_@@_end_flag
        \flag_raise:N \l_@@_error_flag
        #1 \s_@@
        \int_use:N \c_@@_replacement_char_int \s_@@
      }
    \prg_break:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% Restore the original catcodes of bytes $0$, $254$ and $255$.
%    \begin{macrocode}
\group_end:
%    \end{macrocode}
%
% \subsection{PDF names and strings by expansion}
%
% \begin{macro}[EXP]{\str_convert_pdfname:n}
% \begin{macro}[EXP]{\@@_convert_pdfname:n}
% \begin{macro}[EXP]
%   {\@@_convert_pdfname_bytes:n, \@@_convert_pdfname_bytes_aux:n}
% \begin{macro}[EXP]{\@@_convert_pdfname_bytes_aux:nnnn}
%   To convert to PDF names by expansion, we work purely on UTF-8 input. The
%   first step is to make a string with \enquote{other} spaces,
%   after which we use a simple token-by-token approach. In Unicode
%   engines, every character beyond the one-byte codepoints (above
%   \hexnum{7F}) is first broken down into bytes, but for
%   $8$-bit engines there is no need to worry. Actual escaping is covered
%   by the same code as used in the non-expandable route.
%    \begin{macrocode}
\cs_new:Npn \str_convert_pdfname:n #1
  {
    \exp_args:Ne \tl_to_str:n
      { \str_map_function:nN {#1} \@@_convert_pdfname:n }
  }
%    \end{macrocode}
%   With OpenType-capable engines, a character above \hexnum{7F} is
%   converted to bytes by \cs{__kernel_codepoint_to_bytes:n}, whose
%   result is unpacked as four arguments (presumably four brace groups,
%   the last ones empty for short sequences).  The first two bytes are
%   always output, the third and fourth only when they are not blank,
%   each as a hash sign followed by its hexadecimal form.  The last
%   auxiliary is defined with \texttt{e}-expansion so that
%   \cs{c_hash_str} is expanded once and for all, while the functions
%   to be kept are protected by \cs{exp_not:N}.  Other characters, and
%   all characters with $8$-bit engines, go directly to
%   \cs{@@_escape_name_char:n}.
%    \begin{macrocode}
\sys_if_engine_opentype:TF
  {
    \cs_new:Npn \@@_convert_pdfname:n #1
      {
        \int_compare:nNnTF { `#1 } > { "7F }
          { \@@_convert_pdfname_bytes:n {#1} }
          { \@@_escape_name_char:n {#1} }
      }
    \cs_new:Npn \@@_convert_pdfname_bytes:n #1
      {
        \exp_args:Ne \@@_convert_pdfname_bytes_aux:n
          { \__kernel_codepoint_to_bytes:n {`#1} }
      }
    \cs_new:Npn \@@_convert_pdfname_bytes_aux:n #1
      { \@@_convert_pdfname_bytes_aux:nnnn #1 }
    \cs_new:Npe \@@_convert_pdfname_bytes_aux:nnnn #1#2#3#4
      {
        \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#1}
        \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#2}
        \exp_not:N \tl_if_blank:nF {#3}
          {
            \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#3}
            \exp_not:N \tl_if_blank:nF {#4}
              {
                \c_hash_str \exp_not:N \@@_output_hexadecimal:n {#4}
              }
          }
      }
  }
  { \cs_new_eq:NN \@@_convert_pdfname:n \@@_escape_name_char:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% The code of the package proper ends here: the encoding definitions
% which follow are each extracted under their own guard.
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \subsubsection{\textsc{iso 8859} support}
%
% The \textsc{iso-8859-1} encoding exactly matches with the $256$ first
% Unicode characters. For other 8-bit encodings of the \textsc{iso-8859}
% family, we keep track only of differences, and of unassigned bytes.
% Each declaration below is wrapped in its own pair of guards.  The
% third argument lists pairs \meta{byte} \meta{code point} (both
% hexadecimal) for the bytes which do not stand for themselves, and the
% fourth argument lists the unassigned bytes.
%    \begin{macrocode}
%<*iso88591>
\@@_declare_eight_bit_encoding:nnnn { iso88591 } { 256 }
  {
  }
  {
  }
%</iso88591>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88592>
\@@_declare_eight_bit_encoding:nnnn { iso88592 } { 399 }
  {
    { A1 } { 0104 } { A2 } { 02D8 } { A3 } { 0141 } { A5 } { 013D }
    { A6 } { 015A } { A9 } { 0160 } { AA } { 015E } { AB } { 0164 }
    { AC } { 0179 } { AE } { 017D } { AF } { 017B } { B1 } { 0105 }
    { B2 } { 02DB } { B3 } { 0142 } { B5 } { 013E } { B6 } { 015B }
    { B7 } { 02C7 } { B9 } { 0161 } { BA } { 015F } { BB } { 0165 }
    { BC } { 017A } { BD } { 02DD } { BE } { 017E } { BF } { 017C }
    { C0 } { 0154 } { C3 } { 0102 } { C5 } { 0139 } { C6 } { 0106 }
    { C8 } { 010C } { CA } { 0118 } { CC } { 011A } { CF } { 010E }
    { D0 } { 0110 } { D1 } { 0143 } { D2 } { 0147 } { D5 } { 0150 }
    { D8 } { 0158 } { D9 } { 016E } { DB } { 0170 } { DE } { 0162 }
    { E0 } { 0155 } { E3 } { 0103 } { E5 } { 013A } { E6 } { 0107 }
    { E8 } { 010D } { EA } { 0119 } { EC } { 011B } { EF } { 010F }
    { F0 } { 0111 } { F1 } { 0144 } { F2 } { 0148 } { F5 } { 0151 }
    { F8 } { 0159 } { F9 } { 016F } { FB } { 0171 } { FE } { 0163 }
    { FF } { 02D9 }
  }
  {
  }
%</iso88592>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88593>
\@@_declare_eight_bit_encoding:nnnn { iso88593 } { 384 }
  {
    { A1 } { 0126 } { A2 } { 02D8 } { A6 } { 0124 } { A9 } { 0130 }
    { AA } { 015E } { AB } { 011E } { AC } { 0134 } { AF } { 017B }
    { B1 } { 0127 } { B6 } { 0125 } { B9 } { 0131 } { BA } { 015F }
    { BB } { 011F } { BC } { 0135 } { BF } { 017C } { C5 } { 010A }
    { C6 } { 0108 } { D5 } { 0120 } { D8 } { 011C } { DD } { 016C }
    { DE } { 015C } { E5 } { 010B } { E6 } { 0109 } { F5 } { 0121 }
    { F8 } { 011D } { FD } { 016D } { FE } { 015D } { FF } { 02D9 }
  }
  {
    { A5 } { AE } { BE } { C3 } { D0 } { E3 } { F0 }
  }
%</iso88593>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88594>
\@@_declare_eight_bit_encoding:nnnn { iso88594 } { 383 }
  {
    { A1 } { 0104 } { A2 } { 0138 } { A3 } { 0156 } { A5 } { 0128 }
    { A6 } { 013B } { A9 } { 0160 } { AA } { 0112 } { AB } { 0122 }
    { AC } { 0166 } { AE } { 017D } { B1 } { 0105 } { B2 } { 02DB }
    { B3 } { 0157 } { B5 } { 0129 } { B6 } { 013C } { B7 } { 02C7 }
    { B9 } { 0161 } { BA } { 0113 } { BB } { 0123 } { BC } { 0167 }
    { BD } { 014A } { BE } { 017E } { BF } { 014B } { C0 } { 0100 }
    { C7 } { 012E } { C8 } { 010C } { CA } { 0118 } { CC } { 0116 }
    { CF } { 012A } { D0 } { 0110 } { D1 } { 0145 } { D2 } { 014C }
    { D3 } { 0136 } { D9 } { 0172 } { DD } { 0168 } { DE } { 016A }
    { E0 } { 0101 } { E7 } { 012F } { E8 } { 010D } { EA } { 0119 }
    { EC } { 0117 } { EF } { 012B } { F0 } { 0111 } { F1 } { 0146 }
    { F2 } { 014D } { F3 } { 0137 } { F9 } { 0173 } { FD } { 0169 }
    { FE } { 016B } { FF } { 02D9 }
  }
  {
  }
%</iso88594>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88595>
\@@_declare_eight_bit_encoding:nnnn { iso88595 } { 374 }
  {
    { A1 } { 0401 } { A2 } { 0402 } { A3 } { 0403 } { A4 } { 0404 }
    { A5 } { 0405 } { A6 } { 0406 } { A7 } { 0407 } { A8 } { 0408 }
    { A9 } { 0409 } { AA } { 040A } { AB } { 040B } { AC } { 040C }
    { AE } { 040E } { AF } { 040F } { B0 } { 0410 } { B1 } { 0411 }
    { B2 } { 0412 } { B3 } { 0413 } { B4 } { 0414 } { B5 } { 0415 }
    { B6 } { 0416 } { B7 } { 0417 } { B8 } { 0418 } { B9 } { 0419 }
    { BA } { 041A } { BB } { 041B } { BC } { 041C } { BD } { 041D }
    { BE } { 041E } { BF } { 041F } { C0 } { 0420 } { C1 } { 0421 }
    { C2 } { 0422 } { C3 } { 0423 } { C4 } { 0424 } { C5 } { 0425 }
    { C6 } { 0426 } { C7 } { 0427 } { C8 } { 0428 } { C9 } { 0429 }
    { CA } { 042A } { CB } { 042B } { CC } { 042C } { CD } { 042D }
    { CE } { 042E } { CF } { 042F } { D0 } { 0430 } { D1 } { 0431 }
    { D2 } { 0432 } { D3 } { 0433 } { D4 } { 0434 } { D5 } { 0435 }
    { D6 } { 0436 } { D7 } { 0437 } { D8 } { 0438 } { D9 } { 0439 }
    { DA } { 043A } { DB } { 043B } { DC } { 043C } { DD } { 043D }
    { DE } { 043E } { DF } { 043F } { E0 } { 0440 } { E1 } { 0441 }
    { E2 } { 0442 } { E3 } { 0443 } { E4 } { 0444 } { E5 } { 0445 }
    { E6 } { 0446 } { E7 } { 0447 } { E8 } { 0448 } { E9 } { 0449 }
    { EA } { 044A } { EB } { 044B } { EC } { 044C } { ED } { 044D }
    { EE } { 044E } { EF } { 044F } { F0 } { 2116 } { F1 } { 0451 }
    { F2 } { 0452 } { F3 } { 0453 } { F4 } { 0454 } { F5 } { 0455 }
    { F6 } { 0456 } { F7 } { 0457 } { F8 } { 0458 } { F9 } { 0459 }
    { FA } { 045A } { FB } { 045B } { FC } { 045C } { FD } { 00A7 }
    { FE } { 045E } { FF } { 045F }
  }
  {
  }
%</iso88595>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88596>
\@@_declare_eight_bit_encoding:nnnn { iso88596 } { 344 }
  {
    { AC } { 060C } { BB } { 061B } { BF } { 061F } { C1 } { 0621 }
    { C2 } { 0622 } { C3 } { 0623 } { C4 } { 0624 } { C5 } { 0625 }
    { C6 } { 0626 } { C7 } { 0627 } { C8 } { 0628 } { C9 } { 0629 }
    { CA } { 062A } { CB } { 062B } { CC } { 062C } { CD } { 062D }
    { CE } { 062E } { CF } { 062F } { D0 } { 0630 } { D1 } { 0631 }
    { D2 } { 0632 } { D3 } { 0633 } { D4 } { 0634 } { D5 } { 0635 }
    { D6 } { 0636 } { D7 } { 0637 } { D8 } { 0638 } { D9 } { 0639 }
    { DA } { 063A } { E0 } { 0640 } { E1 } { 0641 } { E2 } { 0642 }
    { E3 } { 0643 } { E4 } { 0644 } { E5 } { 0645 } { E6 } { 0646 }
    { E7 } { 0647 } { E8 } { 0648 } { E9 } { 0649 } { EA } { 064A }
    { EB } { 064B } { EC } { 064C } { ED } { 064D } { EE } { 064E }
    { EF } { 064F } { F0 } { 0650 } { F1 } { 0651 } { F2 } { 0652 }
  }
  {
    { A1 } { A2 } { A3 } { A5 } { A6 } { A7 } { A8 } { A9 }
    { AA } { AB } { AE } { AF } { B0 } { B1 } { B2 } { B3 }
    { B4 } { B5 } { B6 } { B7 } { B8 } { B9 } { BA } { BC }
    { BD } { BE } { C0 } { DB } { DC } { DD } { DE } { DF }
  }
%</iso88596>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88597>
\@@_declare_eight_bit_encoding:nnnn { iso88597 } { 498 }
  {
    { A1 } { 2018 } { A2 } { 2019 } { A4 } { 20AC } { A5 } { 20AF }
    { AA } { 037A } { AF } { 2015 } { B4 } { 0384 } { B5 } { 0385 }
    { B6 } { 0386 } { B8 } { 0388 } { B9 } { 0389 } { BA } { 038A }
    { BC } { 038C } { BE } { 038E } { BF } { 038F } { C0 } { 0390 }
    { C1 } { 0391 } { C2 } { 0392 } { C3 } { 0393 } { C4 } { 0394 }
    { C5 } { 0395 } { C6 } { 0396 } { C7 } { 0397 } { C8 } { 0398 }
    { C9 } { 0399 } { CA } { 039A } { CB } { 039B } { CC } { 039C }
    { CD } { 039D } { CE } { 039E } { CF } { 039F } { D0 } { 03A0 }
    { D1 } { 03A1 } { D3 } { 03A3 } { D4 } { 03A4 } { D5 } { 03A5 }
    { D6 } { 03A6 } { D7 } { 03A7 } { D8 } { 03A8 } { D9 } { 03A9 }
    { DA } { 03AA } { DB } { 03AB } { DC } { 03AC } { DD } { 03AD }
    { DE } { 03AE } { DF } { 03AF } { E0 } { 03B0 } { E1 } { 03B1 }
    { E2 } { 03B2 } { E3 } { 03B3 } { E4 } { 03B4 } { E5 } { 03B5 }
    { E6 } { 03B6 } { E7 } { 03B7 } { E8 } { 03B8 } { E9 } { 03B9 }
    { EA } { 03BA } { EB } { 03BB } { EC } { 03BC } { ED } { 03BD }
    { EE } { 03BE } { EF } { 03BF } { F0 } { 03C0 } { F1 } { 03C1 }
    { F2 } { 03C2 } { F3 } { 03C3 } { F4 } { 03C4 } { F5 } { 03C5 }
    { F6 } { 03C6 } { F7 } { 03C7 } { F8 } { 03C8 } { F9 } { 03C9 }
    { FA } { 03CA } { FB } { 03CB } { FC } { 03CC } { FD } { 03CD }
    { FE } { 03CE }
  }
  {
    { AE } { D2 }
  }
%</iso88597>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88598>
\@@_declare_eight_bit_encoding:nnnn { iso88598 } { 308 }
  {
    { AA } { 00D7 } { BA } { 00F7 } { DF } { 2017 } { E0 } { 05D0 }
    { E1 } { 05D1 } { E2 } { 05D2 } { E3 } { 05D3 } { E4 } { 05D4 }
    { E5 } { 05D5 } { E6 } { 05D6 } { E7 } { 05D7 } { E8 } { 05D8 }
    { E9 } { 05D9 } { EA } { 05DA } { EB } { 05DB } { EC } { 05DC }
    { ED } { 05DD } { EE } { 05DE } { EF } { 05DF } { F0 } { 05E0 }
    { F1 } { 05E1 } { F2 } { 05E2 } { F3 } { 05E3 } { F4 } { 05E4 }
    { F5 } { 05E5 } { F6 } { 05E6 } { F7 } { 05E7 } { F8 } { 05E8 }
    { F9 } { 05E9 } { FA } { 05EA } { FD } { 200E } { FE } { 200F }
  }
  {
    { A1 } { BF } { C0 } { C1 } { C2 } { C3 } { C4 } { C5 }
    { C6 } { C7 } { C8 } { C9 } { CA } { CB } { CC } { CD }
    { CE } { CF } { D0 } { D1 } { D2 } { D3 } { D4 } { D5 }
    { D6 } { D7 } { D8 } { D9 } { DA } { DB } { DC } { DD }
    { DE } { FB } { FC }
  }
%</iso88598>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso88599>
\@@_declare_eight_bit_encoding:nnnn { iso88599 } { 352 }
  {
    { D0 } { 011E } { DD } { 0130 } { DE } { 015E } { F0 } { 011F }
    { FD } { 0131 } { FE } { 015F }
  }
  {
  }
%</iso88599>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885910>
\@@_declare_eight_bit_encoding:nnnn { iso885910 } { 383 }
  {
    { A1 } { 0104 } { A2 } { 0112 } { A3 } { 0122 } { A4 } { 012A }
    { A5 } { 0128 } { A6 } { 0136 } { A8 } { 013B } { A9 } { 0110 }
    { AA } { 0160 } { AB } { 0166 } { AC } { 017D } { AE } { 016A }
    { AF } { 014A } { B1 } { 0105 } { B2 } { 0113 } { B3 } { 0123 }
    { B4 } { 012B } { B5 } { 0129 } { B6 } { 0137 } { B8 } { 013C }
    { B9 } { 0111 } { BA } { 0161 } { BB } { 0167 } { BC } { 017E }
    { BD } { 2015 } { BE } { 016B } { BF } { 014B } { C0 } { 0100 }
    { C7 } { 012E } { C8 } { 010C } { CA } { 0118 } { CC } { 0116 }
    { D1 } { 0145 } { D2 } { 014C } { D7 } { 0168 } { D9 } { 0172 }
    { E0 } { 0101 } { E7 } { 012F } { E8 } { 010D } { EA } { 0119 }
    { EC } { 0117 } { F1 } { 0146 } { F2 } { 014D } { F7 } { 0169 }
    { F9 } { 0173 } { FF } { 0138 }
  }
  {
  }
%</iso885910>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885911>
\@@_declare_eight_bit_encoding:nnnn { iso885911 } { 369 }
  {
    { A1 } { 0E01 } { A2 } { 0E02 } { A3 } { 0E03 } { A4 } { 0E04 }
    { A5 } { 0E05 } { A6 } { 0E06 } { A7 } { 0E07 } { A8 } { 0E08 }
    { A9 } { 0E09 } { AA } { 0E0A } { AB } { 0E0B } { AC } { 0E0C }
    { AD } { 0E0D } { AE } { 0E0E } { AF } { 0E0F } { B0 } { 0E10 }
    { B1 } { 0E11 } { B2 } { 0E12 } { B3 } { 0E13 } { B4 } { 0E14 }
    { B5 } { 0E15 } { B6 } { 0E16 } { B7 } { 0E17 } { B8 } { 0E18 }
    { B9 } { 0E19 } { BA } { 0E1A } { BB } { 0E1B } { BC } { 0E1C }
    { BD } { 0E1D } { BE } { 0E1E } { BF } { 0E1F } { C0 } { 0E20 }
    { C1 } { 0E21 } { C2 } { 0E22 } { C3 } { 0E23 } { C4 } { 0E24 }
    { C5 } { 0E25 } { C6 } { 0E26 } { C7 } { 0E27 } { C8 } { 0E28 }
    { C9 } { 0E29 } { CA } { 0E2A } { CB } { 0E2B } { CC } { 0E2C }
    { CD } { 0E2D } { CE } { 0E2E } { CF } { 0E2F } { D0 } { 0E30 }
    { D1 } { 0E31 } { D2 } { 0E32 } { D3 } { 0E33 } { D4 } { 0E34 }
    { D5 } { 0E35 } { D6 } { 0E36 } { D7 } { 0E37 } { D8 } { 0E38 }
    { D9 } { 0E39 } { DA } { 0E3A } { DF } { 0E3F } { E0 } { 0E40 }
    { E1 } { 0E41 } { E2 } { 0E42 } { E3 } { 0E43 } { E4 } { 0E44 }
    { E5 } { 0E45 } { E6 } { 0E46 } { E7 } { 0E47 } { E8 } { 0E48 }
    { E9 } { 0E49 } { EA } { 0E4A } { EB } { 0E4B } { EC } { 0E4C }
    { ED } { 0E4D } { EE } { 0E4E } { EF } { 0E4F } { F0 } { 0E50 }
    { F1 } { 0E51 } { F2 } { 0E52 } { F3 } { 0E53 } { F4 } { 0E54 }
    { F5 } { 0E55 } { F6 } { 0E56 } { F7 } { 0E57 } { F8 } { 0E58 }
    { F9 } { 0E59 } { FA } { 0E5A } { FB } { 0E5B }
  }
  {
    { DB } { DC } { DD } { DE }
  }
%</iso885911>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885913>
\@@_declare_eight_bit_encoding:nnnn { iso885913 } { 399 }
  {
    { A1 } { 201D } { A5 } { 201E } { A8 } { 00D8 } { AA } { 0156 }
    { AF } { 00C6 } { B4 } { 201C } { B8 } { 00F8 } { BA } { 0157 }
    { BF } { 00E6 } { C0 } { 0104 } { C1 } { 012E } { C2 } { 0100 }
    { C3 } { 0106 } { C6 } { 0118 } { C7 } { 0112 } { C8 } { 010C }
    { CA } { 0179 } { CB } { 0116 } { CC } { 0122 } { CD } { 0136 }
    { CE } { 012A } { CF } { 013B } { D0 } { 0160 } { D1 } { 0143 }
    { D2 } { 0145 } { D4 } { 014C } { D8 } { 0172 } { D9 } { 0141 }
    { DA } { 015A } { DB } { 016A } { DD } { 017B } { DE } { 017D }
    { E0 } { 0105 } { E1 } { 012F } { E2 } { 0101 } { E3 } { 0107 }
    { E6 } { 0119 } { E7 } { 0113 } { E8 } { 010D } { EA } { 017A }
    { EB } { 0117 } { EC } { 0123 } { ED } { 0137 } { EE } { 012B }
    { EF } { 013C } { F0 } { 0161 } { F1 } { 0144 } { F2 } { 0146 }
    { F4 } { 014D } { F8 } { 0173 } { F9 } { 0142 } { FA } { 015B }
    { FB } { 016B } { FD } { 017C } { FE } { 017E } { FF } { 2019 }
  }
  {
  }
%</iso885913>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885914>
\@@_declare_eight_bit_encoding:nnnn { iso885914 } { 529 }
  {
    { A1 } { 1E02 } { A2 } { 1E03 } { A4 } { 010A } { A5 } { 010B }
    { A6 } { 1E0A } { A8 } { 1E80 } { AA } { 1E82 } { AB } { 1E0B }
    { AC } { 1EF2 } { AF } { 0178 } { B0 } { 1E1E } { B1 } { 1E1F }
    { B2 } { 0120 } { B3 } { 0121 } { B4 } { 1E40 } { B5 } { 1E41 }
    { B7 } { 1E56 } { B8 } { 1E81 } { B9 } { 1E57 } { BA } { 1E83 }
    { BB } { 1E60 } { BC } { 1EF3 } { BD } { 1E84 } { BE } { 1E85 }
    { BF } { 1E61 } { D0 } { 0174 } { D7 } { 1E6A } { DE } { 0176 }
    { F0 } { 0175 } { F7 } { 1E6B } { FE } { 0177 }
  }
  {
  }
%</iso885914>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885915>
\@@_declare_eight_bit_encoding:nnnn { iso885915 } { 383 }
  {
    { A4 } { 20AC } { A6 } { 0160 } { A8 } { 0161 } { B4 } { 017D }
    { B8 } { 017E } { BC } { 0152 } { BD } { 0153 } { BE } { 0178 }
  }
  {
  }
%</iso885915>
%    \end{macrocode}
%
%    \begin{macrocode}
%<*iso885916>
\@@_declare_eight_bit_encoding:nnnn { iso885916 } { 558 }
  {
    { A1 } { 0104 } { A2 } { 0105 } { A3 } { 0141 } { A4 } { 20AC }
    { A5 } { 201E } { A6 } { 0160 } { A8 } { 0161 } { AA } { 0218 }
    { AC } { 0179 } { AE } { 017A } { AF } { 017B } { B2 } { 010C }
    { B3 } { 0142 } { B4 } { 017D } { B5 } { 201D } { B8 } { 017E }
    { B9 } { 010D } { BA } { 0219 } { BC } { 0152 } { BD } { 0153 }
    { BE } { 0178 } { BF } { 017C } { C3 } { 0102 } { C5 } { 0106 }
    { D0 } { 0110 } { D1 } { 0143 } { D5 } { 0150 } { D7 } { 015A }
    { D8 } { 0170 } { DD } { 0118 } { DE } { 021A } { E3 } { 0103 }
    { E5 } { 0107 } { F0 } { 0111 } { F1 } { 0144 } { F5 } { 0151 }
    { F7 } { 015B } { F8 } { 0171 } { FD } { 0119 } { FE } { 021B }
  }
  {
  }
%</iso885916>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex