diff --git a/source/lex.tex b/source/lex.tex index d034d35f94..d436676ae3 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -110,9 +110,9 @@ \indextext{line splicing}% If the first translation character is \unicode{feff}{byte order mark}, it is deleted. -Each sequence of a backslash character (\textbackslash) +Each sequence of a backslash character (\unicode{005c}{reverse solidus}) immediately followed by -zero or more whitespace characters other than new-line followed by +zero or more \grammarterm{whitespace-character}s other than new-line followed by a new-line character is deleted, splicing physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last backslash on any physical source line shall be eligible for being part @@ -126,9 +126,13 @@ shall be processed as if an additional new-line character were appended to the file. -\item The source file is decomposed into preprocessing -tokens\iref{lex.pptoken} and sequences of whitespace characters -(including comments). A source file shall not end in a partial +\item +\indextext{whitespace}% +\indextext{comment}% +\indextext{token!preprocessing}% +The source file is decomposed into preprocessing +tokens\iref{lex.pptoken} and whitespace\iref{lex.whitespace} (sequences of \grammarterm{whitespace-character}s +and comments). A source file shall not end in a partial preprocessing token or in a partial comment. \begin{footnote} A partial preprocessing @@ -140,9 +144,9 @@ would arise from a source file ending with an unclosed \tcode{/*} comment. \end{footnote} -Each comment\iref{lex.comment} is replaced by one space character. New-line characters are -retained. Whether each nonempty sequence of whitespace characters other -than new-line is retained or replaced by one space character is +Each comment\iref{lex.comment} is replaced by one \unicode{0020}{space} character. New-line characters are +retained. 
Whether each nonempty sequence of \grammarterm{whitespace-character}s other +than new-line is retained or replaced by one \unicode{0020}{space} character is unspecified. As characters from the source file are consumed to form the next preprocessing token @@ -178,7 +182,8 @@ \item Adjacent \grammarterm{string-literal} tokens are concatenated\iref{lex.string}. -\item Whitespace characters separating tokens are no longer +\item +Any \grammarterm{whitespace-character}s separating tokens are no longer significant. Each preprocessing token is converted into a token\iref{lex.token}. The resulting tokens constitute a \defn{translation unit} and @@ -467,7 +472,28 @@ None of these names or aliases have leading or trailing spaces. \end{note} -\rSec1[lex.comment]{Comments} +\rSec1[lex.whitespace]{Whitespace} +\indextext{whitespace|(}% + +\rSec2[lex.whitechar]{Whitespace characters} + +\indextext{character!whitespace|(}% +\begin{bnf} +\nontermdef{whitespace-character}\br + \unicode{0009}{character tabulation}\br + \textnormal{new-line}\br + \unicode{000b}{line tabulation}\br + \unicode{000c}{form feed}\br + \unicode{0020}{space} +\end{bnf} + +\pnum +\begin{note} +Whitespace characters are used to separate elements of the \Cpp grammar. +\end{note} +\indextext{character!whitespace|)} + +\rSec2[lex.comment]{Comments} \pnum \indextext{comment|(}% @@ -477,8 +503,8 @@ characters \tcode{*/}. These comments do not nest. \indextext{comment!\tcode{//}}% The characters \tcode{//} start a comment, which terminates immediately before the -next new-line character. If there is a form-feed or a vertical-tab -character in such a comment, only whitespace characters shall appear +next new-line character. If there is a \unicode{000c}{form feed} or a \unicode{000b}{line tabulation} +character in such a comment, only \grammarterm{whitespace-character}s shall appear between it and the new-line that terminates the comment; no diagnostic is required. 
\begin{note} @@ -489,6 +515,7 @@ \tcode{/*} comment. \end{note} \indextext{comment|)} +\indextext{whitespace|)}% \rSec1[lex.pptoken]{Preprocessing tokens} @@ -506,7 +533,7 @@ string-literal\br user-defined-string-literal\br preprocessing-op-or-punc\br - \textnormal{each non-whitespace character that cannot be one of the above} + \textnormal{each non-\grammarterm{whitespace-character} that cannot be one of the above} \end{bnf} \pnum @@ -520,7 +547,7 @@ (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}), identifiers, preprocessing numbers, character literals (including user-defined character literals), string literals (including user-defined string literals), preprocessing -operators and punctuators, and single non-whitespace characters that do not lexically +operators and punctuators, and single non-\grammarterm{whitespace-character}s that do not lexically match the other preprocessing token categories. If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character matches the last category, the program is ill-formed. @@ -530,12 +557,9 @@ \indextext{whitespace}% whitespace; \indextext{comment}% -this consists of comments\iref{lex.comment}, or whitespace characters -(\unicode{0020}{space}, -\unicode{0009}{character tabulation}, -new-line, -\unicode{000b}{line tabulation}, and -\unicode{000c}{form feed}), or both. +this consists of comments\iref{lex.comment}, +\grammarterm{whitespace-character}s, or +both. As described in \ref{cpp}, in certain circumstances during translation phase 4, whitespace (or the absence thereof) serves as more than preprocessing token separation. Whitespace @@ -673,13 +697,13 @@ external source file names as specified in~\ref{cpp.include}. 
\pnum -The appearance of either of the characters \tcode{'} or \tcode{\textbackslash} or of +The appearance of either of the characters \unicode{0027}{apostrophe} or \unicode{005c}{reverse solidus} or of either of the character sequences \tcode{/*} or \tcode{//} in a \grammarterm{q-char-sequence} or an \grammarterm{h-char-sequence} is conditionally-supported with \impldef{meaning of \tcode{'}, \tcode{\textbackslash}, \tcode{/*}, or \tcode{//} in a \grammarterm{q-char-sequence} or an \grammarterm{h-char-sequence}} semantics, as is the appearance of the character -\tcode{"} in an \grammarterm{h-char-sequence}. +\unicode{0022}{quotation mark} in an \grammarterm{h-char-sequence}. \begin{footnote} Thus, a sequence of characters that resembles an escape sequence can result in an error, be interpreted as the @@ -826,9 +850,7 @@ \end{footnote} operators, and other separators. \indextext{whitespace}% -Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments -(collectively, ``whitespace''), as described below, are ignored except -as they serve to separate tokens. +Whitespace\iref{lex.whitespace} is ignored except to separate tokens. \begin{note} Whitespace can separate otherwise adjacent identifiers, keywords, numeric literals, and alternative tokens containing alphabetic characters. @@ -1790,8 +1812,8 @@ \begin{bnf} \nontermdef{d-char}\br \textnormal{any member of the basic character set except:}\br - \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br - \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line} + \bnfindent\textnormal{a \grammarterm{whitespace-character}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis},}\br + \bnfindent\textnormal{and \unicode{005c}{reverse solidus}} \end{bnf} \pnum