*** /dev/null Sat Nov 23 01:14:39 2024
--- - Sat Nov 23 01:14:58 2024
***************
*** 0 ****
--- 1,900 ----
+ .rn '' }`
+ ''' $RCSfile$$Revision$$Date$
+ '''
+ ''' $Log$
+ '''
+ .de Sh
+ .br
+ .if t .Sp
+ .ne 5
+ .PP
+ \fB\\$1\fR
+ .PP
+ ..
+ .de Sp
+ .if t .sp .5v
+ .if n .sp
+ ..
+ .de Ip
+ .br
+ .ie \\n(.$>=3 .ne \\$3
+ .el .ne 3
+ .IP "\\$1" \\$2
+ ..
+ .de Vb
+ .ft CW
+ .nf
+ .ne \\$1
+ ..
+ .de Ve
+ .ft R
+
+ .fi
+ ..
+ '''
+ '''
+ ''' Set up \*(-- to give an unbreakable dash;
+ ''' string Tr holds user defined translation string.
+ ''' Bell System Logo is used as a dummy character.
+ '''
+ .tr \(*W-|\(bv\*(Tr
+ .ie n \{\
+ .ds -- \(*W-
+ .ds PI pi
+ .if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+ .if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
+ .ds L" ""
+ .ds R" ""
+ ''' \*(M", \*(S", \*(N" and \*(T" are the equivalent of
+ ''' \*(L" and \*(R", except that they are used on ".xx" lines,
+ ''' such as .IP and .SH, which do another additional levels of
+ ''' double-quote interpretation
+ .ds M" """
+ .ds S" """
+ .ds N" """""
+ .ds T" """""
+ .ds L' '
+ .ds R' '
+ .ds M' '
+ .ds S' '
+ .ds N' '
+ .ds T' '
+ 'br\}
+ .el\{\
+ .ds -- \(em\|
+ .tr \*(Tr
+ .ds L" ``
+ .ds R" ''
+ .ds M" ``
+ .ds S" ''
+ .ds N" ``
+ .ds T" ''
+ .ds L' `
+ .ds R' '
+ .ds M' `
+ .ds S' '
+ .ds N' `
+ .ds T' '
+ .ds PI \(*p
+ 'br\}
+ .\" If the F register is turned on, we'll generate
+ .\" index entries out stderr for the following things:
+ .\" TH Title
+ .\" SH Header
+ .\" Sh Subsection
+ .\" Ip Item
+ .\" X<> Xref (embedded
+ .\" Of course, you have to process the output yourself
+ .\" in some meaningful fashion.
+ .if \nF \{
+ .de IX
+ .tm Index:\\$1\t\\n%\t"\\$2"
+ ..
+ .nr % 0
+ .rr F
+ .\}
+ .TH str 3 "14-Jul-2000" "Str 0.9.5" "String Library"
+ .UC
+ .if n .hy 0
+ .if n .na
+ .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+ .de CQ \" put $1 in typewriter font
+ .ft CW
+ 'if n "\c
+ 'if t \\&\\$1\c
+ 'if n \\&\\$1\c
+ 'if n \&"
+ \\&\\$2 \\$3 \\$4 \\$5 \\$6 \\$7
+ '.ft R
+ ..
+ .\" @(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2
+ . \" AM - accent mark definitions
+ .bd B 3
+ . \" fudge factors for nroff and troff
+ .if n \{\
+ . ds #H 0
+ . ds #V .8m
+ . ds #F .3m
+ . ds #[ \f1
+ . ds #] \fP
+ .\}
+ .if t \{\
+ . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+ . ds #V .6m
+ . ds #F 0
+ . ds #[ \&
+ . ds #] \&
+ .\}
+ . \" simple accents for nroff and troff
+ .if n \{\
+ . ds ' \&
+ . ds ` \&
+ . ds ^ \&
+ . ds , \&
+ . ds ~ ~
+ . ds ? ?
+ . ds ! !
+ . ds /
+ . ds q
+ .\}
+ .if t \{\
+ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+ . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+ . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+ . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+ . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+ . ds ? \s-2c\h'-\w'c'u*7/10'\u\h'\*(#H'\zi\d\s+2\h'\w'c'u*8/10'
+ . ds ! \s-2\(or\s+2\h'-\w'\(or'u'\v'-.8m'.\v'.8m'
+ . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+ . ds q o\h'-\w'o'u*8/10'\s-4\v'.4m'\z\(*i\v'-.4m'\s+4\h'\w'o'u*8/10'
+ .\}
+ . \" troff and (daisy-wheel) nroff accents
+ .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+ .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+ .ds v \\k:\h'-(\\n(.wu*9/10-\*(#H)'\v'-\*(#V'\*(#[\s-4v\s0\v'\*(#V'\h'|\\n:u'\*(#]
+ .ds _ \\k:\h'-(\\n(.wu*9/10-\*(#H+(\*(#F*2/3))'\v'-.4m'\z\(hy\v'.4m'\h'|\\n:u'
+ .ds . \\k:\h'-(\\n(.wu*8/10)'\v'\*(#V*4/10'\z.\v'-\*(#V*4/10'\h'|\\n:u'
+ .ds 3 \*(#[\v'.2m'\s-2\&3\s0\v'-.2m'\*(#]
+ .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+ .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+ .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+ .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+ .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+ .ds ae a\h'-(\w'a'u*4/10)'e
+ .ds Ae A\h'-(\w'A'u*4/10)'E
+ .ds oe o\h'-(\w'o'u*4/10)'e
+ .ds Oe O\h'-(\w'O'u*4/10)'E
+ . \" corrections for vroff
+ .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+ .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+ . \" for low resolution devices (crt and lpr)
+ .if \n(.H>23 .if \n(.V>19 \
+ \{\
+ . ds : e
+ . ds 8 ss
+ . ds v \h'-1'\o'\(aa\(ga'
+ . ds _ \h'-1'^
+ . ds . \h'-1'.
+ . ds 3 3
+ . ds o a
+ . ds d- d\h'-1'\(ga
+ . ds D- D\h'-1'\(hy
+ . ds th \o'bp'
+ . ds Th \o'LP'
+ . ds ae ae
+ . ds Ae AE
+ . ds oe oe
+ . ds Oe OE
+ .\}
+ .rm #[ #] #H #V #F C
+ .SH "NAME"
+ \fBStr\fR \- String Library
+ .SH "VERSION"
+ Str 0.9.5 (14-Jul-2000)
+ .SH "SYNOPSIS"
+ \fBstr_len\fR,
+ \fBstr_copy\fR,
+ \fBstr_dup\fR,
+ \fBstr_concat\fR,
+ \fBstr_splice\fR,
+ \fBstr_compare\fR,
+ \fBstr_span\fR,
+ \fBstr_locate\fR,
+ \fBstr_token\fR,
+ \fBstr_parse\fR,
+ \fBstr_format\fR,
+ \fBstr_hash\fR,
+ \fBstr_base64\fR.
+ .SH "DESCRIPTION"
+ The \fBStr\fR library is a generic string library written in ANSI C which
+ provides functions for handling, matching, parsing, searching and
+ formatting of C strings. So it can be considered as a superset of POSIX
+ \fIstring\fR\|(3), but its main intention is to provide a more convenient and
+ compact API plus a more generalized functionality.
+ .SH "FUNCTIONS"
+ The following functions are provided by the \fBStr\fR API:
+ .Ip "str_size_t \fBstr_len\fR(const char *\fIs\fR);" 4
+ This function determines the length of string \fIs\fR, i.e., the number
+ of characters starting at \fIs\fR that precede the terminating \f(CWNUL\fR
+ character. It returns \f(CWNULL\fR if \fIs\fR is \f(CWNULL\fR.
+ .Ip "char *\fBstr_copy\fR(char *\fIs\fR, const char *\fIt\fR, size_t \fIn\fR);" 4
+ This copies the characters in string \fIt\fR into the string \fIs\fR, but never more
+ than \fIn\fR characters (if \fIn\fR is greater than \f(CW0\fR). The two involved strings
+ can overlap and the characters in \fIs\fR are always \f(CWNUL\fR\-terminated. The
+ string \fIs\fR has to be large enough to hold all characters to be copied.
+ The function returns \f(CWNULL\fR if \fIs\fR or \fIt\fR are \f(CWNULL\fR. Else it returns the
+ pointer to the written \f(CWNUL\fR\-terminating character in \fIs\fR.
+ .Ip "char *\fBstr_dup\fR(const char *\fIs\fR, str_size_t \fIn\fR);" 4
+ This returns a copy of the characters in string \fIs\fR, but never more than \fIn\fR
+ characters if \fIn\fR is greater than \f(CW0\fR. It returns \f(CWNULL\fR if \fIs\fR is
+ \f(CWNULL\fR. The returned string has to be deallocated later with \fIfree\fR\|(3).
+ .Ip "char *\fBstr_concat\fR(char *\fIs\fR, ...);" 4
+ This function concatenates the characters of all string arguments into a newly
+ allocated string and returns this new string. If \fIs\fR is \f(CWNULL\fR the function
+ returns \f(CWNULL\fR. Else it returns the pointer to the written final
+ \f(CWNUL\fR\-terminating character in \fIs\fR. The returned string later has to be
+ deallocated with \fIfree\fR\|(3).
+ .Ip "char *\fBstr_splice\fR(char *\fIs\fR, str_size_t \fIoff\fR, str_size_t \fIn\fR, char *\fIt\fR, str_size_t \fIm\fR);" 4
+ This splices the string \fIt\fR into string \fIs\fR, i.e., the \fIn\fR characters
+ at offset \fIoff\fR in \fIs\fR are removed and at their location the string
+ \fIt\fR is inserted (or just the first \fIm\fR characters of \fIt\fR if \fIm\fR is
+ greater than \f(CW0\fR). It returns \f(CWNULL\fR if \fIs\fR or \fIt\fR are \f(CWNULL\fR.
+ Else the string \fIs\fR is returned. The function supports also the
+ situation where \fIt\fR is a sub-string of \fIs\fR as long as the area
+ \fIs+off\fR...\fIs+off+n\fR and \fIt\fR...\fIt+m\fR do not overlap. The caller
+ always has to make sure that enough room exists in \fIs\fR.
+ .Ip "int \fBstr_compare\fR(const char *\fIs\fR, const char *\fIt\fR, str_size_t \fIn\fR, int \fImode\fR);" 4
+ This performs a lexicographical comparison of the two strings \fIs\fR
+ and \fIt\fR (but never compares more than \fIn\fR characters of them)
+ and returns one of three return values: a value lower than \f(CW0\fR if
+ \fIs\fR is lexicographically lower than \fIt\fR, a value of exactly \f(CW0\fR
+ if \fIs\fR and \fIt\fR are equal and a value greater than \f(CW0\fR if \fIs\fR is
+ lexicographically higher than \fIt\fR. Per default (\fImode\fR is \f(CW0\fR) the
+ comparison is case-sensitive, but if \f(CWSTR_NOCASE\fR is used for \fImode\fR
+ the comparison is done in a case-insensitive way.
+ .Ip "char *\fBstr_span\fR(const char *\fIs\fR, size_t \fIn\fR, const char *\fIcharset\fR, int \fImode\fR);" 4
+ This function spans a string \fIs\fR according to the characters specified in
+ \fIcharset\fR. If \fImode\fR is \f(CW0\fR, this means that \fIs\fR is spanned from left to
+ right starting at \fIs\fR (and ending either when reaching the terminating \f(CWNUL\fR
+ character or already after \fIn\fR spanned characters) as long as the characters
+ of \fIs\fR are contained in \fIcharset\fR.
+ .Sp
+ Alternatively one can use a \fImode\fR of \f(CWSTR_COMPLEMENT\fR to indicate that \fIs\fR
+ is spanned as long as the characters of \fIs\fR are \fInot\fR contained in
+ \fIcharset\fR, i.e., \fIcharset\fR then specifies the complement of the spanning
+ characters.
+ .Sp
+ In both cases one can additionally \*(L"or\*(R" (with the C operator ``\f(CW|\fR'')
+ \f(CWSTR_RIGHT\fR into \fImode\fR to indicate that the spanning is done right to
+ left starting at the terminating \f(CWNUL\fR character of \fIs\fR (and ending
+ either when reaching \fIs\fR or already after \fIn\fR spanned characters).
+ .Ip "char *\fBstr_locate\fR(const char *\fIs\fR, str_size_t \fIn\fR, const char *\fIt\fR);" 4
+ This function searches for the (smaller) string \fIt\fR inside (larger) string
+ \fIs\fR. If \fIn\fR is not \f(CW0\fR, the search is performed only inside the first \fIn\fR
+ characters of \fIs\fR.
+ .Ip "char *\fBstr_token\fR(char **\fIs\fR, const char *\fIdelim\fR, const char *\fIquote\fR, const char *\fIcomment\fR, int \fImode\fR);" 4
+ This function considers the string \fIs\fR to consist of a sequence of
+ zero or more text tokens separated by spans of one or more characters
+ from the separator string \fIdelim\fR. However, text between matched pairs
+ of quotemarks (characters in \fIquote\fR) is treated as plain text, never
+ as delimiter (separator) text. Each call of this function returns a
+ pointer to the first character of the first token of \fIs\fR. The token is
+ \f(CWNUL\fR\-terminated, i.e., the string \fIs\fR is processed in a destructive
+ way. If there are quotation marks or escape sequences, the input
+ string is rewritten with quoted sections and escape sequences properly
+ interpreted.
+ .Sp
+ This function keeps track of its parsing position in the string between
+ separate calls by simply adjusting the caller's \fIs\fR pointer, so that
+ subsequent calls with the same pointer variable \fIs\fR will start
+ processing from the position immediately after the last returned token.
+ In this way subsequent calls will work through the string \fIs\fR until no
+ tokens remain. When no token remains in \fIs\fR, \f(CWNULL\fR is returned. The
+ string of token separators (\fIdelim\fR) and the string of quote characters
+ (\fIquote\fR) may be changed from call to call.
+ .Sp
+ If a character in the string \fIs\fR is not quoted or escaped, and is in the
+ \fIcomment\fR set, then it is overwritten with a \f(CWNUL\fR character and the rest of
+ the string is ignored. The characters to be used as quote characters are
+ specified in the \fIquote\fR set, and must be used in balanced pairs. If there
+ is more than one flavor of quote character, one kind of quote character may be
+ used to quote another kind. If an unbalanced quote is found, the function
+ silently acts as if one had been placed at the end of the input string. The
+ \fIdelim\fR and \fIquote\fR strings must be disjoint, i.e., they have to share
+ no characters.
+ .Sp
+ The \fImode\fR argument can be used to modify the processing of the string
+ (default for \fImode\fR is \f(CW0\fR): \f(CWSTR_STRIPQUOTES\fR forces \fIquote\fR
+ characters to be stripped from quoted tokens; \f(CWSTR_BACKSLASHESC\fR
+ enables the interpretation (and expansion) of backslash escape sequences
+ (`\fB\ex\fR') through \s-1ANSI\s0\-C rules; \f(CWSTR_SKIPDELIMS\fR forces that after the
+ terminating \f(CWNUL\fR is written and the token returned, further delimiters
+ are skipped (this allows one to make sure that the delimiters for
+ one word don't become part of the next word if one changes delimiters
+ between calls); and \f(CWSTR_TRIGRAPHS\fR enables the recognition and
+ expansion of \s-1ANSI\s0 C Trigraph sequences (as a side effect this enables
+ \f(CWSTR_BACKSLASHESC\fR, too).
+ .Ip "int \fBstr_parse\fR(const char *\fIs\fR, const char *\fIpop\fR, ...);" 4
+ This parses the string \fIs\fR according to the parsing operation specified
+ by \fIpop\fR. If the parsing operation succeeds, \f(CWTRUE\fR is returned. Else
+ \f(CWFALSE\fR is returned.
+ .Sp
+ The \fIpop\fR string usually has one of the following two syntax variants:
+ `\fBm\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIflags\fR*\*(R' (for matching operations)
+ and `\fBs\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIsubst\fR \fIdelim\fR \fIflags\fR*\*(R' (for
+ substitution operations). For more details about the syntax variants
+ and semantics of the \fIpop\fR argument see section \fB\s-1GORY\s0 \s-1DETAILS\s0, Parsing
+ Specification\fR below. The syntax of the \fIregex\fR part in \fIpop\fR is
+ mostly equivalent to Perl 5's regular expression syntax. For the
+ complete and gory details see \fIperlre\fR\|(1). You can find a brief summary
+ under section \fB\s-1GORY\s0 \s-1DETAILS\s0, Perl Regular Expressions\fR below.
+ .Ip "int \fBstr_format\fR(char *\fIs\fR, str_size_t \fIn\fR, const char *\fIfmt\fR, ...);" 4
+ This formats a new string according to \fIfmt\fR and optionally following
+ arguments and writes it into the string \fIs\fR, but never more than \fIn\fR
+ characters at all. It returns the number of written characters. If \fIs\fR is
+ \f(CWNULL\fR it just calculates the number of characters which would be written.
+ .Sp
+ The function generates the output string under the control of the \fIfmt\fR
+ format string that specifies how subsequent arguments (or arguments accessed
+ via the variable-length argument facilities of \fIstdarg\fR\|(3)) are converted for
+ output.
+ .Sp
+ The format string \fIfmt\fR is composed of zero or more directives:
+ ordinary characters (not \fB%\fR), which are copied unchanged to the output
+ stream; and conversion specifications, each of which results in fetching
+ zero or more subsequent arguments. Each conversion specification is
+ introduced by the character \fB%\fR. The arguments must correspond properly
+ (after type promotion) with the conversion specifier. Which conversion
+ specifications are supported are described in detail under \fB\s-1GORY\s0
+ \s-1DETAILS\s0, Format Specification\fR below.
+ .Ip "unsigned long \fBstr_hash\fR(const char *\fIs\fR, str_size_t \fIn\fR, int \fImode\fR);" 4
+ This function calculates a hash value of string \fIs\fR (or of its first \fIn\fR
+ characters if \fIn\fR is not equal to \f(CW0\fR). The following hashing functions
+ are supported and can be selected with \fImode\fR: \s-1STR_HASH_DJBX33\s0 (Daniel
+ J. Bernstein, Times 33 Hash with Addition), \s-1STR_HASH_BJDDJ\s0 (Bob
+ Jenkins, Dr. Dobbs Journal), and \s-1STR_HASH_MACRC32\s0 (Mark Adler, Cyclic
+ Redundancy Check with 32-Bit). This function is intended for fast use
+ in hashing algorithms and \fInot\fR for use as cryptographically strong
+ message digests.
+ .Ip "int \fBstr_base64\fR(char *\fIs\fR, str_size_t \fIn\fR, unsigned char *\fIucp\fR, str_size_t \fIucn\fR, int \fImode\fR);" 4
+ This function Base64 encodes \fIucn\fR bytes starting at \fIucp\fR and writes
+ the resulting string into \fIs\fR (but never more than \fIn\fR characters are
+ written). The \fImode\fR for this operation has to be \f(CWSTR_BASE64_ENCODE\fR.
+ Additionally one can \s-1OR\s0 the value \f(CWSTR_BASE64_STRICT\fR to enable strict
+ encoding where after every 72nd output character a newline character is
+ inserted. The function returns the number of output characters written.
+ If \fIs\fR is \f(CWNULL\fR the function just calculates the number of required
+ output characters.
+ .Sp
+ Alternatively, if \fImode\fR is \f(CWSTR_BASE64_DECODE\fR the string \fIs\fR (or
+ the first \fIn\fR characters only if \fIn\fR is not \f(CW0\fR) is decoded and the
+ output bytes written at \fIucp\fR. Again, if \fIucp\fR is \f(CWNULL\fR only the
+ number of required output bytes is calculated.
+ .SH "GORY DETAILS"
+ In this part of the documentation more complex topics are documented in
+ detail.
+ .Sh "Perl Regular Expressions"
+ The regular expressions used in \fBStr\fR are more or less Perl compatible
+ (they are provided by a stripped down and built-in version of the
+ \fI\s-1PCRE\s0\fR library). So the syntax description in \fIperlre\fR\|(1) applies
+ and does not have to be repeated here again. For a deeper understanding
+ and details you should have a look at the book `\fIMastering Regular
+ Expressions\fR\*(R' (see also the \fIperlbook\fR\|(1) manpage) by \fIJeffrey Friedl\fR.
+ For convenience reasons we give you only a brief summary of Perl
+ compatible regular expressions:
+ .PP
+ The following metacharacters have their standard \fIegrep\fR\|(1) meanings:
+ .PP
+ .Vb 7
+ \& \e Quote the next metacharacter
+ \& ^ Match the beginning of the line
+ \& . Match any character (except newline)
+ \& $ Match the end of the line (or before newline at the end)
+ \& | Alternation
+ \& () Grouping
+ \& [] Character class
+ .Ve
+ The following standard quantifiers are recognized:
+ .PP
+ .Vb 12
+ \& * Match 0 or more times (greedy)
+ \& *? Match 0 or more times (non greedy)
+ \& + Match 1 or more times (greedy)
+ \& +? Match 1 or more times (non greedy)
+ \& ? Match 1 or 0 times (greedy)
+ \& ?? Match 1 or 0 times (non greedy)
+ \& {n} Match exactly n times (greedy)
+ \& {n}? Match exactly n times (non greedy)
+ \& {n,} Match at least n times (greedy)
+ \& {n,}? Match at least n times (non greedy)
+ \& {n,m} Match at least n but not more than m times (greedy)
+ \& {n,m}? Match at least n but not more than m times (non greedy)
+ .Ve
+ The following backslash sequences are recognized:
+ .PP
+ .Vb 15
+ \& \et Tab (HT, TAB)
+ \& \en Newline (LF, NL)
+ \& \er Return (CR)
+ \& \ef Form feed (FF)
+ \& \ea Alarm (bell) (BEL)
+ \& \ee Escape (think troff) (ESC)
+ \& \e033 Octal char
+ \& \ex1B Hex char
+ \& \ec[ Control char
+ \& \el Lowercase next char
+ \& \eu Uppercase next char
+ \& \eL Lowercase till \eE
+ \& \eU Uppercase till \eE
+ \& \eE End case modification
+ \& \eQ Quote (disable) pattern metacharacters till \eE
+ .Ve
+ The following non zero-width assertions are recognized:
+ .PP
+ .Vb 6
+ \& \ew Match a "word" character (alphanumeric plus "_")
+ \& \eW Match a non-word character
+ \& \es Match a whitespace character
+ \& \eS Match a non-whitespace character
+ \& \ed Match a digit character
+ \& \eD Match a non-digit character
+ .Ve
+ The following zero-width assertions are recognized:
+ .PP
+ .Vb 6
+ \& \eb Match a word boundary
+ \& \eB Match a non-(word boundary)
+ \& \eA Match only at beginning of string
+ \& \eZ Match only at end of string, or before newline at the end
+ \& \ez Match only at end of string
+ \& \eG Match only where previous m//g left off (works only with /g)
+ .Ve
+ The following regular expression extensions are recognized:
+ .PP
+ .Vb 11
+ \& (?#text) An embedded comment
+ \& (?:pattern) This is for clustering, not capturing (simple)
+ \& (?imsx-imsx:pattern) This is for clustering, not capturing (full)
+ \& (?=pattern) A zero-width positive lookahead assertion
+ \& (?!pattern) A zero-width negative lookahead assertion
+ \& (?<=pattern) A zero-width positive lookbehind assertion
+ \& (?<!pattern) A zero-width negative lookbehind assertion
+ \& (?>pattern) An "independent" subexpression
+ \& (?(cond)yes-re) Conditional expression (simple)
+ \& (?(cond)yes-re|no-re) Conditional expression (full)
+ \& (?imsx-imsx) One or more embedded pattern-match modifiers
+ .Ve
+ .Sh "Parsing Specification"
+ The \fBstr_parse\fR(const char *\fIs\fR, const char *\fIpop\fR, ...) function
+ is a very flexible but complex one. The argument \fIs\fR is the string on
+ which the parsing operation specified by argument \fIpop\fR is applied.
+ The parsing semantics are highly influenced by Perl's `\fB=~\fR\*(R' matching
+ operator, because one of the main goals of \fIstr_parse\fR\|(3) is to allow one
+ to rewrite typical Perl matching constructs into C.
+ .PP
+ Now to the gory details. In general, the \fIpop\fR argument of \fIstr_parse\fR\|(3)
+ has one of the following two syntax variants:
+ .Ip "\fBMatching:\fR `\fBm\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIflags\fR*': " 4
+ This matches \fIs\fR against the Perl-style regular expression \fIregex\fR
+ under the control of zero or more \fIflags\fR which control the parsing
+ semantics. The stripped down \fIpop\fR syntax `\fIregex\fR\*(R' is equivalent to
+ `\fBm/\fR\fIregex\fR\fB/\fR\*(R'.
+ .Sp
+ For each grouping pair of parentheses in \fIregex\fR, the text in \fIs\fR
+ which was grouped by the parentheses is extracted into new strings.
+ These per default are allocated as separate strings and returned to the
+ caller through following `\fBchar **\fR\*(R' arguments. The caller is required
+ to \fIfree\fR\|(3) them later.
+ .Ip "\fBSubstitution:\fR `\fBs\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIsubst\fR \fIdelim\fR \fIflags\fR*': " 4
+ This matches \fIs\fR against the Perl-style regular expression \fIregex\fR
+ under the control of zero or more \fIflags\fR which control the parsing
+ semantics. As a result of the operation, a new string is formed which
+ consists of \fIs\fR but with the part which matched \fIregex\fR replaced by
+ \fIsubst\fR. The result string is returned to the caller through a `\fBchar
+ **\fR\*(R' argument. The caller is required to \fIfree\fR\|(3) this later.
+ .Sp
+ For each grouping pair of parentheses in \fIregex\fR, the text in \fIs\fR
+ which was grouped by the parentheses is extracted into new strings
+ and can be referenced for expansion via `\fB$n\fR\*(R' (n=1,..) in \fIsubst\fR.
+ Additionally any \fIstr_format\fR\|(3) style `\fB%\fR\*(R' constructs in \fIsubst\fR are
+ expanded through additional caller supplied arguments.
+ .PP
+ The following \fIflags\fR are supported:
+ .Ip "\fBb\fR" 4
+ If the \fIbundle\fR flag `\fBb\fR\*(R' is specified, the extracted strings are
+ bundled together into a single chunk of memory and its address is
+ returned to the caller with an additional `\fBchar **\fR\*(R' argument which has
+ to precede the regular string arguments. The caller then has to \fIfree\fR\|(3)
+ only this chunk of memory in order to free all extracted strings at
+ once.
+ .Ip "\fBi\fR" 4
+ If the case-\fIinsensitive\fR flag `\fBi\fR\*(R' is specified, \fIregex\fR
+ is matched in a case-insensitive way.
+ .Ip "\fBo\fR" 4
+ If the \fIonce\fR flag `\fBo\fR\*(R' is specified, this indicates to the \fBStr\fR
+ library that the whole \fIpop\fR string is constant and that its internal
+ pre-processing (it is compiled into a deterministic finite automaton
+ (\s-1DFA\s0) internally) has to be done only once (the \fBStr\fR library then
+ caches the \s-1DFA\s0 which corresponds to the \fIpop\fR argument).
+ .Ip "\fBx\fR" 4
+ If the \fIextended\fR flag `\fBx\fR\*(R' is specified, the \fIregex\fR's legibility
+ is extended by permitting embedded whitespace and comments to allow one
+ to write down complex regular expressions more clearly and even in a
+ documented way.
+ .Ip "\fBm\fR" 4
+ If the \fImultiple\fR lines flag `\fBm\fR\*(R' is specified, the string \fIs\fR is
+ treated as multiple lines. That is, this changes the regular expression
+ meta characters `\fB^\fR\*(R' and `\fB$\fR\*(R' from matching at only the very start
+ or end of the string \fIs\fR to the start or end of any line anywhere
+ within the string \fIs\fR.
+ .Ip "\fBs\fR" 4
+ If the \fIsingle\fR line flag `\fBs\fR\*(R' is specified, the string \fIs\fR is
+ treated as single line. That is, this changes the regular expression
+ meta character `\fB.\fR\*(R' to match any character whatsoever, even a newline,
+ which it normally would not match.
+ .SH "CONVERSION SPECIFICATION"
+ In the format string of \fIstr_format\fR\|(3) each conversion specification is
+ introduced by the character \fB%\fR. After the \fB%\fR, the following appear
+ in sequence:
+ .Ip "o" 4
+ An optional field, consisting of a decimal digit string followed by a \fB$\fR,
+ specifying the next argument to access. If this field is not provided, the
+ argument following the last argument accessed will be used. Arguments are
+ numbered starting at \fB1\fR. If unaccessed arguments in the format string are
+ interspersed with ones that are accessed the results will be indeterminate.
+ .Ip "o" 4
+ Zero or more of the following flags:
+ .Sp
+ A \fB#\fR character specifying that the value should be converted to an
+ ``alternate form'\*(R'. For \fBc\fR, \fBd\fR, \fBi\fR, \fBn\fR, \fBp\fR, \fBs\fR, and \fBu\fR,
+ conversions, this option has no effect. For \fBo\fR conversions, the precision
+ of the number is increased to force the first character of the output string
+ to a zero (except if a zero value is printed with an explicit precision of
+ zero). For \fBx\fR and \fBX\fR conversions, a non-zero result has the string \fB0x\fR
+ (or \fB0X\fR for \fBX\fR conversions) prepended to it. For \fBe\fR, \fBE\fR, \fBf\fR, \fBg\fR,
+ and \fBG\fR, conversions, the result will always contain a decimal point, even if
+ no digits follow it (normally, a decimal point appears in the results of those
+ conversions only if a digit follows). For \fBg\fR and \fBG\fR conversions, trailing
+ zeros are not removed from the result as they would otherwise be.
+ .Sp
+ A zero `\fB0\fR\*(R' character specifying zero padding. For all conversions except
+ \fBn\fR, the converted value is padded on the left with zeros rather than blanks.
+ If a precision is given with a numeric conversion (\fBd\fR, \fBi\fR, \fBo\fR, \fBu\fR,
+ \fBx\fR, and \fBX\fR), the `\fB0\fR\*(R' flag is ignored.
+ .Sp
+ A negative field width flag `\fB\-\fR\*(R' indicates the converted value is to be left
+ adjusted on the field boundary. Except for \fBn\fR conversions, the converted
+ value is padded on the right with blanks, rather than on the left with blanks
+ or zeros. A `\fB\-\fR\*(R' overrides a `\fB0\fR\*(R' if both are given.
+ .Sp
+ A space, specifying that a blank should be left before a positive number
+ produced by a signed conversion (\fBd\fR, \fBe\fR, \fBE\fR, \fBf\fR, \fBg\fR, \fBG\fR, or \fBi\fR).
+ .Sp
+ A `\fB+\fR\*(R' character specifying that a sign always be placed before a number
+ produced by a signed conversion. A `\fB+\fR\*(R' overrides a space if both are used.
+ .Ip "o" 4
+ An optional decimal digit string specifying a minimum field width.
+ If the converted value has fewer characters than the field width, it will
+ be padded with spaces on the left (or right, if the left-adjustment
+ flag has been given) to fill out
+ the field width.
+ .Ip "o" 4
+ An optional precision, in the form of a period `\fB.\fR\*(R' followed by an
+ optional digit string. If the digit string is omitted, the precision is
+ taken as zero. This gives the minimum number of digits to appear for
+ \fBd\fR, \fBi\fR, \fBo\fR, \fBu\fR, \fBx\fR, and \fBX\fR conversions, the number of digits
+ to appear after the decimal-point for \fBe\fR, \fBE\fR, and \fBf\fR conversions,
+ the maximum number of significant digits for \fBg\fR and \fBG\fR conversions,
+ or the maximum number of characters to be printed from a string for \fBs\fR
+ conversions.
+ .Ip "o" 4
+ The optional character \fBh\fR, specifying that a following \fBd\fR, \fBi\fR, \fBo\fR,
+ \fBu\fR, \fBx\fR, or \fBX\fR conversion corresponds to a `\f(CWshort int\fR\*(R' or `\f(CWunsigned
+ short int\fR\*(R' argument, or that a following \fBn\fR conversion corresponds to a
+ pointer to a `\f(CWshort int\fR\*(R' argument.
+ .Ip "o" 4
+ The optional character \fBl\fR (ell) specifying that a following \fBd\fR, \fBi\fR,
+ \fBo\fR, \fBu\fR, \fBx\fR, or \fBX\fR conversion corresponds to a `\f(CWlong int\fR\*(R'
+ or `\f(CWunsigned long int\fR\*(R' argument, or that a following \fBn\fR conversion
+ corresponds to a pointer to a `\f(CWlong int\fR\*(R' argument.
+ .Ip "o" 4
+ The optional character \fBq\fR, specifying that a following \fBd\fR, \fBi\fR, \fBo\fR,
+ \fBu\fR, \fBx\fR, or \fBX\fR conversion corresponds to a `\f(CWquad int\fR\*(R' or `\f(CWunsigned
+ quad int\fR\*(R' argument, or that a following \fBn\fR conversion corresponds to a
+ pointer to a `\f(CWquad int\fR\*(R' argument.
+ .Ip "o" 4
+ The character \fBL\fR specifying that a following \fBe\fR, \fBE\fR, \fBf\fR, \fBg\fR, or \fBG\fR
+ conversion corresponds to a `\f(CWlong double\fR\*(R' argument.
+ .Ip "o" 4
+ A character that specifies the type of conversion to be applied.
+ .PP
+ A field width or precision, or both, may be indicated by an asterisk `\fB*\fR\*(R' or
+ an asterisk followed by one or more decimal digits and a `\fB$\fR\*(R' instead of a
+ digit string. In this case, an `\f(CWint\fR\*(R' argument supplies the field width or
+ precision. A negative field width is treated as a left adjustment flag
+ followed by a positive field width; a negative precision is treated as though
+ it were missing. If a single format directive mixes positional (`\fBnn$\fR') and
+ non-positional arguments, the results are undefined.
+ .PP
+ The conversion specifiers and their meanings are:
+ .Ip "\fBdiouxX\fR" 4
+ The `\f(CWint\fR\*(R' (or appropriate variant) argument is converted to signed decimal
+ (\fBd\fR and \fBi\fR), unsigned octal (\fBo\fR), unsigned decimal (\fBu\fR), or unsigned
+ hexadecimal (\fBx\fR and \fBX\fR) notation. The letters \fBabcdef\fR are used for \fBx\fR
+ conversions; the letters \fB\s-1ABCDEF\s0\fR are used for \fBX\fR conversions. The
+ precision, if any, gives the minimum number of digits that must appear; if the
+ converted value requires fewer digits, it is padded on the left with zeros.
+ .Ip "\fB\s-1DOU\s0\fR" 4
+ The `\f(CWlong int\fR\*(R' argument is converted to signed decimal, unsigned octal, or
+ unsigned decimal, as if the format had been \fBld\fR, \fBlo\fR, or \fBlu\fR
+ respectively. These conversion characters are deprecated, and will eventually
+ disappear.
+ .Ip "\fBeE\fR" 4
+ The `\f(CWdouble\fR\*(R' argument is rounded and converted in the style
+ `[\-]d.ddd\fBe\fR+\-dd\*(R' where there is one digit before the decimal-point character
+ and the number of digits after it is equal to the precision; if the precision
+ is missing, it is taken as 6; if the precision is zero, no decimal-point
+ character appears. An \fBE\fR conversion uses the letter \fBE\fR (rather than \fBe\fR)
+ to introduce the exponent. The exponent always contains at least two digits;
+ if the value is zero, the exponent is 00.
+ .Ip "\fBf\fR" 4
+ The `\f(CWdouble\fR\*(R' argument is rounded and converted to decimal notation in the
+ style `[\-]ddd.ddd\*(R' where the number of digits after the decimal-point
+ character is equal to the precision specification. If the precision is
+ missing, it is taken as 6; if the precision is explicitly zero, no
+ decimal-point character appears. If a decimal point appears, at least one
+ digit appears before it.
+ .Ip "\fBg\fR" 4
+ The `\f(CWdouble\fR\*(R' argument is converted in style \fBf\fR or \fBe\fR (or \fBE\fR for \fBG\fR
+ conversions). The precision specifies the number of significant digits. If
+ the precision is missing, 6 digits are given; if the precision is zero, it is
+ treated as 1. Style \fBe\fR is used if the exponent from its conversion is less
+ than \-4 or greater than or equal to the precision. Trailing zeros are removed
+ from the fractional part of the result; a decimal point appears only if it is
+ followed by at least one digit.
+ .Ip "\fBc\fR" 4
+ The `\f(CWint\fR\*(R' argument is converted to an `\f(CWunsigned char\fR\*(R', and the resulting
+ character is written.
+ .Ip "\fBs\fR" 4
+ The `\f(CWchar *\fR\*(R' argument is expected to be a pointer to an array of character
+ type (pointer to a string). Characters from the array are written up to (but
+ not including) a terminating \f(CWNUL\fR character; if a precision is specified, no
+ more than the number specified are written. If a precision is given, no null
+ character need be present; if the precision is not specified, or is greater
+ than the size of the array, the array must contain a terminating \f(CWNUL\fR
+ character.
+ .Ip "\fBp\fR" 4
+ The `\f(CWvoid *\fR\*(R' pointer argument is printed in hexadecimal (as if by `\fB%#x\fR\*(R'
+ or `\f(CW%#lx\fR\*(R').
+ .Ip "\fBn\fR" 4
+ The number of characters written so far is stored into the integer indicated
+ by the `\f(CWint *\fR\*(R' (or variant) pointer argument. No argument is converted.
+ .Ip "\fB%\fR" 4
+ A `\fB%\fR\*(R' is written. No argument is converted. The complete conversion
+ specification is `\fB%%\fR\*(R'.
+ .PP
+ In no case does a non-existent or small field width cause truncation of a
+ field; if the result of a conversion is wider than the field width, the field
+ is expanded to contain the conversion result.
+ .SH "EXAMPLES"
+ In the following a few snippets of selected use cases of \fBStr\fR are
+ presented:
+ .Ip "\fBSplice a String into Another\fR" 4
+ .Sp
+ .Vb 5
+ \& char *v1 = "foo bar quux";
+ \& char *v2 = "baz";
+ \& str_splice(v1, 3, 5, v2, 0);
+ \& /* now we have v1 = "foobazquux" */
+ \& ....
+ .Ve
+ .Ip "\fBTokenize a String\fR" 4
+ .Sp
+ .Vb 10
+ \& char *var = " foo \et \e" bar 'baz'\e" q'uu'x #comment";
+ \& char *tok, *p;
+ \& p = var;
+ \& while ((tok = str_token(&p, " \et", "\e"'", "#", 0)) != NULL) {
+ \& /* here we enter three times:
+ \& 1. tok = "foo"
+ \& 2. tok = " bar 'baz'"
+ \& 3. tok = "quux" */
+ \& ...
+ \& }
+ .Ve
+ .Ip "\fBMatch a String\fR" 4
+ .Sp
+ .Vb 5
+ \& char *var = "foo:bar";
+ \& if (str_parse(var, "m/^.+?:.+$/")) {
+ \& /* var matched */
+ \& ...
+ \& }
+ .Ve
+ .Ip "\fBMatch a String and Go Ahead with Details\fR" 4
+ .Sp
+ .Vb 10
+ \& char *var = "foo:bar";
+ \& char *cp, *v1, *v2;
+ \& if (str_parse(var, "m/^(.+?):(.+)$/b", &cp, &v1, &v2)) {
+ \& ...
+ \& /* now we have:
+ \& cp = "foo\e0bar\e0" and v1 and v2 pointing
+ \& into it, i.e., v1 = "foo", v2 = "bar" */
+ \& ...
+ \& free(cp);
+ \& }
+ .Ve
+ .Ip "\fBSubstitute Text in a String\fR" 4
+ .Sp
+ .Vb 8
+ \& char *var = "foo:bar";
+ \& char *subst = "quux";
+ \& char *new;
+ \& str_parse(var, "s/^(.+?):(.+)$/$1-%s-$2/", &new, subst);
+ \& ...
+ \& /* now we have: var = "foo:bar", new = "foo-quux-bar" */
+ \& ...
+ \& free(new);
+ .Ve
+ .Ip "\fBFormat a String\fR" 4
+ .Sp
+ .Vb 6
+ \& char *v0 = "abc..."; /* length not guessable */
+ \& char *v1 = "foo";
+ \& void *v2 = 0xDEAD;
+ \& int v3 = 42;
+ \& char *cp;
+ \& int n;
+ .Ve
+ .Vb 6
+ \& n = str_format(NULL, 0, "%s|%5s-%x-%04d", v0, v1, v2, v3);
+ \& cp = malloc(n);
+ \& str_format(cp, n, "%s|%5s-%x-%04d", v0, v1, v2, v3);
+ \& /* now we have cp = "abc...| foo-DEAD-0042" */
+ \& ...
+ \& free(cp);
+ .Ve
+ .SH "SEE ALSO"
+ \fIstring\fR\|(3), \fIprintf\fR\|(3), \fIperlre\fR\|(1).
+ .SH "HISTORY"
+ The \fBStr\fR library was written in November and December 1999 by Ralf
+ S. Engelschall. As building blocks various existing code was used and
+ recycled: for the \fIstr_token\fR\|(3) implementation an ancient \fIstrtok\fR\|(3)
+ flavor from William Deich 1991 was cleaned up and adjusted. As the
+ background parsing engine for \fIstr_parse\fR\|(3) a heavily stripped down
+ version of Philip Hazel's PCRE 2.08 library was used. The \fIstr_format\fR\|(3)
+ implementation was based on Panos Tsirigotis\*(R' \fIsprintf\fR\|(3) code as
+ adjusted by the Apache Software Foundation 1998. The formatting engine
+ was stripped down and enhanced to support internal extensions which were
+ required by \fIstr_format\fR\|(3) and \fIstr_parse\fR\|(3).
+ .SH "AUTHOR"
+ .PP
+ .Vb 3
+ \& Ralf S. Engelschall
+ \& rse@engelschall.com
+ \& www.engelschall.com
+ .Ve
+
+ .rn }` ''
+ .IX Title "str 3"
+ .IX Name "B<Str> - String Library"
+
+ .IX Header "NAME"
+
+ .IX Header "VERSION"
+
+ .IX Header "SYNOPSIS"
+
+ .IX Header "DESCRIPTION"
+
+ .IX Header "FUNCTIONS"
+
+ .IX Item "str_size_t \fBstr_len\fR(const char *\fIs\fR);"
+
+ .IX Item "char *\fBstr_copy\fR(char *\fIs\fR, const char *\fIt\fR, size_t \fIn\fR);"
+
+ .IX Item "char *\fBstr_dup\fR(const char *\fIs\fR, str_size_t \fIn\fR);"
+
+ .IX Item "char *\fBstr_concat\fR(char *\fIs\fR, ...);"
+
+ .IX Item "char *\fBstr_splice\fR(char *\fIs\fR, str_size_t \fIoff\fR, str_size_t \fIn\fR, char *\fIt\fR, str_size_t \fIm\fR);"
+
+ .IX Item "int \fBstr_compare\fR(const char *\fIs\fR, const char *\fIt\fR, str_size_t \fIn\fR, int \fImode\fR);"
+
+ .IX Item "char *\fBstr_span\fR(const char *\fIs\fR, size_t \fIn\fR, const char *\fIcharset\fR, int \fImode\fR);"
+
+ .IX Item "char *\fBstr_locate\fR(const char *\fIs\fR, str_size_t \fIn\fR, const char *\fIt\fR);"
+
+ .IX Item "char *\fBstr_token\fR(char **\fIs\fR, const char *\fIdelim\fR, const char *\fIquote\fR, const char *\fIcomment\fR, int \fImode\fR);"
+
+ .IX Item "int \fBstr_parse\fR(const char *\fIs\fR, const char *\fIpop\fR, ...);"
+
+ .IX Item "int \fBstr_format\fR(char *\fIs\fR, str_size_t \fIn\fR, const char *\fIfmt\fR, ...);"
+
+ .IX Item "unsigned long \fBstr_hash\fR(const char *\fIs\fR, str_size_t \fIn\fR, int \fImode\fR);"
+
+ .IX Item "int \fBstr_base64\fR(char *\fIs\fR, str_size_t \fIn\fR, unsigned char *\fIucp\fR, str_size_t \fIucn\fR, int \fImode\fR);"
+
+ .IX Header "GORY DETAILS"
+
+ .IX Subsection "Perl Regular Expressions"
+
+ .IX Subsection "Parsing Specification"
+
+ .IX Item "\fBMatching:\fR `\fBm\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIflags\fR*': "
+
+ .IX Item "\fBSubstitution:\fR `\fBs\fR \fIdelim\fR \fIregex\fR \fIdelim\fR \fIsubst\fR \fIdelim\fR \fIflags\fR*': "
+
+ .IX Item "\fBb\fR"
+
+ .IX Item "\fBi\fR"
+
+ .IX Item "\fBo\fR"
+
+ .IX Item "\fBx\fR"
+
+ .IX Item "\fBm\fR"
+
+ .IX Item "\fBs\fR"
+
+ .IX Header "CONVERSION SPECIFICATION"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "o"
+
+ .IX Item "\fBdiouxX\fR"
+
+ .IX Item "\fB\s-1DOU\s0\fR"
+
+ .IX Item "\fBeE\fR"
+
+ .IX Item "\fBf\fR"
+
+ .IX Item "\fBg\fR"
+
+ .IX Item "\fBc\fR"
+
+ .IX Item "\fBs\fR"
+
+ .IX Item "\fBp\fR"
+
+ .IX Item "\fBn\fR"
+
+ .IX Item "\fB%\fR"
+
+ .IX Header "EXAMPLES"
+
+ .IX Item "\fBSplice a String into Another\fR"
+
+ .IX Item "\fBTokenize a String\fR"
+
+ .IX Item "\fBMatch a String\fR"
+
+ .IX Item "\fBMatch a String and Go Ahead with Details\fR"
+
+ .IX Item "\fBSubstitute Text in a String\fR"
+
+ .IX Item "\fBFormat a String\fR"
+
+ .IX Header "SEE ALSO"
+
+ .IX Header "HISTORY"
+
+ .IX Header "AUTHOR"
+
|