12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325 |
- \input texinfo @c -*-texinfo-*-
- @c vim: filetype=texinfo tabstop=2 expandtab
- @c $Id$
- @c %**start of header (This is for running Texinfo on a region.)
- @setfilename texindex.info
- @settitle Texindex @VERSION@: A program for sorting indices
- @c %**end of header (This is for running Texinfo on a region.)
- @c Better brace handling, this texindex is needed to process!
- @allowindexbraces
- @c Merge the function and variable indexes into the concept index,
- @c but without the code font; in the index entries we'll do the
- @c font management ourselves. Also merge in the chunk definition
- @c and reference entries, which jrweave creates for us.
- @c (Ordinarily this would be in the header, but jrweave puts the
- @c defindexes later.)
- @synindex fn cp
- @synindex vr cp
- @synindex cd cp
- @synindex cr cp
- @copying
- This @command{texindex} program (version @VERSION@, @UPDATED@) sorts the
- raw index files created by @file{texinfo.tex}. (This Texinfo source is
- a literate program written using TexiWeb@tie{}Jr., not a user manual.)
- Copyright @copyright{} 2014, 2015, 2016, 2017 Free Software Foundation,
- Inc.
- @quotation
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see @url{http://www.gnu.org/licenses/}.
- @end quotation
- @end copying
- @titlepage
- @title Texindex
- @subtitle version @VERSION@, @UPDATED@
- @author Arnold D. Robbins
- @author and Texinfo maintainers
- @page
- @vskip 0pt plus 1filll
- @insertcopying
- @end titlepage
- @contents
- @ifnottex
- @node Top
- @top Texindex
- This file defines @command{texindex} (version @VERSION@,
- @UPDATED@), an @code{awk} program that processes the raw index
- files produced by the @file{texinfo.tex} file.
- @end ifnottex
- @menu
- * Preface:: Introductory remarks.
- * Requirements:: How the program needs to work.
- * High-level organization:: The overall outline.
- * Processing records:: Processing each record.
- * Necessary stuff:: Copyright, helper functions, i18n.
- * Index:: Combined index.
- @end menu
- @node Preface
- @unnumbered Preface
- This file defines @file{texindex.awk}, a reimplementation of the C
- program @file{texindex.c}. The purpose is to make the program more
- maintainable. As a practical benefit, it also supports correct sorting
- and initials for the @samp{@{} and @samp{@}} characters in an index.
- @cindex @{ (left brace), example index entry for
- @cindex @} (right brace), example index entry for
- @cindex TexiWeb Jr.@: literate programming system
- @cindex Texinfo document formatting language
- This is a @dfn{literate program}, written using the
- @uref{https://github.com/arnoldrobbins/texiwebjr, @sc{TexiWeb Jr.@:}
- literate programming system}. The underlying documentation system is
- @uref{http://www.gnu.org/software/texinfo, Texinfo} the GNU
- documentation formatting language. A single source file produces the
- runnable program, a printable document, and an online document.
- @menu
- * Intended audience::
- @end menu
- @node Intended audience
- @section Intended audience
- You should read this if you want to understand how @file{texindex.awk}
- works. You should be familiar with the @command{awk} programming
- language.
- If you are interested in array indexing, you've come to
- the wrong place. @xref{knuth}.
- @c Scale figure to 4.5 inches which is good for both smallbook
- @c and regular. TeX will scale height also automatically.
- @float Figure,knuth
- @caption{Indexing (@url{http://xkcd.com/163/})}
- @center @image{dek_idx, 5in, , Indexing}
- @end float
- @node Requirements
- @chapter Requirements
- The input to this program is the list of unsorted index entries produced
- by @file{texinfo.tex} when a Texinfo document is processed. For
- example, two lines resulting from the @command{gawk} manual might look
- like this:
- @example
- \entry@{POSIX awk@}@{5@}@{POSIX \command @{awk@}@}
- @dots{}
- \entry@{POSIX awk@}@{106@}@{POSIX \command @{awk@}@}
- @end example
- As shown, there are three ``fields'' enclosed in braces:
- @itemize @bullet
- @item
- The sort key. This is the text of the entry with all markup removed.
- It should contain only ASCII characters.
- @item
- The page number for this entry.
- @item
- The display text to be shown in the printed index, which can include markup.
- @end itemize
- The braces are balanced in all cases, although for use by this program,
- literal braces (not necessarily balanced) can be included in the sort
- key by escaping them with the @dfn{command character}. This is the
- first character on the line. It is either the backslash used by @TeX{}
- (@samp{\}) or the at sign used by Texinfo (@samp{@@}).
- @cindex backslash vs.@: at
- @cindex command character, @samp{\} vs.@: @samp{@@}
- Historically, @file{texinfo.tex} has used @samp{\} as the command
- character in index files. This causes complications with index entries
- containing backslashes and braces; C @command{texindex} has never output
- the correct initial (a left brace) for an index entry like
- @samp{@@cindex @@@{@}}, or done technically-correct sorting with such
- entries. @xref{Details of @t{texindex},,, texinfo, GNU Texinfo}.
- The present @code{awk} implementation handles these cases better. It
- also supports @samp{@@} as the command character, which allows
- @file{texinfo.tex} to output cleaner raw index files. (For
- compatibility, for now this is only done if a ``secret'' @TeX{} variable
- is set: @code{\global\usebracesinindexestrue}.) The first character of
- the raw input file is taken as the command character.
- @subheading Processing
- The job is to sort the entries, and merge those which are identical
- except for the page numbers. Thus, for the above two entries, the
- output should be:
- @example
- \entry @{POSIX \command @{awk@}@}@{5, 106@}
- @end example
- The sorting should be in the order of: all symbols first, then all
- digits, then all letters, with uppercase letters following lowercase
- ones, so we will need some smarts.
- Input lines might be duplicated (same entry, same page, more than once),
- so we will have to deal with that.
- In addition, @command{texindex} must output special lines indicating the
- first character (the @dfn{initial}) of keys grouped together, but only
- if there is more than one initial used throughout the input file. This
- output looks like:
- @example
- \initial @{A@}
- @end example
- @subheading Assumptions
- In the rest of the program we make two fundamental assumptions:
- @enumerate 1
- @item
- If a given sort key has more than one display text, we only take the
- first (this matches the behavior of C @command{texindex}). Put another
- way, if the same sort key has two different display texts, it means that
- different markup was used, probably inadvertently, and we just take the
- first. As an example, consider these two Texinfo commands:
- @example
- @@cindex @@file@{field_split()@} function
- @dots{}
- @@cindex @@code@{field_split()@} function
- @end example
- @noindent
- They produce the following output via @file{texinfo.tex},
- which in turn is the input to @command{texindex}:
- @example
- \entry@{field_split() function@}@{2@}@{\file @{field_split()@} function@}
- \entry@{field_split() function@}@{7@}@{\code @{field_split()@} function@}
- @end example
- @noindent The result will be a single entry, using @code{\file},
- accumulating the page numbers.
- @example
- \entry @{\file @{field_split()@} function@}@{2, 7@}
- @end example
- @item
- @cindex roman numerals
- For the same sort key and text, page numbers will be monotonically
- increasing. This means we can just use a new page number when it comes
- in, and not have to sort entries based on both sort key and page number.
- (Which, in turn, means that we don't need to worry about page numbers
- that are roman numerals.)
- @end enumerate
- An additional requirement, for ease of deployment, is that the program
- be written in portable @command{awk}, and not use features found only in
- GNU @command{awk} (@command{gawk}). For our purposes, ``portable''
- means ``new'' @command{awk} as defined in the 1988 book by Aho,
- Weinberger and Kernighan. This gives us functions, multidimensional
- arrays and a number of other important features over the original
- @command{awk} shipped with V7 Unix.
- @node High-level organization
- @chapter High-level organization
- The general outline is as follows:
- @file_update_recipe
- @end file_update_recipe
- @file_update texindex.awk . ""
- @(texindex.awk@) =
- @<First line@>
- @<GPL v3 copyright statement@>
- @<Library functions@>
- @<Helper functions@>
- BEGIN {
- @<Initial setup@>
- @<Argument processing@>
- }
- @<@code{beginfile()} function@>
- @<@code{endfile()} function@>
- @<Process a record@>
- @<Work functions@>
- @
- @menu
- * First line::
- * Initial setup::
- * Argument processing::
- @end menu
- @node First line
- @section First line
- @cindex first line
- @cindex @code{#!} header
- @cindex header, shebang
- For the first line of the generated output, we hardwire our intended
- output file name and how it got made. We do not use a @samp{#!} header
- because, being a GNU program, we need to accept the @option{--help} and
- @option{--version} options. This cannot be done with a standalone
- @code{awk} script; we need a shell wrapper, and hence, the @code{awk}
- script itself need not be executable, and it's simpler not to worry
- about the location of the @code{awk} program.
- @<First line@> =
- # texindex.awk, generated by jrtangle from ti.twjr.
- @
- @node Initial setup
- @section Initial setup
- @cindex initial setup
- The initial setup sets up some constants, including the version of the
- program. In the program itself, we follow a convenient convention:
- global variable and array names start with a capital letter.
- @cindex @code{Invocation_name} variable
- Per GNU standards, we sometimes hardwire the string @samp{texindex} as
- the name of the program, and sometimes use the name by which the program
- was invoked. We'll call the latter @code{Invocation_name}; it's
- supposed to be passed in from the shell wrapper.
- @cindex @code{Can_split_null} variable
- The last line below sets up @code{Can_split_null}, which tells us if the
- built-in @code{split()} function will split apart a string into its
- individual characters or if we have to do it manually.
- @cindex @code{TRUE} constant
- @cindex @code{FALSE} constant
- @cindex @code{EXIT_SUCCESS} constant
- @cindex @code{EXIT_FAILURE} constant
- @cindex @code{Texindex_version} variable
- @cindex @code{check_split_null()} function
- @<Initial setup@>=
- TRUE = 1
- FALSE = 0
- EXIT_SUCCESS = 0
- EXIT_FAILURE = 1
- Texindex_version = "@VERSION@"
- if (! Invocation_name) {
- # provide fallback in case it's not passed in.
- Invocation_name = "texindex"
- }
- Can_split_null = check_split_null()
- @
- @node Argument processing
- @section Argument processing
- @cindex argument processing
- @cindex @code{usage()} function
- @cindex @code{version()} function
- Argument processing is straightforward, though manual. The important
- thing is to remove options and their arguments from @code{ARGV} so that
- they're not treated as filenames. The options that print version or
- help information automatically exit, so there's no need to mess with
- @code{ARGV} in those cases.
- @cindex @code{-o} (@code{--output}) option, unimplemented
- The C implementation of @command{texindex} provided an option @code{-o}
- (@code{--output}) @var{file} to write the output to @var{file}. We've
- chosen to omit it from this incarnation of the program unless and until
- we receive information that it was ever useful.
- @cindex @code{-h} (@code{--help}) option
- @cindex @code{-k} (@code{--keep}), no-op option
- @cindex @code{--} option
- @cindex @code{--version} option
- @<Argument processing@>=
- for (i = 1; i < ARGC; i++) {
- if (ARGV[i] == "-h" || ARGV[i] == "--help") {
- usage(EXIT_SUCCESS)
- } else if (ARGV[i] == "--version") {
- version()
- } else if (ARGV[i] == "-k" || ARGV[i] == "--keep") {
- # do nothing, backwards compatibility
- delete ARGV[i]
- } else if (ARGV[i] == "--") {
- delete ARGV[i]
- break
- } else if (ARGV[i] ~ /^--?.+/) {
- fatal(_"%s: unrecognized option `%s'\n" \
- "Try `%s --help' for more information.\n",
- Invocation_name, ARGV[i], Invocation_name)
- exit EXIT_FAILURE
- } else {
- break
- }
- }
- @
- @node Processing records
- @chapter Processing records
- Processing records includes setting things up for each input file,
- pulling apart each record, sorting the data at the end, and writing out
- the data properly.
- @menu
- * Setup for each input file:: What happens at the start of each file.
- * Processing each record::
- * End-of-file sorting and printing::
- @end menu
- @node Setup for each input file
- @section Setup for each input file
- At the beginning of each input file, the @code{beginfile()} function
- clears our variables from any previous processing (@code{Data},
- @code{Keys}, etc.)@: and sets up the output file name. We always append
- an @samp{s} to the name of the input file, which has become the standard
- convention. (As mentioned above, the @option{-o} option in the C
- implementation of @command{texindex} has been omitted here, until the
- need is seen.)
- When @code{beginfile()} is called, the first record has already been
- read, so it's possible to perform the checks for a Texinfo index file:
- The first character must be either @samp{\} or @samp{@@}
- (@pxref{Requirements}), and the next five characters must be the word
- @samp{entry}.
- @cindex @code{Special_chars} variable
- @code{Special_chars} are the two characters that must be preceded by
- the command character inside the first key.
- @cindex @code{beginfile()} function
- @cindex @code{Output_file} variable
- @cindex @code{Data} array
- @cindex @code{Keys} array
- @cindex @code{Seen} array
- @cindex @code{Entries} variable
- @cindex @code{Do_initials} variable
- @cindex @code{Prev_initial} variable
- @cindex @code{del_array()} function
- @cindex @code{Command_char} variable
- @cindex @code{Special_chars} variable
- @<@code{beginfile()} function@>=
- function beginfile(filename)
- {
- Output_file = filename "s"
- # Reinitialize these for each input file
- del_array(Data)
- del_array(Keys)
- del_array(Seen)
- Entries = 0
- Do_initials = FALSE
- Prev_initial = ""
- Command_char = substr($0, 1, 1)
- if ( (Command_char != "\\" && Command_char != "@") \
- || substr($0, 2, 5) != "entry")
- fatal(_"%s is not a Texinfo index file\n", filename)
- Special_chars = "{}"
- }
- @
- @node Processing each record
- @section Processing each record
- Record processing consists of building the data structures for use in
- sorting and printing once the whole file has been processed.
- @<Process a record@> =
- {
- @<Remove duplicates@>
- @<Remove leading @code{\entry}@>
- @<Get the initial@>
- @<Set up @code{fields} array with the data@>
- @<Name the fields@>
- @<Store the data for this line in the @code{Data} array@>
- @<Check for more than one initial@>
- }
- @
- @menu
- * Remove duplicates::
- * Remove leading @code{\entry}::
- * Get the initial::
- * Set up and name fields::
- * Store the data for this line::
- * Check for more than one initial::
- * Splitting the record::
- @end menu
- @node Remove duplicates
- @subsection Remove duplicates
- @cindex removing duplicates
- @cindex duplicates, removing
- @cindex @code{Seen} array
- Duplicates are going to be exact. Removing them is thus easy; store
- each incoming line as the index of an array named @code{Seen}. If a
- line is not there, it has not been seen. Otherwise it has, and we move
- on to the next record.
- @<Remove duplicates@>=
- # Remove duplicates, which can happen
- if ($0 in Seen)
- next
- Seen[$0] = TRUE
- @
- @node Remove leading @code{\entry}
- @subsection Remove leading @code{\entry}
- We use @code{substr()} here to avoid possible hassles with leading
- backslashes in @code{sub()}.
- @<Remove leading @code{\entry}@>=
- $0 = substr($0, 7) # remove leading \entry
- @
- @node Get the initial
- @subsection Get the initial
- @cindex @code{extract_initial()} function
- @<Get the initial@>=
- initial = extract_initial($0)
- @
- The key argument here is the rest of the line after @samp{\entry},
- starting with an open brace.
- The very first field character of the sort key can be an open brace.
- If so, we extract the component of the sort key surrounded by balanced
- braces. We don't account for \@{ or \@} inside this component, as
- @file{texinfo.tex} isn't expected to produce such output.
- An example can be seen in what older versions of @file{texinfo.tex}
- generated if you needed to index a real backslash, namely an input line
- something like the following:
- @example
- \entry@{@{\tt \indexbackslash @} (backslash)@}@{14@}@{\code @{@{\tt @dots{}
- @end example
- Earlier versions of @command{texindex} took the the first non-brace
- character as the initial, in this example @samp{\}, and output it as
- @samp{\\}; this was not, however, a control sequence recognized by the
- older versions of @file{texinfo.tex}.
- @cindex @code{extract_initial()} function
- @cindex @code{char_split()} function
- @<Work functions@>=
- function extract_initial(key, initial, nextchar, i, l, kchars)
- {
- l = char_split(key, kchars)
- if (l >= 3 && kchars[2] == "{") {
- bracecount = 1
- i = 3
- while (bracecount > 0 && i <= l) {
- if (kchars[i] == "{")
- bracecount++
- else if (kchars[i] == "}")
- bracecount--
- i++
- }
- if (i > l)
- fatal(_"%s:%d: Bad key %s in record\n", FILENAME, FNR, key)
- initial = substr(key, 2, i - 2)
- } else if (kchars[2] == Command_char) {
- nextchar = kchars[3]
- if (initial == Command_char && (nextchar == "{" || nextchar == "}"))
- initial = substr(key, 2, 3)
- else {
- initial = toupper(nextchar)
- }
- } else {
- initial = toupper(kchars[2])
- }
- return initial
- }
- @
- @node Set up and name fields
- @subsection Set up and name fields
- The next part is to pull out the data of interest from the three sets of
- braces. This is delegated to a function named @code{field_split()}.
- There must be exactly three fields.
- @cindex @code{field_split()} function
- @cindex @code{fields} array, setting up
- @<Set up @code{fields} array with the data@>=
- numfields = field_split($0, fields, "{", "}", Command_char)
- if (numfields != 3)
- fatal(_"%s:%d: Bad entry; expected three fields, not %d\n",
- FILENAME, FNR, numfields)
- @
- We give the fields names for later use.
- @<Name the fields@>=
- key = fields[1]
- pagenum = fields[2]
- text = fields[3]
- @
- @node Store the data for this line
- @subsection Store the data for this line
- @cindex storing data
- @cindex @code{Data} array
- We use a traditional @command{awk} multidimensional array, @code{Data}.
- The sort key from the input is invariant across entries, so we use that
- as the basis for the keys in the @code{Data} array.
- The @code{Data} values are the the output @code{"text"}, the
- @code{"pagenum"} list, and the @code{"initial"}.
- In the event that a particular key has more than one associated output
- text, we'll keep the first and ignore the remainder (this is the same
- behavior as the C implementation). @xref{Requirements}.
- For page numbers, we merely append the page number field from the input,
- preceded by a comma and space, unless that page number was already the
- last that's been stored. (We're assuming the page numbers don't jump
- around, which, in fact, they don't, so we don't need a more complex
- approach.)
- @cindex @code{Keys} array
- @cindex @code{Entries} variable
- In addition to all this updating of the @code{Data} array, the key is
- stored in the @code{Keys} array the first time it is seen; this array
- is sorted later on. For now, its indices are just incremented integers,
- stored in the global @code{Entries} variable.
- @<Store the data for this line in the @code{Data} array@>=
- if (! ((key, "text") in Data)) {
- # first time we've seen this full line
- Keys[++Entries] = key
- Data[key, "text"] = text
- Data[key, "pagenum"] = pagenum
- Data[key, "initial"] = initial
- } else
- # seen this key before; add the current pagenum
- # unless we've already seen that too.
- if ( Data[key, "pagenum"] != pagenum \
- && Data[key, "pagenum"] !~ (", " pagenum "$")) {
- Data[key, "pagenum"] = Data[key, "pagenum"] ", " pagenum
- }
- @
- @node Check for more than one initial
- @subsection Check for more than one initial
- @cindex initial, checking for more than one
- Finally, we need to determine if more than one initial occurs in the
- input. If so, we set @code{Do_initials} to true. As soon as it's true,
- we don't need to do further checking on subsequent lines.
- @cindex @code{Do_initials} variable
- @cindex @code{Prev_initial} variable
- @<Check for more than one initial@> =
- if (! Do_initials) {
- if (Prev_initial == "")
- Prev_initial = initial
- else if (initial != Prev_initial)
- Do_initials = TRUE
- }
- @
- @node Splitting the record
- @subsection Splitting the record: @code{field_split}
- Let's take a look at the function that breaks apart the record. Upon
- entry to the function, the value of @code{record} looks something like:
- @example
- @{POSIX awk@}@{5@}@{POSIX \command @{awk@}@}
- @end example
- The first field may have instances of @samp{@@@{} and/or @samp{@@@}} (or
- @samp{\@{} and/or @samp{\@}}), so the braces aren't necessarily exactly
- balanced.
- The @code{field_split()} function uses fairly straightforward ``count
- the delimiters'' code. The loop starts at 2, since we know the first
- character is an open brace. The main things to handle are the command
- character and the final closing brace. The third field is taken as a
- whole; this is described shortly.
- @cindex @code{field_split()} function
- @cindex @code{char_split()} function
- @<Work functions@>=
- function field_split( \
- record, fields, start, end, com_ch, # parameters
- chars, numchars, out, delim_count, i, j, k) # locals
- {
- del_array(fields)
- numchars = char_split(record, chars)
- j = 1 # index into fields
- k = 1 # index into out
- delim_count = 1
- for (i = 2; i <= numchars; i++) {
- if (chars[i] == com_ch) {
- @<Handle the character after the command character@>
- } else if (chars[i] == start) {
- delim_count++
- out[k++] = chars[i]
- } else if (chars[i] == end) {
- delim_count--
- if (delim_count == 0) {
- @<Finish off the field, set up for next field@>
- } else
- out[k++] = chars[i]
- } else
- out[k++] = chars[i]
- @<Take the third field as a whole@>
- }
- return j # num fields
- }
- @
- If the character following the command character is an open brace, close
- brace, or the command character itself, we pull it in. Otherwise, the
- command character is left alone as part of the field.
- @<Handle the character after the command character@>=
- if (index(Special_chars, chars[i+1]) != 0) {
- out[k++] = chars[i+1]
- i++
- } else
- out[k++] = chars[i]
- @
- Upon seeing the final closing brace, we put all the characters back
- together into a string using @code{join()}. We then reset the
- @code{out} array for the next time through. If the next character isn't
- an open brace, then the line is bad and we print a fatal error.
- Otherwise, we reset @code{delim_count} to one.
- @cindex @code{join()} function
- @<Finish off the field, set up for next field@>=
- fields[j++] = join(out, 1, k-1, SUBSEP)
- del_array(out) # reset for next time through
- k = 1
- i++
- if (i <= numchars && chars[i] != start)
- fatal(_"%s:%d: Bad entry; expected %s at column %d\n",
- FILENAME, FNR, start, i)
- delim_count = 1
- @
- The third field is the printable version, and may have unbalanced braces
- or other oddities. We just take the whole thing as is. This is done by
- stripping off the outermost braces, using @code{substr()}. We then
- break out of the loop, since we're done.
- @<Take the third field as a whole@>=
- if (j == 3) { # Per Karl, just grab the whole rest of the line
- # extract everything between the outer delimiters
- fields[3] = substr(record, i + 1, numchars - i - 1)
- break
- }
- @
- @node End-of-file sorting and printing
- @section End-of-file sorting and printing
- Upon end of input, the processing is straightforward: sort the entries
- and write them out. Additionally, if we are printing the initial,
- handle that. (That printing task is delegated to a small function.)
- @cindex @code{endfile()} function
- @cindex @code{quicksort()} function
- @cindex @code{Entries} variable
- @cindex @code{Keys} array
- @cindex @code{Data} array
- @cindex @code{print_initial()} function
- @cindex @code{Output_file} variable
- @<@code{endfile()} function@>=
- function endfile(filename, i, prev_initial, initial)
- {
- # sort the entries
- quicksort(Keys, 1, Entries)
- for (i = 1; i <= Entries; i++) {
- # deal with initial
- initial = Data[Keys[i], "initial"]
- if (initial != prev_initial) {
- prev_initial = initial
- print_initial(initial)
- }
- # write the actual line \entry {...}{...}
- printf("%centry {%s}{%s}\n",
- Command_char,
- Data[Keys[i], "text"],
- Data[Keys[i], "pagenum"]) > Output_file
- }
- close(Output_file)
- }
- @
- Printing the initial is not complicated. The main thing is to precede
- special characters with the command character.
- @cindex @code{Command_char} variable
- @cindex @code{Special_chars} variable
- @cindex @code{Do_initials} variable
- @cindex @code{print_initial()} function
- @<Work functions@>=
- function print_initial(initial)
- {
- if (Do_initials) {
- if (index(Special_chars, initial) != 0)
- initial = Command_char initial
- printf("%cinitial {%s}\n",
- Command_char, initial) > Output_file
- }
- }
- @
- @menu
- * Quicksort:: Sorting our input.
- * Comparing index entries:: The heart of the sorting algorithm.
- @end menu
- @node Quicksort
- @subsection Quicksort
- @cindex quicksort
- @cindex Hoare, C.A.R.
- Sorting uses a standard quicksort algorithm, with a @code{less_than()}
- function (defined in the next function) supplying the comparison.
- @cindex @code{less_than()} function
- @cindex @code{quicksort()} function
- @cindex @code{quicksort_swap()} function
- @<Helper functions@>=
- # quicksort --- C.A.R. Hoare's quick sort algorithm. See Wikipedia
- # or almost any algorithms or computer science text
- # Adapted from K&R-II, page 110
- #
- function quicksort(data, left, right, i, last)
- {
- if (left >= right) # do nothing if array contains fewer
- return # than two elements
- quicksort_swap(data, left, int((left + right) / 2))
- last = left
- for (i = left + 1; i <= right; i++)
- if (less_than(data[i], data[left]))
- quicksort_swap(data, ++last, i)
- quicksort_swap(data, left, last)
- quicksort(data, left, last - 1)
- quicksort(data, last + 1, right)
- }
- # quicksort_swap --- quicksort helper function, could be inline
- #
- function quicksort_swap(data, i, j, temp)
- {
- temp = data[i]
- data[i] = data[j]
- data[j] = temp
- }
- @
- @node Comparing index entries
- @subsection Comparing index entries
- The comparison function is the heart of the sorting algorithm. The
- comparison is based on the indexing rules, which are:
- @itemize @bullet
- @item
- All symbols first.
- @item
- Followed by digits.
- @item
- Followed by letters. Lowercase precedes uppercase and both ``a'' and
- ``A'' precede anything starting with ``b'' or ``B'' (etc.).
- @end itemize
- Implementing these rules is a little complicated. The first thing we
- need is a table that maps characters to comparison values. The
- following code is based on the original C @command{texindex}, although
- the actual comparison algorithm is more sophisticated.
- We set up an @code{Ordval} array to map characters to numeric values.
- Most characters map to their ASCII code. We add 512 to the value of
- each of the digits. Both uppercase and lowercase letters map to the
- same numeric value, which is the ASCII code for the uppercase letter
- plus 512. (This code should also work for EBCDIC systems,
- although @TeX{} does everything in ASCII, so it's not likely to make a
- difference.)
- The table must be built completely before changing the mapping of the
- letters, because all of the uppercase and lowercase letters must be in
- the table before we can change their values.
- @cindex @code{Ordval} array
- @<Work functions@>=
- BEGIN {
- for (i = 0; i < 256; i++) {
- c = sprintf("%c", i)
- Ordval[c] = i # map character to value
- if ((n = isdigit(c)) > 0) {
- Ordval[c] += 512
- }
- }
- # Give both 'A' and 'a' the same code
- i = Ordval["A"]
- j = Ordval["Z"]
- for (; i <= j; i++) {
- c = sprintf("%c", i)
- # In ASCII, 'A' is before 'a', so this is
- # the right order to check
- #
- # Checking isupper() lets this work for EBCDIC, too.
- if (isupper(c)) {
- Ordval[c] += 512
- Ordval[tolower(c)] = Ordval[c]
- }
- }
- }
- @
- Here is the @code{less_than()} function. It returns true if the
- @code{left} string is ``less than'' the @code{right} string.
- The comparison algorithm is not too complicated, once we define how
- things should work. We loop over each pair of characters in the
- @code{left} and @code{right} strings, comparing them one at a time.
- When comparing two characters, there are three cases, one of which has
- three subcases, as follows:
- @table @i
- @item Two letters
- @c nested table
- @table @i
- @item Same letter, but different case
- This is the complicated case. First, we want lowercase letters to be
- ordered before uppercase ones, even though this is the opposite of the
- natural ASCII ordering. To make this happen, we use a @samp{>}
- comparison instead of a @samp{<} comparison.
- Second, when two characters are equal, we have to look ahead at the next
- characters to decide whether to continue the loop or quit. As long as
- we are not at the end of the string, and at least one of the following
- characters in either string is a letter, we continue the loop.
- Otherwise we do the character comparison and return.
- @item Two different letters, but same case
- @itemx Two different letters, different case
- Use the comparison of the respective @code{Ordval} values.
- @end table
- @c end nested table
- @item A letter and something else
- @itemx Two nonletters
- Use the comparison of the respective @code{Ordval} values.
- @end table
- @noindent When the values are equal, continue around the loop. And, as
- usual, if one string is an initial substring of the other, that one is
- considered to be ``less than'' the other one.
- The rules just described produce @emph{better} results than did the C
- @command{texindex}. For example, @samp{beginfile()} sorts
- before @samp{BEGINFILE}, whereas with the C version they came out in the
- opposite order.
- @cindex @code{Ordval} array
- @cindex @code{char_split()} function
- @cindex @code{less_than()} function
- @cindex @code{isalpha()} function
- @<Work functions@>=
- function less_than(left, right, len_l, len_r, len, chars_l, chars_r)
- {
- len_l = length(left)
- len_r = length(right)
- len = (len_l < len_r ? len_l : len_r)
- char_split(left, chars_l)
- char_split(right, chars_r)
- for (i = 1; i <= len; i++) {
- if (isalpha(chars_l[i]) && isalpha(chars_r[i])) {
- # same char different case
- # upper case comes out last
- if (chars_l[i] != chars_r[i] &&
- tolower(chars_l[i]) == tolower(chars_r[i])) {
- if ( i != len \
- && (isalpha(chars_l[i+1]) \
- || isalpha(chars_r[i+1])))
- continue
- if (chars_l[i] > chars_r[i])
- return 1
- return 0
- }
- # same case, different char,
- # or different case, different char:
- # letter order wins
- if (Ordval[chars_l[i]] < Ordval[chars_r[i]])
- return 1
- if (Ordval[chars_l[i]] > Ordval[chars_r[i]])
- return 0
- # equal, keep going
- continue
- }
- # letter and something else, or two non-letters
- # letter order wins
- if (Ordval[chars_l[i]] < Ordval[chars_r[i]])
- return 1
- if (Ordval[chars_l[i]] > Ordval[chars_r[i]])
- return 0
- # equal, keep going
- }
- # equal so far, shorter one wins
- if (len_l < len_r)
- return 1
- return 0
- }
- @
- @node Necessary stuff
- @chapter Necessary stuff that isn't thrilling
- This chapter provides some necessary but unexciting elements.
- @menu
- * Copyright statement:: Copyright info.
- * Library functions:: From the @code{gawk} library: @file{ftrans.awk}, @code{join}.
- * Helper functions:: @code{del_array}, @code{check_split}, @code{fatal}, @dots{}
- * I18N:: Internationalization.
- @end menu
- @node Copyright statement
- @section Copyright statement
- @cindex copyright statement
- @cindex GNU General Public License
- @cindex License, GNU General Public
- @cindex GPL (GNU General Public License)
- Every program needs a copyright statement.
- @<GPL v3 copyright statement@>=
- #
- # Copyright 2014, 2015, 2016 Free Software Foundation, Inc.
- #
- # This file is part of GNU Texinfo.
- #
- # Texinfo is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 3 of the License, or
- # (at your option) any later version.
- #
- # Texinfo is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, see <http://www.gnu.org/licenses/>.
- @
- @node Library functions
- @section Library functions: @file{ftrans.awk}, @code{join}
- The program uses several library routines discussed in detail
- in the @command{gawk} documentation. The first sets up the
- infrastructure for the @code{beginfile()} and @code{endfile()} functions.
- @xref{Filetrans Function,,, gawk, GNU Awk User's Guide},
- for an explanation of how this function works.
- @cindex @file{ftrans.awk} library file
- @<Library functions@>=
- # ftrans.awk --- handle data file transitions
- #
- # user supplies beginfile() and endfile() functions
- #
- # Arnold Robbins, arnold@skeeve.com, Public Domain
- # November 1992
- FNR == 1 {
- if (_filename_ != "")
- endfile(_filename_)
- _filename_ = FILENAME
- beginfile(FILENAME)
- }
- END { endfile(_filename_) }
- @
- The next function is @code{join()}, which joins an array of characters
- back into a string. @xref{Join Function,,, gawk, GNU Awk User's Guide},
- for an explanation of how this function works.
- @cindex @file{join.awk} library file
- @cindex @code{join()} function
- @<Library functions@>=
- # join.awk --- join an array into a string
- #
- # Arnold Robbins, arnold@skeeve.com, Public Domain
- # May 1993
- function join(array, start, end, sep, result, i)
- {
- if (sep == "")
- sep = " "
- else if (sep == SUBSEP) # magic value
- sep = ""
- result = array[start]
- for (i = start + 1; i <= end; i++)
- result = result sep array[i]
- return result
- }
- @
- @node Helper functions
- @section Helper functions
- These helper functions make the main code easier to follow.
- @menu
- * @code{del_array}::
- * @code{check_split_null}::
- * @code{char_split}::
- * @code{fatal}::
- * @code{is@dots{}} functions::
- @end menu
- @node @code{del_array}
- @subsection @code{del_array}
- @code{del_array()} clears out an array.
- @cindex @code{del_array()} function
- @<Helper functions@>=
- function del_array(a)
- {
- # Portable and faster than
- # for (i in a)
- # delete a[i]
- split("", a)
- }
- @
- @node @code{check_split_null}
- @subsection @code{check_split_null}
- @code{check_split_null()} determines whether the @command{awk} running
- this program supports using the null string for the separator, splitting
- each character off into a separate element. If so, the return value
- will be the number of elements in the array, and it will be more than
- one. It is called at program startup.
- @cindex @code{check_split_null()} function
- @<Helper functions@>=
- function check_split_null( n, a)
- {
- n = split("abcde", a, "")
- return (n == 5)
- }
- @
- @node @code{char_split}
- @subsection @code{char_split}
- @code{char_split()} splits a string into separate characters, letting
- @command{awk} do the work if possible. If not, each character is
- extracted manually using a loop and @code{substr()}.
- @cindex @code{char_split()} function
- @cindex @code{Can_split_null} variable
- @cindex @code{del_array()} function
- @<Helper functions@>=
- function char_split(string, array, n, i)
- {
- if (Can_split_null)
- return split(string, array, "")
- # do it the hard way
- del_array(array)
- n = length(string)
- for (i = 1; i <= n; i++)
- array[i] = substr(string, i, 1)
- return n
- }
- @
- @node @code{fatal}
- @subsection @code{fatal}
- @cindex stderr
- The @code{fatal()} function prints a @code{printf}-formatted message to
- standard error and then exits badly.
- @cindex @code{fatal()} function
- @<Helper functions@>=
- function fatal(format, arg1, arg2, arg3, arg4, arg5,
- arg6, arg7, arg8, arg9, arg10)
- {
- printf(format, arg1, arg2, arg3, arg4, arg5,
- arg6, arg7, arg8, arg9, arg10) > "/dev/stderr"
- exit EXIT_FAILURE
- }
- @
- @node @code{is@dots{}} functions
- @subsection @code{is@dots{}} functions
- @cindex @code{isupper()} function
- @cindex @code{islower()} function
- @cindex @code{isalpha()} function
- @cindex @code{isdigit()} function
- The following functions help identify what a character is; they are
- similar in nature to the various macros in the C @code{<ctype.h>} header
- file. Since each one returns a count, the return value could be used to
- compute which character from the set was seen; this turned out not to be
- necessary in this program but might be useful in some other context.
- @<Helper functions@>=
- function isupper(c)
- {
- return index("ABCDEFGHIJKLMNOPQRSTUVWXYZ", c)
- }
- function islower(c)
- {
- return index("abcdefghijklmnopqrstuvwxyz", c)
- }
- function isalpha(c)
- {
- return islower(c) || isupper(c)
- }
- function isdigit(c)
- {
- return index("0123456789", c)
- }
- @
- @node I18N
- @section Internationalization
- For @command{gawk}, we can arrange for the various messages, e.g., in
- the @code{usage()} and @code{version()} functions, to be translated. We
- do this by setting the text domain at startup. For more information on
- internationalization in @command{gawk},
- @pxref{Internationalization.html,,, gawk, GNU Awk User's Guide}.
- @<Initial setup@>=
- TEXTDOMAIN = "texinfo"
- @
- @noindent On non-GNU versions of @command{awk}, this is a harmless
- assignment, and the @code{_"..."} construct below is a harmless
- concatenation of an unassigned variable @code{_}, i.e., the empty
- string.
- The @code{usage()} and @code{version()} functions print the necessary
- information and then exit. The strings that can and should be
- translated are prefixed with an underscore.
- @cindex @code{Texindex_version} variable
- @cindex @code{usage()} function
- @cindex @code{version()} function
- @tex
- % avoid useless warnings/overfull boxes in these long strings
- \global\hfuzz=\maxdimen
- @end tex
- @<Helper functions@>=
- function usage(exit_val)
- {
- printf(_"Usage: %s [OPTION]... FILE...\n", Invocation_name)
- print _"Generate a sorted index for each TeX output FILE."
- print _"Usually FILE... is specified as `foo.??' for a document `foo.texi'."
- print ""
- print _"Options:"
- print _" -h, --help display this help and exit"
- print _" --version display version information and exit"
- print _" -- end option processing"
- print ""
- print _"Email bug reports to bug-texinfo@gnu.org,\n\
- general questions and discussion to help-texinfo@gnu.org.\n\
- Texinfo home page: http://www.gnu.org/software/texinfo/";
- exit exit_val
- }
- function version()
- {
- print "texindex (GNU texinfo)", Texindex_version
- print ""
- printf _"Copyright (C) %s Free Software Foundation, Inc.\n\
- License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n\
- This is free software: you are free to change and redistribute it.\n\
- There is NO WARRANTY, to the extent permitted by law.\n", "2017";
- exit EXIT_SUCCESS
- }
- @
- @node Index
- @unnumbered Index
- @printindex cp
- @bye
|