A.2.2 Producing the Data Table Con’t
A.2.2.2 Listing of TABBUILDPROGRAM TABBUILD (TEXTFILE, OUTPUT,
{ total table and one for each character }
T_TABLE, B_TABLE, L_TABLE, N_TABLE,
J_TABLE, R_TABLE, S_TABLE, NA_TABLE,
{ ditto for indexes }
T_INDEX, B_INDEX, L_INDEX, N_INDEX,
J_INDEX, R_INDEX, S_INDEX, NA_INDEX);
CONST WLENGTH = 20; { maximum word length }
TYPE TOKEN_TYPE = PACKED ARRAY [1..WLENGTH] OF CHAR;
PCH2 = PACKED ARRAY [1..2] OF CHAR;
PCH3 = PACKED ARRAY [1..3] OF CHAR;
REFLINK = ^REFREC;
{ record for each word token encountered }
REFREC = RECORD
SEQNO : INTEGER;
ID : PCH2;
SECTNO : PCH2;
PAGENO : PCH3;
LINENO : INTEGER;
NEXTREF : REFLINK { i.e. a linked list }
END;
PTR = ^TREE_NODE;
{ record for each word type encountered }
TREE_NODE = RECORD
WORD : TOKEN_TYPE;
COUNT : INTEGER;
LENGTH : INTEGER;
LPTR, RPTR : PTR;
FIRSTREF : REFLINK; { ptr to a refrec }
LASTREF : REFLINK
END;
INDEX_RECORD = RECORD
KEYWORD : [KEY] TOKEN_TYPE;
TABRECNO : INTEGER;
END;
INDEX = FILE OF INDEX_RECORD;
BRANCHES = (LEFT, RIGHT);
VAR TEXTFILE : TEXT; { input text }
{ output tables }
B_TABLE, L_TABLE, N_TABLE : TEXT;
J_TABLE, R_TABLE, S_TABLE : TEXT;
T_TABLE, NA_TABLE, NULL_TABLE : TEXT;
{ output indexes }
B_INDEX, L_INDEX, N_INDEX : INDEX;
J_INDEX, R_INDEX, S_INDEX : INDEX;
T_INDEX, NA_INDEX, NULL_INDEX : INDEX;
{ total counts }
B_COUNT, L_COUNT, N_COUNT : INTEGER;
J_COUNT, R_COUNT, S_COUNT : INTEGER;
T_COUNT, NA_COUNT, NULL_COUNT : INTEGER;
{ current output records numbers for use with indexes }
B_CURRECNO, L_CURRECNO, N_CURRECNO : INTEGER;
J_CURRECNO, R_CURRECNO, S_CURRECNO : INTEGER;
T_CURRECNO, NA_CURRECNO, NULL_CURRECNO : INTEGER;
LENGTH : INTEGER;
LINENO : INTEGER;
SEQNO : INTEGER; { unique sequence no. for all tokens }
NSTCH : SET OF CHAR;
CHCODE : INTEGER;
CURRENT_TOKEN : TOKEN_TYPE;
BLANK_TOKEN : TOKEN_TYPE; { blank token }
{ pointers to the roots of trees }
B_F_ENTRY, L_F_ENTRY, N_F_ENTRY : PTR;
J_F_ENTRY, R_F_ENTRY, S_F_ENTRY : PTR;
T_F_ENTRY, NA_F_ENTRY, NULL_F_ENTRY : PTR;
ENTRY : PTR;
REF : REFLINK;
SC, OLDSC : PCH2; { current and previous speech codes }
PREV_WORD_AT_EOLN : BOOLEAN;
SECTNO : PCH2;
PAGENO : PCH3;
VALUE BLANK_TOKEN := (WLENGTH OF ' ');
{ characters not to be stored either as part of tokens,
or as tokens in their own right }
NSTCH := ['.','-',',','"','''',';',
':','!','?','(',')',' '];
{**************************************************************}
PROCEDURE INITIALIZE;
{ FUNCTION: Sets up initial variables
CALLED BY: Tabbuild
CALLS: Nothing }
BEGIN { Initialize }
NSTCH := NSTCH + [CHR(12)]; { add in form feed char }
SEQNO := 0;
PREV_WORD_AT_EOLN := FALSE;
{ blank out total counts }
T_COUNT := 0;
B_COUNT := 0;
L_COUNT := 0;
N_COUNT := 0;
J_COUNT := 0;
R_COUNT := 0;
S_COUNT := 0;
NA_COUNT := 0;
{ set pointers to start of each tree to NIL }
T_F_ENTRY := NIL;
B_F_ENTRY := NIL;
L_F_ENTRY := NIL;
N_F_ENTRY := NIL;
J_F_ENTRY := NIL;
R_F_ENTRY := NIL;
S_F_ENTRY := NIL;
NA_F_ENTRY := NIL;
SC := ' '; { blank out speech code };
OLDSC := ' ';
LINENO := 0;
PAGENO := '***';
{ set output record numbers for each index to 0 }
T_CURRECNO := 0;
B_CURRECNO := 0;
L_CURRECNO := 0;
N_CURRECNO := 0;
J_CURRECNO := 0;
R_CURRECNO := 0;
S_CURRECNO := 0;
NA_CURRECNO := 0;
{ open textfile to be processed }
OPEN (FILE_VARIABLE := TEXTFILE,
FILE_NAME := 'TEXTFILE',
HISTORY := READONLY,
ACCESS_METHOD := SEQUENTIAL,
RECORD_TYPE := VARIABLE,
ORGANIZATION := SEQUENTIAL);
RESET (TEXTFILE);
END; { Initialize }
{----------------------------------------------------}
PROCEDURE SPACE (VAR OUTFILE : TEXT);
{ Current output file }
{ FUNCTION: Writes a space in the current output file
CALLED BY: Write_entry, Write_Refline
CALLS: Nothing }
BEGIN { Space }
WRITE (OUTFILE, ' ')
END; { Space }
{----------------------------------------------------}
PROCEDURE GET_TOKEN (VAR NEW_TOKEN : TOKEN_TYPE);
{ Token into which word is put }
{ FUNCTION: Gets a single word from the input file
CALLED BY: Tabbuild
CALLS: Nothing }
VAR POSN : INTEGER;
CH : CHAR;
BEGIN { Get_Token}
{ initialize }
NEW_TOKEN := BLANK_TOKEN;
POSN := 1;
{ check conditions associated with changing lines }
IF PREV_WORD_AT_EOLN THEN { previous token at end of line }
BEGIN
PREV_WORD_AT_EOLN := FALSE;
LINENO := LINENO + 1; { so increment line number }
END;
{ process token }
REPEAT { until ch = ' ' or eof }
READ (TEXTFILE, CH);
{ check current character }
IF CH IN NSTCH THEN { process any punctuation correctly }
BEGIN
IF CH = '"' THEN { ignore ( and do not store) all other
NSTCH characters }
IF SC = ' ' THEN
SC := OLDSC { resume current speech code }
ELSE
BEGIN { save current speech code }
OLDSC := SC;
SC := ' ';
END;
END
ELSE { part of word }
BEGIN
NEW_TOKEN[POSN] := CH; { place current ch in array }
POSN := POSN + 1;
END;
{ if at end of current line, set global flag for next
invocation of GET_TOKEN }
IF EOLN (TEXTFILE) THEN PREV_WORD_AT_EOLN := TRUE;
UNTIL (CH = ' ') OR EOF (TEXTFILE);
LENGTH := POSN - 1;
END; { Get_Token }
{------------------------------------------------}
PROCEDURE NEW_ENTRY (NEW_WORD : TOKEN_TYPE;
{ New word to be inserted }
BRANCH : BRANCHES;
{ Where this node will be linked
relative to its parent }
PREVNODE : PTR;
{ Pointer to parent of this node }
VAR CUR_F_ENTRY : PTR);
{ Root of this tree }
{ FUNCTION: Creates and fills a new entry
CALLED BY: Putintree
CALLS: Nothing }
BEGIN { New_Entry }
NEW (ENTRY); { create a new record }
ENTRY^.WORD := NEW_WORD;
ENTRY^.COUNT := 1;
ENTRY^.LENGTH := LENGTH;
ENTRY^.LPTR := NIL;
ENTRY^.RPTR := NIL;
NEW (REF);
ENTRY^.FIRSTREF := REF;
REF^.SEQNO := SEQNO;
REF^.ID := SC;
REF^.SECTNO := SECTNO;
REF^.PAGENO := PAGENO;
REF^.LINENO := LINENO;
REF^.NEXTREF := NIL;
ENTRY^.LASTREF := REF; { set pointer to end of list }
IF CUR_F_ENTRY = NIL THEN
CUR_F_ENTRY := ENTRY { if this is first node }
ELSE
IF BRANCH = LEFT THEN
PREVNODE^.LPTR := ENTRY
ELSE { BRANCH = RIGHT }
PREVNODE^.RPTR := ENTRY
END; { New_entry }
{-------------------------------------------}
FUNCTION SEARCH (SEARCH_WORD : TOKEN_TYPE;
{ Word to be searched for }
VAR BRANCH : BRANCHES;
{ Gets set to branch search went down}
VAR LINK,
{ Pointer to current node in tree }
PREVNODE : PTR)
{ Pointer to node last searched }
: BOOLEAN; { Function returns boolean }
{ FUNCTION: Checks to see if a word is already in the tree
CALLED BY: Putintree
CALLS: Nothing }
VAR FOUND : BOOLEAN;
BEGIN { Search }
FOUND := FALSE;
REPEAT
{ until next node to be searched doen't exist or word found }
IF SEARCH_WORD < LINK^.WORD THEN { before }
BEGIN
BRANCH := LEFT;
PREVNODE := LINK;
LINK := LINK^.LPTR;
END
ELSE
IF SEARCH_WORD > LINK^.WORD THEN { after }
BEGIN
BRANCH := RIGHT;
PREVNODE := LINK;
LINK := LINK^.RPTR;
END
ELSE { found }
FOUND := TRUE;
UNTIL (LINK = NIL) OR (FOUND = TRUE);
SEARCH := FOUND;
END; { Search }
{--------------------------------------------------}
PROCEDURE PUTINTREE (NEW_WORD : TOKEN_TYPE;
{ Word to be put in tree }
VAR CUR_F_ENTRY : PTR;
{ Start of current tree }
VAR CUR_COUNT : INTEGER);
{ Total word count }
{ FUNCTION: Puts a word in tree
CALLED BY: Select_Tree
CALLS: Search, New_Entry }
VAR LINK, PREVNODE : PTR;
BRANCH : BRANCHES;
CH : CHAR;
FOUND : BOOLEAN;
BEGIN { Putintree }
CUR_COUNT := CUR_COUNT + 1;
SEQNO := SEQNO + 1;
BRANCH := LEFT; { actual value doesn't matter }
{ uncapitalise first letters }
CHCODE := ORD (NEW_WORD[1]);
IF (CHCODE > 64) AND (CHCODE < 91) THEN
NEW_WORD[1] := CHR (CHCODE + 32);
IF CUR_F_ENTRY = NIL THEN { no nodes in tree }
NEW_ENTRY(NEW_WORD, BRANCH, NIL, CUR_F_ENTRY)
{ left value irrelevant }
ELSE
BEGIN { tree must be searched }
LINK := CUR_F_ENTRY;
FOUND := SEARCH (NEW_WORD, BRANCH, LINK, PREVNODE);
IF FOUND THEN { word found }
BEGIN
LINK^.COUNT := LINK^.COUNT + 1; { increment count }
NEW (REF); { create new reference }
LINK^.LASTREF^.NEXTREF := REF; { link into reflist }
LINK^.LASTREF := REF; { make lastref point to curr. ref }
REF^.SEQNO := SEQNO;
REF^.ID := SC; { set current speech code }
REF^.SECTNO := SECTNO;
REF^.PAGENO := PAGENO;
REF^.LINENO := LINENO;
REF^.NEXTREF := NIL;
END
ELSE
NEW_ENTRY (NEW_WORD, BRANCH, PREVNODE, CUR_F_ENTRY);
END
END; { Putintree }
{---------------------------------------------}
PROCEDURE SELECT_TREE (NEW_WORD : TOKEN_TYPE);
{ Word to be put in tree }
{ FUNCTION: Selects tree for each word
CALLED BY: Main
CALLS: Putintree }
BEGIN
{ put into overall tree }
PUTINTREE (NEW_WORD, T_F_ENTRY, T_COUNT);
{ then put into separate trees }
IF SC = 'B ' THEN
PUTINTREE (NEW_WORD, B_F_ENTRY, B_COUNT)
ELSE
IF SC = 'L ' THEN
PUTINTREE (NEW_WORD, L_F_ENTRY, L_COUNT)
ELSE
IF SC = 'N ' THEN
PUTINTREE (NEW_WORD, N_F_ENTRY, N_COUNT)
ELSE
IF SC = 'J ' THEN
PUTINTREE (NEW_WORD, J_F_ENTRY, J_COUNT)
ELSE
IF SC = 'R ' THEN
PUTINTREE (NEW_WORD, R_F_ENTRY, R_COUNT)
ELSE
IF SC = 'S ' THEN
PUTINTREE (NEW_WORD, S_F_ENTRY, S_COUNT)
ELSE
IF SC = 'NA' THEN
PUTINTREE (NEW_WORD, NA_F_ENTRY, NA_COUNT)
ELSE
IF SC = ' ' THEN
PUTINTREE (NEW_WORD, NULL_F_ENTRY, NULL_COUNT);
END;
{--------------------------------------------}
PROCEDURE PROCESS_TOKEN (NEW_TOKEN : TOKEN_TYPE);
{ Input token }
{ FUNCTION: Filters out special tokens
CALLED BY: Main
CALLS: Select_Tree }
VAR TEMPNO : PCH3;
CH, FIRST_CHAR : CHAR;
TINDEX, PINDEX : INTEGER;
BEGIN
FIRST_CHAR := NEW_TOKEN[1];
CASE FIRST_CHAR OF
'@' : { section number token }
BEGIN { set new section no. }
SECTNO[1] := NEW_TOKEN[2];
SECTNO[2] := NEW_TOKEN[3];
LINENO := LINENO - 1; { ignore line with chap. no. }
END;
'*' : { page number token }
BEGIN { set new page no. }
TEMPNO[1] := NEW_TOKEN[2];
TEMPNO[2] := NEW_TOKEN[3];
TEMPNO[3] := NEW_TOKEN[4];
{ right justify pageno }
TINDEX := 3;
REPEAT
CH := TEMPNO[TINDEX];
IF CH = ' ' THEN TINDEX := TINDEX - 1;
UNTIL CH <> ' ';
PINDEX := 3;
PAGENO := ' ';
REPEAT
PAGENO[PINDEX] := TEMPNO[TINDEX];
PINDEX := PINDEX - 1;
TINDEX := TINDEX - 1;
UNTIL TINDEX = 0;
LINENO := 0; { ignore line containing page no. }
END;
'#' : { speech code token }
BEGIN { set new speech code }
SC[1] := NEW_TOKEN[2];
SC[2] := NEW_TOKEN[3];
END;
' ' : { blank - do nothing };
OTHERWISE SELECT_TREE (NEW_TOKEN) { word to be put into tree }
END; { Case }
END; { Process_Token }
{--------------------------------------------}
PROCEDURE INDEX_ENTRY (KEYWORD : TOKEN_type;
{ Keyword for current index entry }
VAR CUR_INDEX : INDEX;
{ Current index file }
NOFRECS : INTEGER;
{ Number of records for this entry }
VAR CUR_CURRECNO : INTEGER);
{ Current record number in current
table output file }
{ FUNCTION: Write an index entry for the current table record
CALLED BY: Write_Entry
CALLS: Nothing }
VAR CUR_ENTRY : INDEX_RECORD;
BEGIN
CUR_ENTRY.KEYWORD := KEYWORD;
CUR_ENTRY.TABRECNO := CUR_CURRECNO;
WRITE (CUR_INDEX, CUR_ENTRY);
CUR_CURRECNO := CUR_CURRECNO + NOFRECS;
END;
{---------------------------------------------}
PROCEDURE WRITE_ENTRY (CUR_TREE : PTR;
{ Current tree for output }
VAR CUR_TABLE : TEXT;
{ Table to be created }
VAR CUR_INDEX : INDEX;
{ Index to be created }
VAR CUR_CURRECNO : INTEGER;
{ Current output record }
CUR_COUNT : INTEGER);
{ Count of total number of
tokens for this table}
{ FUNCTION: Write a single tree node
CALLED BY: Do_Strip
CALLS: Index_Entry }
VAR FREQ : REAL;
NOFRECS : INTEGER;
CURREF : REFLINK;
REFS_WRITTEN : INTEGER;
BEGIN { Write_entry }
NOFRECS := 0;
REFS_WRITTEN := 0;
WRITE (CUR_TABLE, '*'); { start of type marker }
WRITE (CUR_TABLE, CUR_TREE^.WORD:WLENGTH);
WRITE (CUR_TABLE, CUR_TREE^.COUNT:5); { count for whole text }
SPACE (CUR_TABLE); { frequency for whole text }
FREQ := CUR_TREE^.COUNT / CUR_COUNT;
WRITE (CUR_TABLE, FREQ:10:6);
SPACE (CUR_TABLE);
WRITE (CUR_TABLE, CUR_TREE^.LENGTH:2);
WRITELN (CUR_TABLE); { first line contains main info only }
NOFRECS := NOFRECS + 1;
CURREF := CUR_TREE^.FIRSTREF;
REPEAT { until end of ref. list }
WRITE (CUR_TABLE, CURREF^.SEQNO:5);
SPACE (CUR_TABLE);
WRITE (CUR_TABLE, CURREF^.ID);
SPACE (CUR_TABLE);
WRITE (CUR_TABLE, CURREF^.SECTNO);
SPACE (CUR_TABLE);
WRITE (CUR_TABLE, CURREF^.PAGENO);
SPACE (CUR_TABLE);
WRITE (CUR_TABLE, CURREF^.LINENO:2);
SPACE (CUR_TABLE);
REFS_WRITTEN := REFS_WRITTEN + 1;
CURREF := CURREF^.NEXTREF;
IF CURREF = NIL THEN { last reference }
WRITE (CUR_TABLE, '$') { $ to terminate list of refs }
ELSE { separate references }
WRITE (CUR_TABLE, '/'); { use / as ref separator }
IF (REFS_WRITTEN MOD 2) = 0 THEN { at end of current line }
BEGIN
WRITELN (CUR_TABLE); { start new line if 2 refs written }
NOFRECS := NOFRECS + 1;
END
UNTIL CURREF = NIL;
IF (REFS_WRITTEN MOD 2) = 1 THEN
BEGIN
WRITELN (CUR_TABLE, ' '); { pad table }
NOFRECS := NOFRECS + 1
END;
INDEX_ENTRY (CUR_TREE^.WORD, CUR_INDEX, NOFRECS, CUR_CURRECNO);
END; { Write_entry }
{---------------------------------------------}
PROCEDURE DO_STRIP (CUR_TREE : PTR;
{ Tree to be stripped }
VAR CUR_TABLE : TEXT;
{ Table for output }
VAR CUR_INDEX : INDEX;
{ Index for this table }
VAR CUR_CURRECNO,
{ Current output record number }
CUR_COUNT : INTEGER);
{ Total count for this tree }
{ FUNCTION: Recursively Strip Tree
CALLED BY: StripTree
CALLS: Itself, Write_Entry }
BEGIN { Do_Strip }
IF CUR_TREE <> NIL THEN
BEGIN
DO_STRIP(CUR_TREE^.LPTR, CUR_TABLE, CUR_INDEX,
CUR_CURRECNO, CUR_COUNT); { left branch }
WRITE_ENTRY (CUR_TREE, CUR_TABLE, CUR_INDEX,
CUR_CURRECNO, CUR_COUNT); { current node }
DO_STRIP(CUR_TREE^.RPTR, CUR_TABLE, CUR_INDEX,
CUR_CURRECNO, CUR_COUNT); { left branch }
END;
END; { Do_Strip }
{-------------------------------------------}
PROCEDURE STRIPTREE (CUR_TREE : PTR;
{ Tree to be stripped }
VAR CUR_TABLE : TEXT;
{ Table for output }
VAR CUR_INDEX : INDEX;
{ Index for this table }
VAR CUR_CURRECNO,
{ Current output record number }
CUR_COUNT : INTEGER);
{ Total count for this tree }
{ FUNCTION: Strip a tree into a table and index it
CALLED BY: Strip_All_Trees
CALLS: Do_Strip }
BEGIN { Striptree }
OPEN (FILE_VARIABLE := CUR_TABLE,
HISTORY := NEW,
RECORD_LENGTH := 40,
ACCESS_METHOD := SEQUENTIAL,
RECORD_TYPE := FIXED,
ORGANIZATION := SEQUENTIAL);
REWRITE (CUR_TABLE);
OPEN (FILE_VARIABLE := CUR_INDEX,
HISTORY := NEW,
ACCESS_METHOD := KEYED,
RECORD_TYPE := FIXED,
ORGANIZATION := INDEXED);
REWRITE (CUR_INDEX);
DO_STRIP(CUR_TREE, CUR_TABLE, CUR_INDEX,
CUR_CURRECNO, CUR_COUNT);
CLOSE (CUR_TABLE);
CLOSE (CUR_INDEX);
END; { Striptree }
{--------------------------------------}
PROCEDURE STRIP_ALL_TREES;
{ FUNCTION: Make calls to strip all trees
CALLED BY: Main
CALLS: Striptree }
BEGIN
STRIPTREE (T_F_ENTRY, T_TABLE, T_INDEX, T_CURRECNO, T_COUNT);
STRIPTREE (B_F_ENTRY, B_TABLE, B_INDEX, B_CURRECNO, B_COUNT);
STRIPTREE (L_F_ENTRY, L_TABLE, L_INDEX, L_CURRECNO, L_COUNT);;
STRIPTREE (N_F_ENTRY, N_TABLE, N_INDEX, N_CURRECNO, N_COUNT);
STRIPTREE (J_F_ENTRY, J_TABLE, J_INDEX, J_CURRECNO, J_COUNT);
STRIPTREE (R_F_ENTRY, R_TABLE, R_INDEX, R_CURRECNO, R_COUNT);
STRIPTREE (S_F_ENTRY, S_TABLE, S_INDEX, S_CURRECNO, S_COUNT);
STRIPTREE (NA_F_ENTRY, NA_TABLE, NA_INDEX, NA_CURRECNO,
NA_COUNT);
END;
{******************************************}
BEGIN { Tabbuild }
INITIALIZE;
REPEAT { until end of text }
GET_TOKEN (CURRENT_TOKEN); { get a token }
PROCESS_TOKEN (CURRENT_TOKEN)
UNTIL EOF (TEXTFILE);
STRIP_ALL_TREES;
END.{ Tabbuild }
[ Back to Site Index / Roadmap ] [ Table of Contents ] [ E-mail Author ]
Last modified: Monday, 18-Sep-2017 03:30:02 AEST