summaryrefslogtreecommitdiff
path: root/src/SearchUnit.pas
blob: 56600e3d057c410965d1df54d67dbf22c2ccfb3f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
Unit SearchUnit;

{$mode objfpc}{$H+}

// NewView - a new OS/2 Help Viewer
// Copyright 2001 Aaron Lawrence (aaronl at consultant dot com)
// This software is released under the Gnu Public License - see readme.txt

Interface

// Contains code to search help files.

uses
  Classes,
  HelpFile, TextSearchQuery, DataTypes;

const
  // match weightings
  // Multiplier applied by SearchTopicTitles when the search word
  // matches the first word of a topic title.
  mwFirstTitleWord = 50;
  // Multiplier applied by SearchTopicTitles when the search word
  // matches any later word of a topic title.
  mwTitleWord = 20;
  // Multiplier applied by SearchIndex when the search word
  // matches the first word of an index entry.
  mwFirstIndexWord = 20;
  // Multiplier applied by SearchIndex when the search word
  // matches any later word of an index entry.
  mwIndexWord = 10;
  // Weighting for a search word found in topic body text.
  mwTopicTextWord = 1;

  // Note on weightings: the title/index weightings
  // are multiplied by word weightings.
  // Topic text matches are equal to the number of occurrences
  // of the word in the topic times the word weighting.

type
  // Kinds of word match.
  // NOTE(review): not referenced in the visible part of this unit;
  // presumably used by other units - confirm before changing.
  TSearchType = ( stStarts, stContains, stMatches );

  // Runs every term of Query against HelpFile (topic text, topic
  // titles and index entries) and appends each topic that was found
  // and not excluded to Results.
  // HighlightWords may be nil; when it is not nil it is cleared and
  // then receives, per dictionary word, the summed relevances of the
  // terms combined with cmAnd or cmOr. It is indexed by dictionary
  // word, so it must hold HelpFile.DictionaryCount entries.
  procedure SearchHelpFile( HelpFile: THelpFile;
                            Query: TTextSearchQuery;
                            Results: TList;
                            HighlightWords: Int32ArrayPointer );


Implementation

uses
  SysUtils,
//  ACLUtility, ACLStringUtility,
  HelpTopic, CompareWordUnit, nvUtilities;

// Scores every word of the help file dictionary against SearchWord.
// SearchWord is upper-cased before comparison.
// On return Results^[ i ] holds the value CompareWord gave for
// dictionary word i; Results must have room for
// HelpFile.DictionaryCount entries.
procedure SearchDictionary( HelpFile: THelpFile;
                            SearchWord: string;
                            Results: Int32ArrayPointer );
var
  WordNum: integer;
  Candidate: string;
  UpperSearchWord: string;
begin
  UpperSearchWord := UpperCase( SearchWord );

  // Start from a clean slate; every slot is then rewritten below.
  FillInt32Array( Results, HelpFile.DictionaryCount, 0 );

  for WordNum := 0 to HelpFile.DictionaryCount - 1 do
  begin
    Candidate := HelpFile.DictionaryWords[ WordNum ];
    Results^[ WordNum ] := CompareWord( UpperSearchWord, Candidate );
  end;
end;

// Scores topics by matching SearchWord against the words of
// their titles. Titles are split on spaces. Each matching word adds
// (weighting * CompareWord result) to Results^[ topic index ], where
// the weighting is mwFirstTitleWord for the first word of the title
// and mwTitleWord for any other word.
// Results is added to, not cleared, so earlier scores are kept.
procedure SearchTopicTitles( HelpFile: THelpFile;
                             SearchWord: string;
                             Results: Int32ArrayPointer );
var
  TopicNum: longint;
  Remaining: string;
  CurrentWord: string;
  AtFirstWord: boolean;
  Score: longint;
  Weight: longint;
begin
  for TopicNum := 0 to HelpFile.TopicCount - 1 do
  begin
    Remaining := HelpFile.Topics[ TopicNum ].Title;
    AtFirstWord := true;

    // ExtractNextValue is relied on to consume the returned word
    // from Remaining; the loop ends once nothing is left.
    while Remaining <> '' do
    begin
      CurrentWord := ExtractNextValue( Remaining, ' ' );
      Score := CompareWord( SearchWord, CurrentWord );
      if Score > 0 then
      begin
        // matching the first word is best
        if AtFirstWord then
          Weight := mwFirstTitleWord
        else
          Weight := mwTitleWord;
        inc( Results^[ TopicNum ], Weight * Score );
      end;
      AtFirstWord := false;
    end;
  end;
end;

// Scores topics by matching SearchWord against the words of
// the help file's index entries. Entries are split on spaces. Each
// matching word adds (weighting * CompareWord result) to the Results
// slot of the topic attached to the entry, where the weighting is
// mwFirstIndexWord for the first word of the entry and mwIndexWord
// for any other word.
// Results is added to, not cleared, so earlier scores are kept.
procedure SearchIndex( HelpFile: THelpFile;
                       SearchWord: string;
                       Results: Int32ArrayPointer );
var
  EntryNum: longint;
  Remaining: string;
  CurrentWord: string;
  EntryTopic: TTopic;
  AtFirstWord: boolean;
  Score: longint;
  Weight: longint;
begin
  for EntryNum := 0 to HelpFile.Index.Count - 1 do
  begin
    // The topic an entry points at is stored as the entry's object.
    EntryTopic := HelpFile.Index.Objects[ EntryNum ] as TTopic;
    Remaining := HelpFile.Index[ EntryNum ];
    AtFirstWord := true;

    // ExtractNextValue is relied on to consume the returned word
    // from Remaining; the loop ends once nothing is left.
    while Remaining <> '' do
    begin
      CurrentWord := ExtractNextValue( Remaining, ' ' );
      Score := CompareWord( SearchWord, CurrentWord );
      if Score > 0 then
      begin
        // matching the first word is best
        if AtFirstWord then
          Weight := mwFirstIndexWord
        else
          Weight := mwIndexWord;
        inc( Results^[ EntryTopic.Index ], Weight * Score );
      end;
      AtFirstWord := false;
    end;
  end;
end;

// Utility function used in decompression of search table.
// Updates the appropriate entry in Results array.
// The word being matched is given in DictIndex and is
// used to count the actual occurrences of the word
// within the topic
{procedure AddTopicFoundInTopicText( TopicIndex: int16;
                                    Results: Int32ArrayPointer;
                                     DictIndex: longint;
                                              WordRelevance: longint );
var
  Topic: TTopic;
  Relevance: longint;
begin
  Topic:= _Topics[ TopicIndex ];
  Relevance := mwTopicTextWord
            * Topic.CountWord( DictIndex )
            * WordRelevance;
  inc( Results^[ TopicIndex ], Relevance );

end;}

// ------------------------------------------------------

// Master search function. Given a search query,
// searches topic text, titles, index entries.
// Matching topics are added to Results, with their
// SearchRelevance set appropriately.
//
// HelpFile       - help file to search; the FoundInSearch,
//                  ExcludedInSearch and SearchRelevance fields of
//                  all its topics are reset and then updated.
// Query          - the terms to look for, each with a combine method
//                  (cmAnd / cmOr / cmNot).
// Results        - receives every topic that was found by at least
//                  one cmAnd/cmOr term and not excluded by any term.
//                  The list is appended to, not cleared.
// HighlightWords - optional (may be nil). When given it is cleared
//                  and then accumulates, per dictionary word, the
//                  relevances of all cmAnd/cmOr terms.
procedure SearchHelpFile( HelpFile: THelpFile;
                          Query: TTextSearchQuery;
                          Results: TList;
                          HighlightWords: Int32ArrayPointer );
var
  Topic: TTopic;
  TopicIndex: longint;
  TermIndex: longint;
  Term: TSearchTerm;
  TopicMatches: Int32ArrayPointer;
  TopicRelevancesForTerm: Int32ArrayPointer;
  TopicMatchedTerm: boolean;

  WordRelevance: longint;
  DictionaryRelevances: Int32ArrayPointer;
  DictIndex: longint;
  TopicRelevanceForTerm: longint;
  TopicWordCount: longint;

  DictionaryArraySize: longint;
  TopicArraySize: longint;
begin
  // Reset flags per topic
  for TopicIndex := 0 to HelpFile.TopicCount - 1 do
  begin
    Topic := HelpFile.Topics[ TopicIndex ];
    Topic.FoundInSearch := false;
    Topic.ExcludedInSearch := false;
    Topic.SearchRelevance := 0;
  end;

  if HighlightWords <> nil then
    // Clear the highlightwords array
    FillInt32Array( HighlightWords, HelpFile.DictionaryCount, 0 );

  // Work out the buffer sizes once so that allocation and
  // release are guaranteed to use the same values.
  DictionaryArraySize := HelpFile.DictionaryCount * sizeof( longint );
  TopicArraySize := HelpFile.TopicCount * sizeof( longint );

  DictionaryRelevances := nil;
  TopicMatches := nil;
  TopicRelevancesForTerm := nil;
  try
    // Get memory for dictionary/topic relevance arrays
    GetMem( DictionaryRelevances, DictionaryArraySize );
    GetMem( TopicMatches, TopicArraySize );
    GetMem( TopicRelevancesForTerm, TopicArraySize );

    for TermIndex := 0 to Query.TermCount - 1 do
    begin
      Term := Query.Term[ TermIndex ];

      FillInt32Array( TopicRelevancesForTerm, HelpFile.TopicCount, 0 );

      // Search the dictionary for matches.
      SearchDictionary( HelpFile, Term.Text, DictionaryRelevances );

      // Update the highlight words array.
      // (effectively an OR)
      if HighlightWords <> nil then
      begin
        if Term.CombineMethod in [ cmAnd, cmOr ] then
        begin
          for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
            inc( HighlightWords^[ DictIndex ],  DictionaryRelevances^[ DictIndex ] );
        end;
      end;

      // For each word in the dictionary that matches
      // this search word, search topics/titles/index
      for DictIndex := 0 to HelpFile.DictionaryCount - 1 do
      begin
        WordRelevance := DictionaryRelevances^[ DictIndex ];
        if WordRelevance > 0 then
        begin
          // Search for occurrences of this word
          // within the text of topics
          HelpFile.SearchTable.Search( DictIndex, TopicMatches );

          // Work out total relevance for each topic found:
          for TopicIndex := 0 to HelpFile.TopicCount - 1 do
          begin
            if TopicMatches^[ TopicIndex ] > 0 then
            begin
              // Search table indicates word occurs in
              // this topic, so count number of
              // occurrences to get relevance
              Topic := HelpFile.Topics[ TopicIndex ];

              TopicWordCount := Topic.CountWord( DictIndex );

              // Accumulate rather than assign: one search term can
              // match several dictionary words, and each of them
              // must contribute to the topic's score for this term.
              inc( TopicRelevancesForTerm^[ TopicIndex ],
                   mwTopicTextWord * TopicWordCount * WordRelevance );
            end;
          end;
        end;
      end;

      // Search titles and index
      SearchTopicTitles( HelpFile, Term.Text, TopicRelevancesForTerm );
      SearchIndex( HelpFile, Term.Text, TopicRelevancesForTerm );

      // Set match flags for each topic, marking
      // as found or excluded depending on combine
      // method
      for TopicIndex := 0 to HelpFile.TopicCount - 1 do
      begin
        Topic := HelpFile.Topics[ TopicIndex ];
        TopicRelevanceForTerm := TopicRelevancesForTerm^[ TopicIndex ];
        TopicMatchedTerm := TopicRelevanceForTerm > 0;
        case Term.CombineMethod of
         cmAnd:
           if not TopicMatchedTerm then
             Topic.ExcludedInSearch := true
           else
             Topic.FoundInSearch := true;

         cmNot:
           if TopicMatchedTerm then
             Topic.ExcludedInSearch := true;

         cmOr:
           if TopicMatchedTerm then
             Topic.FoundInSearch := true;
        end;
        if TopicMatchedTerm then
          inc( Topic.SearchRelevance, TopicRelevanceForTerm );
      end;

      // loop for next word...
    end;

    // Now find topics that DID have a match
    // and did NOT have an exclusion match
    // ... add the topic to result list
    for TopicIndex := 0 to HelpFile.TopicCount - 1 do
    begin
      Topic := HelpFile.Topics[ TopicIndex ];
      if Topic.FoundInSearch
         and ( not Topic.ExcludedInSearch ) then
      begin
        Results.Add( Topic );
      end;
    end;
  finally
    // Release the work buffers even if the search raised an
    // exception part way through.
    if TopicRelevancesForTerm <> nil then
      FreeMem( TopicRelevancesForTerm, TopicArraySize );
    if TopicMatches <> nil then
      FreeMem( TopicMatches, TopicArraySize );
    if DictionaryRelevances <> nil then
      FreeMem( DictionaryRelevances, DictionaryArraySize );
  end;
end;

// Splits the leading part off Word and returns it; Word is left
// holding whatever follows. The part is chosen by the first
// character of Word:
//   - a digit (per IsDigit): the whole leading run of digits
//   - a letter (per IsAlpha): the whole leading run of letters
//   - anything else: that single character
// Word is expected to be non-empty. If it is empty (and assertions
// are not compiled in) an empty string is returned and Word is left
// unchanged.
function ExtractNextIPFWordPart( var Word: string ): string;
var
  CharIndex: longint;
begin
  assert( Length( Word ) > 0 );
  if Word = '' then
  begin
    // Assertions are only active when enabled at compile time, so
    // guard explicitly instead of indexing into an empty string.
    Result := '';
    exit;
  end;

  // CharIndex ends up as the position of the first character
  // that does NOT belong to the extracted part.
  CharIndex := 2;
  if IsDigit( Word[ 1 ] ) then
  begin
    // extract string of digits
    while CharIndex <= Length( Word ) do
    begin
      if not IsDigit( Word[ CharIndex ] ) then
        break;
      inc( CharIndex );
    end;
  end
  else if IsAlpha( Word[ 1 ] ) then
  begin
    // extract string of letters
    while CharIndex <= Length( Word ) do
    begin
      if not IsAlpha( Word[ CharIndex ] ) then
        break;
      inc( CharIndex );
    end;
  end;
  // otherwise: extract single non-alphanumeric symbol,
  // CharIndex is already 2

  assert( CharIndex > 1 );
  // Strings are 1-based: take the first CharIndex - 1 characters
  // and remove exactly the same range from Word.
  Result := Copy( Word, 1, CharIndex - 1 );
  Delete( Word, 1, CharIndex - 1 );
end;

Initialization
End.