How to resolve the algorithm Burrows–Wheeler transform step by step in the Pascal programming language

Published on 12 May 2024 09:40 PM

How to resolve the algorithm Burrows–Wheeler transform step by step in the Pascal programming language

Table of Contents

Problem Statement

The Burrows–Wheeler transform (BWT, also called block-sorting compression) rearranges a character string into runs of similar characters. This is useful for compression, since it tends to be easy to compress a string that has runs of repeated characters by techniques such as move-to-front transform and run-length encoding. More importantly, the transformation is reversible, without needing to store any additional data. The BWT is thus a "free" method of improving the efficiency of text compression algorithms, costing only some extra computation.

Source: Burrows–Wheeler transform

Let's start with the solution:

Step by Step solution about How to resolve the algorithm Burrows–Wheeler transform step by step in the Pascal programming language

Source code in the pascal programming language

program BurrowsWheeler;

{$mode objfpc}{$H+}  // Lazarus default mode; long strings
uses SysUtils;       // only for console output
const STR_BASE = 1;  // first character in a Pascal string has index [1].
type TComparison = -1..1;

procedure Encode( const input : string;
                  out encoded : string;
                  out index : integer);
var
  n : integer;
  perm : array of integer;
  i, j, k : integer;
  incr, v : integer;

        // Subroutine to compare rotations whose *last* letters have zero-based
        //  indices a, b. Returns 1, 0, -1 according as the rotation ending at a
        //  is >, =, < the rotation ending at b.
        function CompareRotations( a, b : integer) : TComparison;
        var
          p, q, nrNotTested : integer;
        begin
          result := 0;
          p := a;
          q := b;
          nrNotTested := n;
          repeat
            inc(p); if (p = n) then p := 0;
            inc(q); if (q = n) then q := 0;
            if (input[p + STR_BASE] = input[q + STR_BASE]) then dec( nrNotTested)
            else if (input[p + STR_BASE] > input[q + STR_BASE]) then result := 1
            else result := -1
          until (result <> 0) or (nrNotTested = 0);
        end;
begin
  n := Length( input);
  SetLength( perm, n);
  for j := 0 to n - 1 do perm[j] := j;

  // Sort string indices by comparing the associated rotations, as above.
  // This is a Shell sort from Press et al., Numerical Recipes, 3rd edn, pp 422-3.
  // Other sorting algorithms might be used.
  incr := 1;
  repeat
    incr := 3*incr + 1
  until (incr >= n);
  repeat
    incr := incr div 3;
    for i := incr to n - 1 do begin
      v := perm[i];
      j := i;
      while (j >= incr) and (CompareRotations( perm[j - incr], v) = 1) do begin
        perm[j] := perm[j - incr];
        dec( j, incr);
      end;
      perm[j] := v;
    end; // for
  until (incr = 1);

  // Apply the sorted array to create the output.
  SetLength( encoded, n);
  for j := 0 to n - 1 do begin
    k := perm[j];
    encoded[j + STR_BASE] := input[k + STR_BASE];
    if (k = n - 1) then index := j;
  end;
end;

{------------------------------------------------------------------------------
Given an encoded string and the associated index, one way to rebuild
the original string is to do the following, or its equivalent:

Given        Make an array     Sort the array    Rebuild the original string
'NNBAAA'     [0] = ('N', 0)    [0] = ('A', 3)    Start with given index 3
index = 3    [1] = ('N', 1)    [1] = ('A', 4)    [3] gives 'B', next index = 2
             [2] = ('B', 2)    [2] = ('A', 5)    [2] gives 'A', next index = 5
             [3] = ('A', 3)    [3] = ('B', 2)    [5] gives 'N', next index = 1
             [4] = ('A', 4)    [4] = ('N', 0)    [1] gives 'A', next index = 4
             [5] = ('A', 5)    [5] = ('N', 1)    [4] gives 'N', next index = 0
                                                 [0] gives 'A', next index = 3
                                                 3 = start index, so stop
                                                 Result = 'BANANA'

If the original string consists of two or more repetitions of a substring,
  the above method will stop when that substring has been built, e.g.
  'CANCAN' will stop at 'CAN'.
We therefore need to test for the rebuilt string being too short, and if so
  make enough copies of the decoded part to fill the required length.

It's possible to take the above description literally, and write a decoding
  routine that uses a record type consisting of a character and an integer.
A more efficient way is to create an integer array containing only the indices,
  in the above example (3, 4, 5, 2, 0, 1). A first pass counts the occurrences
  of each character in the encoded string. If the character set is ['A'..'Z']
  then the indices associated with 'A' are stored from [0]. If 'A' occurs a times,
  the indices associated with 'B' are stored from [a]; if 'B' occurs b times,
  the indices associated with 'C' are stored from [a + b]; and so on.
}
function Decode( encoded : string;
                 index : integer) : string;
var
  charInfo : array [char] of integer;
  perm : array of integer;
  n, j, k : integer;
  c : char;
  total, prev : integer;

begin
  n := Length( encoded);
  // An empty encoded string will crash the code below, so trap it here.
  if (n = 0) then begin
    result := '';
    exit;
  end;

  // Count the occurrences of each possible character.
  for c := Low(char) to High(char) do charInfo[c] := 0;
  for j := 0 to n - 1 do begin
    c := encoded[j + STR_BASE];
    inc( charInfo[c]);
  end;

  // Cumulate, i.e. charInfo[k] := sum of old charInfo from 0 to k - 1
  total := 0;
  prev := 0;
  for c := Low(char) to High(char) do begin
    inc( total, prev);
    prev := CharInfo[c];
    charInfo[c] := total;
  end;

  // Make the array "perm"
  SetLength( perm, n);
  for j := 0 to n - 1 do begin
    c := encoded[j + STR_BASE];
    k := charInfo[c];
    perm[k] := j;
    inc( charInfo[c]);
  end;

  // Apply the array "perm" to re-create the original string.
  SetLength( result, n);
  k := 0; // index into result
  j := index;
  repeat
    j := perm[j];
    result[k + STR_BASE] := encoded[j + STR_BASE];
    inc(k);
  until (j = index);

  // If the original consisted of M repetitions of the same string, then
  //   at this point exactly 1/M of the result has been filled in.
  // For M > 1 (shown by k < n), complete the result by copying the first part.
  if (k < n) then begin
    Assert( n mod k = 0); // we should have n = M*k
    for j := k to n - 1 do result[j + STR_BASE] := result[j - k + STR_BASE];
  end;
end;

procedure Test( const s : string);
var
  encoded, decoded : string;
  index : integer;
begin
  WriteLn( '');
  WriteLn( '     ' + s);
  Encode( s, {out} encoded, index);
  WriteLn( '---> ' + encoded);
  WriteLn( '       index = ' + SysUtils.IntToStr( index));
  decoded := Decode( encoded, index);
  WriteLn( '---> ' + decoded);
end;

begin
  Test( 'BANANA');
  Test( 'CANAAN');
  Test( 'CANCAN');
  Test( 'appellee');
  Test( 'dogwood');
  Test( 'TO BE OR NOT TO BE OR WANT TO BE OR NOT?');
  Test( 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES');
end.


  

You may also check:How to resolve the algorithm Comments step by step in the Squirrel programming language
You may also check:How to resolve the algorithm Distribution of 0 digits in factorial series step by step in the Raku programming language
You may also check:How to resolve the algorithm Hello world/Text step by step in the Clay programming language
You may also check:How to resolve the algorithm Associative array/Iteration step by step in the PostScript programming language
You may also check:How to resolve the algorithm System time step by step in the Maxima programming language