How to resolve the algorithm Ukkonen’s suffix tree construction step by step in the Wren programming language

Published on 12 May 2024 09:40 PM

How to resolve the algorithm Ukkonen’s suffix tree construction step by step in the Wren programming language

Table of Contents

Problem Statement

Suffix Trees are very useful in numerous string processing and computational biology problems. The task is to create a function which implements Ukkonen’s algorithm to create a useful Suffix Tree as described: Using Arithmetic-geometric mean/Calculate Pi generate the first 1000, 10000, and 100000 decimal places of pi. Using your implementation with an alphabet of 0 through 9 (plus $ say to make the tree explicit) find the longest repeated string in each list. Time your results and demonstrate that your implementation is linear (i.e. that 10000 takes approx. 10 times as long as 1000). You may vary the size of the lists of decimal places of pi to give reasonable answers.

Let's start with the solution:

Step by Step solution about How to resolve the algorithm Ukkonen’s suffix tree construction step by step in the Wren programming language

Source code in the wren programming language

import "/big" for BigRat
import "/dynamic" for Struct
import "/trait" for ByRef
import "io" for File

var maxChar = 128

var Node = Struct.create("Node", ["children", "suffixLink", "start", "pEnd", "suffixIndex"])

var text                 = ""
var root                 = null
var lastNewNode          = null
var activeNode           = null
var activeEdge           = -1
var activeLength         = 0
var remainingSuffixCount = 0
var pLeafEnd             = ByRef.new(-1)
var pRootEnd             = null
var pSplitEnd            = null
var size                 = -1

var newNode = Fn.new { |start, pEnd|
    var children = List.filled(maxChar, null)
    var suffixLink = root
    var suffixIndex = -1
    return Node.new(children, suffixLink, start, pEnd, suffixIndex)
}

var edgeLength = Fn.new { |n|
    if (n == root) return 0
    return n.pEnd.value - n.start + 1
}

var walkDown = Fn.new { |currNode|
    var el = edgeLength.call(currNode)
    if (activeLength >= el) {
        activeEdge = activeEdge + el
        activeLength = activeLength - el
        activeNode = currNode
        return true
    }
    return false
}

var extendSuffixTree = Fn.new { |pos|
    pLeafEnd.value = pos
    remainingSuffixCount = remainingSuffixCount + 1
    lastNewNode = null
    while (remainingSuffixCount > 0) {
        if (activeLength == 0) activeEdge = pos
        if (!activeNode.children[text[activeEdge].bytes[0]]) {
            activeNode.children[text[activeEdge].bytes[0]] = newNode.call(pos, pLeafEnd)
            if (lastNewNode) {
                lastNewNode.suffixLink = activeNode
                lastNewNode = null
            }
        } else {
            var next = activeNode.children[text[activeEdge].bytes[0]]
            if (walkDown.call(next)) continue
            if (text[next.start + activeLength] == text[pos]) {
                if (lastNewNode && activeNode != root) {
                    lastNewNode.suffixLink = activeNode
                    lastNewNode = null
                }
                activeLength = activeLength + 1
                break
            }
            var temp = next.start + activeLength - 1
            pSplitEnd = ByRef.new(temp)
            var split = newNode.call(next.start, pSplitEnd)
            activeNode.children[text[activeEdge].bytes[0]] = split
            split.children[text[pos].bytes[0]] = newNode.call(pos, pLeafEnd)
            next.start = next.start + activeLength
            split.children[text[next.start].bytes[0]] = next
            if (lastNewNode) lastNewNode.suffixLink = split
            lastNewNode = split
        }
        remainingSuffixCount = remainingSuffixCount - 1
        if (activeNode == root && activeLength > 0) {
            activeLength = activeLength - 1
            activeEdge = pos - remainingSuffixCount + 1
        } else if (activeNode != root) {
            activeNode = activeNode.suffixLink
        }
    }
}

var setSuffixIndexByDFS  // recursive
setSuffixIndexByDFS = Fn.new { |n, labelHeight|
    if (!n) return
    if (n.start != -1) {
        // Uncomment line below to print suffix tree
        // System.write(text[n.start..n.pEnd.value])
    }
    var leaf = 1
    for (i in 0...maxChar) {
        if (n.children[i]) {
            // Uncomment the 3 lines below to print suffix index
            // if (leaf == 1 && n.start != -1) {
            //    System.print(" [%(n.suffixIndex)]")
            // }
            leaf = 0
            setSuffixIndexByDFS.call(n.children[i], labelHeight + edgeLength.call(n.children[i]))
        }
    }
    if (leaf == 1) {
        n.suffixIndex = size - labelHeight
        // Uncomment line below to print suffix index
        // System.print(" [%(n.suffixIndex)]")
    }
}

var buildSuffixTree = Fn.new {
    size = text.count
    var temp = -1
    pRootEnd = ByRef.new(temp)
    root = newNode.call(-1, pRootEnd)
    activeNode = root
    for (i in 0...size) extendSuffixTree.call(i)
    var labelHeight = 0
    setSuffixIndexByDFS.call(root, labelHeight)
}

var doTraversal  // recursive
doTraversal = Fn.new { |n, labelHeight, pMaxHeight, pSubstringStartIndex|
    if (!n) return
    if (n.suffixIndex == -1) {
        for (i in 0...maxChar) {
            if (n.children[i]) {                        
                doTraversal.call(n.children[i], labelHeight + edgeLength.call(n.children[i]),
                    pMaxHeight, pSubstringStartIndex)
             }
        }
    } else if (n.suffixIndex > -1 && (pMaxHeight.value < labelHeight - edgeLength.call(n))) {
        pMaxHeight.value = labelHeight - edgeLength.call(n)
        pSubstringStartIndex.value = n.suffixIndex
    }
}

var getLongestRepeatedSubstring = Fn.new { |s|
    var maxHeight = 0
    var substringStartIndex = 0
    var pMaxHeight = ByRef.new(maxHeight)
    var pSubstringStartIndex = ByRef.new(substringStartIndex)
    doTraversal.call(root, 0, pMaxHeight, pSubstringStartIndex)
    maxHeight = pMaxHeight.value
    substringStartIndex = pSubstringStartIndex.value
    // Uncomment line below to print maxHeight and substringStartIndex
    // System.print("maxHeight %(maxHeight), substringStartIndex %(substringStartIndex)")
    if (s == "") {
        System.write("  %(text) is: ")
    } else {
        System.write("  %(s) is: ")
    }
    var k = 0
    while (k < maxHeight) {
        System.write(text[k + substringStartIndex])
        k = k + 1
    }
    if (k == 0) {
        System.write("No repeated substring")
    }
    System.print()
}

var tests = [
    "GEEKSFORGEEKS$",
    "AAAAAAAAAA$",
    "ABCDEFG$",
    "ABABABA$",
    "ATCGATCGA$",
    "banana$",
    "abcpqrabpqpq$",
    "pqrpqpqabab$",
]
System.print("Longest Repeated Substring in:\n")
for (test in tests) {
    text = test
    buildSuffixTree.call()
    getLongestRepeatedSubstring.call("")
}
System.print()

// load pi to 100,182 digits
var piStr = File.read("pi_100000.txt")
piStr = piStr[2..-1] // remove initial 3.
var numbers = [1e3, 1e4, 1e5]
maxChar = 58
for (number in numbers) {
    var start = System.clock
    text = piStr[0...number] + "$"
    buildSuffixTree.call()
    getLongestRepeatedSubstring.call("first %(number) d.p. of Pi")
    var elapsed = (System.clock - start) * 1000
    System.print("  (this took %(elapsed) ms)\n")
}

  

You may also check:How to resolve the algorithm Random number generator (included) step by step in the REXX programming language
You may also check:How to resolve the algorithm Greatest element of a list step by step in the Frink programming language
You may also check:How to resolve the algorithm String concatenation step by step in the Ada programming language
You may also check:How to resolve the algorithm Balanced ternary step by step in the Wren programming language
You may also check:How to resolve the algorithm Substring step by step in the Component Pascal programming language