Character-Level Text Noise-Adding System
A Character-Level Text Noise-Adding System is a text noise-adding system that implements a character-level noise-adding algorithm (which performs random character-level edits) to solve a character-level text noise-adding task.
- Context:
- It can be used to evaluate a Text Error Correction System (such as a character-level TEC system).
 
- Example(s):
- a Python-based Character-Level Noise-Adding System, such as: the one in keras_spell.py.
- …
 
- a Python-based Character-Level Noise-Adding System, such as: the one in 
- Counter-Example(s):
- See: Text String.
References
2018
# CLTnoiseadder: A character-level text noise adding system
# TODO:
# * create a module version
# * support for biased selection based on a character frequency in a corpus.
import sys
import string
import random
import numpy.random
# Probability that any given input character is selected for a noise edit.
noise_level = 0.08
# Once a character is selected: probability that the edit is a swap with the
# following character (only possible when a following character exists).
swap_rate = 0.33
# When the swap is not taken: probability that the edit is a deletion;
# otherwise a randomly drawn character is emitted instead.
delete_rate = 0.33
# Pool of characters from which random noise characters are drawn.
CHARS=string.printable
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # The distance is symmetric, so iterate the longer sequence in the outer
    # loop and keep rows sized by the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    # Turning a sequence into an empty one costs one deletion per element.
    if len(shorter) == 0:
        return len(longer)

    # Row 0: cost of building each prefix of `shorter` from nothing.
    prev_row = list(range(len(shorter) + 1))
    for row_number, left in enumerate(longer, start=1):
        # First column: cost of deleting the first `row_number` elements.
        row = [row_number]
        for col, right in enumerate(shorter):
            via_insert = prev_row[col + 1] + 1
            via_delete = row[col] + 1
            via_substitute = prev_row[col] + (left != right)
            row.append(min(via_insert, via_delete, via_substitute))
        prev_row = row
    return prev_row[-1]
# Per-character sampling probabilities, aligned index-for-index with CHARS.
# Uniform for now: every character gets the same weight 1/len(CHARS).
# TODO: base the distribution on some corpus
character_distribution = [1.0 / len(CHARS)] * len(CHARS)
# Source page: https://www.gabormelli.com/RKB/Character-Level_Text_Noise-Adding_System
# For every line read from stdin: build a noisy copy, print it, then print the
# Levenshtein distance between the original line and its noisy copy.
for original_line in sys.stdin:
    # Start from an empty buffer for each line so that noise produced for
    # earlier lines cannot leak into this line's output or distance.
    noisy_characters = []
    characters = list(original_line)
    i = 0
    while i < len(characters):
        if random.random() < noise_level:
            if i < len(characters) - 1 and random.random() <= swap_rate:
                # swap: emit the next character before the current one and
                # consume both (this increment plus the shared one below).
                noisy_characters.append(characters[i + 1])
                noisy_characters.append(characters[i])
                i += 1
            elif random.random() <= delete_rate:
                # delete: emit nothing; the shared increment below already
                # moves past the current character, so exactly one is dropped.
                pass
            else:
                # add: emit a randomly drawn character.
                # NOTE(review): the current character is not emitted here, so
                # this acts as a substitution rather than an insertion --
                # confirm which one is intended.
                noisy_character = numpy.random.choice(
                    list(CHARS), 1, p=character_distribution)[0]
                noisy_characters.append(noisy_character)
        else:
            # No noise selected: copy the character through unchanged.
            noisy_characters.append(characters[i])
        i += 1
    noisy_line = "".join(noisy_characters)
    # print it
    print(noisy_line)
    # measure the difference
    print(levenshtein(original_line, noisy_line))
2017
def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are attempted in a fixed order: replace a character,
    delete a character, insert a character, transpose two adjacent
    characters. Each edit is applied only when ``rand()`` falls below
    ``amount_of_noise * len(a_string)``, where the length is re-evaluated
    before every step, so an empty string is never edited.

    NOTE(review): ``rand``, ``random_randint``, ``random_choice``, ``CHARS``
    and ``CONFIG`` are not defined in this excerpt; presumably they are the
    ``numpy.random`` helpers and settings of the originating module --
    confirm before reuse.

    Args:
        a_string: The text to corrupt.
        amount_of_noise: Per-character noise factor; it is multiplied by the
            current string length to obtain the threshold for each edit.

    Returns:
        The string with zero or more of the edits applied.
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + random_choice(CHARS[:-1])
                    + a_string[random_char_position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + a_string[random_char_position + 1:])
    if (len(a_string) < CONFIG.max_input_len
            and rand() < amount_of_noise * len(a_string)):
        # Add a random character (only while below the configured maximum)
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + random_choice(CHARS[:-1])
                    + a_string[random_char_position:])
    # A transposition needs an adjacent pair: with fewer than two characters
    # the position range below would be empty, so skip the step.
    if len(a_string) > 1 and rand() < amount_of_noise * len(a_string):
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position]
                    + a_string[random_char_position + 1]
                    + a_string[random_char_position]
                    + a_string[random_char_position + 2:])
    return a_string