"""
Converts data from brat to con. Enter input and output directories as command line arguments.
Each '.ann' file must have a '.txt' file in the same directory with the same name, minus the extension.
Use '-c' (without quotes) as an optional final command-line argument to copy the text files used
in the conversion process to the output directory.
Also possible to import 'convert_brat_to_con()' directly and pass the paths to the ann and txt files
for individual conversion.
:author: Steele W. Farnsworth
:date: 30 December, 2018
"""
from sys import argv as cmd_arg
from re import split
import os
import shutil
def check_valid_line(item: str):
    """
    Cheap, non-comprehensive test for whether a line looks like a brat annotation line.
    :param item: The candidate line; anything that is not a str is rejected.
    :return: True only if item is a str that contains a tab and does not start with '#'; False otherwise.
    """
    # An empty string contains no tab, so it is rejected by the tab test without a separate check.
    return isinstance(item, str) and '\t' in item and not item.startswith("#")
def line_to_dict(item):
    """
    Converts a string that is a line in brat format to a dictionary representation of that data.
    Keys are: T; data_type; start_ind; end_ind; data_item.
    The line is split on tabs, and its second tab-separated field is split on single spaces.
    Trailing whitespace is stripped from every piece, and the two indices are converted to int.
    :param item: The line of brat text (str), e.g. "T1<tab>drug 10 17<tab>aspirin".
    :return: The dictionary containing that data.
    :raises IndexError: If the line has too few fields.
    :raises ValueError: If the start or end index is not an integer.
    """
    tab_fields = item.split("\t")
    # Flatten to [id, type, start, end, ..., text] so the pieces can be read positionally
    pieces = [tab_fields[0]]
    pieces.extend(tab_fields[1].split(" "))
    pieces.append(tab_fields[2])
    cleaned = [piece.rstrip() for piece in pieces]  # remove trailing whitespace
    return {
        "T": cleaned[0],
        "data_type": cleaned[1],
        "start_ind": int(cleaned[2]),
        "end_ind": int(cleaned[3]),
        "data_item": cleaned[4],
    }
def switch_extension(name, ext):
    """
    Primarily for internal use.
    Replaces the extension of a file name with another one.
    :param name: The file name or path (str) whose extension is to be replaced
    :param ext: The new extension (str), including the leading dot, ex. ".txt"
    :return: name with its final extension (if any) removed and ext appended
    """
    stem, _old_ext = os.path.splitext(name)
    return stem + ext
def find_line_num(text_, start):
    """
    Counts the newline characters that precede a character offset.
    :param text_: The text of the file, ex. f.read()
    :param start: The index at which the desired text starts (anything int() accepts)
    :return: The line index (starting at 0) containing the given start index
    """
    offset = int(start)
    # Bounded count is interpreted like the slice text_[0:offset], without building the slice
    return text_.count("\n", 0, offset)
def get_relative_index(text_: str, line_, absolute_index):
    """
    Takes the index of a phrase (the phrase itself is not a parameter) relative to the start of its
    file and returns its index relative to the start of the line that it's on.
    If line_ is the text of the line that contains absolute_index, that occurrence is used, so
    repeated, empty, or very short lines are handled correctly. Otherwise the first occurrence of
    line_ anywhere in text_ is used.
    :param text_: The text of the file, not separated by lines
    :param line_: The text of the line being searched for
    :param absolute_index: The index of a given phrase (anything int() accepts)
    :return: The index of the phrase relative to the start of the line
    :raises ValueError: If line_ does not occur in text_
    """
    position = int(absolute_index)
    # The containing line starts one past the previous newline (rfind gives -1, hence 0, on the first line)
    line_start = text_.rfind("\n", 0, max(position, 0)) + 1
    if not text_.startswith(line_, line_start):
        # line_ is not the line at this position; fall back to a plain search for its first occurrence
        line_start = text_.index(line_)
    return position - line_start
def get_end_word_index(data_item: str, start_index, end_index):
    """
    Returns the index of the first char of the last word of data_item.
    Words are delimited by single spaces.
    :param data_item: The annotated phrase
    :param start_index: The index of the first char of the phrase
    :param end_index: The index just past the last char of the phrase
    :return: start_index if the phrase contains no space, otherwise end_index minus the length
        of the text after the final space
    """
    _head, space, last_word = data_item.rpartition(" ")
    if not space:
        # Only one word, so the start of the first word is also the start of the last word
        return start_index
    return end_index - len(last_word)
def convert_brat_to_con(brat_file_path, text_file_path=None):
    """
    Takes a path to a brat file and returns a string representation of that file converted to the con format.
    Only lines that line_to_dict() can parse are converted. Any other line that passes check_valid_line()
    (for example one with too few fields or with non-integer indices) is skipped rather than aborting
    the conversion.
    :param brat_file_path: The path to the brat file; not the file itself. If the path is not valid, the argument
        will be assumed to be text of the brat file itself.
    :param text_file_path: The path to the text file; if not provided, assumed to be a file with the same path as
        the brat file ending in '.txt' instead of '.ann'. If neither file is found, raises error.
    :return: A string (not a file) of the con equivalent of the brat file.
    :raises FileNotFoundError: If the text file cannot be found.
    """
    # By default, find txt file with equivalent name
    if text_file_path is None:
        text_file_path = switch_extension(brat_file_path, ".txt")
        if not os.path.isfile(text_file_path):
            raise FileNotFoundError("No text file path was provided and no matching text file was found in the input"
                                    " directory")
    # Otherwise use the path passed to the function, which must exist
    elif not os.path.isfile(text_file_path):
        raise FileNotFoundError("No text file path was provided or the file was not found."
                                " Note that direct string input of the source text is not supported.")

    with open(text_file_path, 'r') as text_file:
        text = text_file.read()
    text_lines = text.split('\n')

    # If brat_file_path is actually a path, read the file
    if os.path.isfile(brat_file_path):
        with open(brat_file_path, 'r') as brat_file:
            brat_text = brat_file.read()
    else:  # Else, read whatever string is passed to the function as if it were the file itself
        brat_text = brat_file_path

    con_lines = []  # Collected and joined once at the end
    for line in brat_text.split('\n'):
        if not check_valid_line(line):
            continue
        try:
            d = line_to_dict(line)
        except (IndexError, ValueError):
            # The line passed the basic check but does not have the "id<TAB>type start end<TAB>text"
            # shape that line_to_dict requires; skip it instead of crashing.
            continue

        start_line_num = find_line_num(text, d["start_ind"])
        start_char_num = get_relative_index(text, text_lines[start_line_num], d["start_ind"])
        start_str = str(start_line_num + 1) + ':' + str(start_char_num)

        # Note that the end word has an extra calculation because the index of the first char
        # of the last word is what is needed, not the last char of the last word.
        end_line_num = find_line_num(text, d["end_ind"])
        end_char_num = get_relative_index(text, text_lines[end_line_num], d["end_ind"])
        end_word_num = get_end_word_index(d["data_item"], start_char_num, end_char_num)
        end_str = str(end_line_num + 1) + ':' + str(end_word_num)

        con_lines.append("c=\"%s\" %s %s||t=\"%s\"\n" % (d["data_item"], start_str, end_str, d['data_type']))

    return "".join(con_lines)
if __name__ == '__main__':
    # Command-line driver: converts every '.ann' file in the input directory that has a matching
    # '.txt' file, writing one '.con' file per '.ann' file into the output directory.

    # Command-line arguments must be provided for the input and output directories.
    # Else, prints instructions and aborts the program.
    if len(cmd_arg) < 3:
        print("Please run the program again, entering the input and output directories as command-line arguments"
              " in that order. Optionally, enter '-c' as a final command line argument if you want to copy"
              " the text files used in the conversion over to the output directory.")
        # Same exit status as exit(), without relying on the helper installed by the site module
        raise SystemExit

    # Keep prompting until an existing directory is given. Checking isdir() up front also covers
    # a path that exists but is a regular file, which listdir() would reject with NotADirectoryError.
    input_dir_name = cmd_arg[1]
    while not os.path.isdir(input_dir_name):
        input_dir_name = input("Input directory not found; please try another directory:")
    input_dir = os.listdir(input_dir_name)

    output_dir_name = cmd_arg[2]
    while not os.path.isdir(output_dir_name):
        output_dir_name = input("Output directory not found; please try another directory:")

    # Create a list of only the .txt files in the input directory
    text_files = [f for f in input_dir if f.endswith(".txt")]
    text_file_names = set(text_files)
    # Create a list of all .ann files in the input directory that have a txt equivalent
    ann_files = [f for f in input_dir if f.endswith(".ann") and switch_extension(f, ".txt") in text_file_names]

    for input_file_name in ann_files:
        full_file_path = os.path.join(input_dir_name, input_file_name)
        output_file_name = switch_extension(input_file_name, ".con")
        content = convert_brat_to_con(full_file_path)
        # Overwrite rather than append, so re-running the conversion does not duplicate annotations
        # in a '.con' file left over from a previous run.
        with open(os.path.join(output_dir_name, output_file_name), "w") as output_file:
            output_file.write(content)

    # Paste all the text files used in the conversion process to the output directory
    # if there's a fourth command line argument and that argument is -c
    if len(cmd_arg) == 4 and cmd_arg[3] == "-c":
        ann_file_names = set(ann_files)
        text_files_with_match = [f for f in text_files if switch_extension(f, ".ann") in ann_file_names]
        for f in text_files_with_match:
            full_name = os.path.join(input_dir_name, f)
            shutil.copy(full_name, output_dir_name)