# BiomedParse / utilities /dataset.py
# kernel-luso-comfort's picture
# Add Apache License 2.0 header to multiple source files
# 202eff6
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Entity(object):
    """Simple record bundling the fields that describe one entity.

    The constructor stores each argument on the instance unchanged; it
    performs no validation, copying, or conversion.

    Attributes:
        id: Value given as ``_id``.
        text: Value given as ``_text``.
        mask: Value given as ``_mask`` (presumably a segmentation mask --
            confirm against callers).
        interactive: Value given as ``_interactive``.
        type: Value given as ``_type``.
        start_idx: Value given as ``_start_idx`` (presumably the start
            offset of the entity in its source text -- confirm).
        end_idx: Value given as ``_end_idx`` (presumably the matching end
            offset -- confirm).
        image: Value given as ``_image``; ``None`` when not supplied.
    """

    def __init__(self, _id, _text, _mask, _interactive, _type, _start_idx, _end_idx, _image=None):
        # Tuple targets are assigned left to right, so the attributes are
        # created in the same order as the constructor parameters.
        self.id, self.text = _id, _text
        self.mask, self.interactive = _mask, _interactive
        self.type = _type
        self.start_idx, self.end_idx = _start_idx, _end_idx
        self.image = _image
def split_by_ordered_substrings(sentence, substrings):
    """Split ``sentence`` into pieces around ``substrings`` matched in order.

    The substrings are searched for one at a time, each search starting
    right after the previous match. Text lying between matches (and after
    the final match) is kept as its own piece, so concatenating the returned
    pieces reproduces ``sentence``.

    Args:
        sentence: The string to split.
        substrings: Substrings to look for, in the order they are expected
            to occur in ``sentence``.

    Returns:
        A ``(results, substring_indices)`` pair of equal-length lists.
        ``results`` holds the pieces of ``sentence`` in order.
        ``substring_indices[k]`` is the position in ``substrings`` of the
        entry that produced ``results[k]``, or ``None`` when that piece is
        filler text that matched no substring.

    Note:
        A substring that cannot be found at or after the current position
        is skipped silently: it contributes no piece and the search
        position is left unchanged.
    """
    results = []
    substring_indices = []
    cursor = 0  # absolute offset in `sentence` where the next search starts

    for position, substring in enumerate(substrings):
        # Search in place from `cursor` rather than slicing the tail first;
        # this gives the same match without copying the rest of the sentence
        # on every iteration.
        match_start = sentence.find(substring, cursor)
        if match_start == -1:
            # Best-effort: an unmatched substring is simply ignored.
            continue

        # Keep any text before the match (including whitespace) as filler.
        if match_start > cursor:
            results.append(sentence[cursor:match_start])
            substring_indices.append(None)

        results.append(substring)
        substring_indices.append(position)
        cursor = match_start + len(substring)

    # Whatever trails the last match is filler as well.
    if cursor < len(sentence):
        results.append(sentence[cursor:])
        substring_indices.append(None)

    return results, substring_indices