Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import argparse
import os # for file walks
import sys # for error handling
import subprocess # to call grep utility
import functools # to bind an argument to a function (necessary for collect_callback to get a valid filename)
import hashlib # to compute the md5sum for the given ID
import magic # to determine a file's type not by extension but by its content. mimetype won't work, it cannot recognize DICOM files
import pydicom # to process dicom images
from pathlib import Path
# Module-level accumulators shared by the scanning functions in this file.
# The three report dictionaries use the nested layout described in
# print_report: {category: {value: set_of_filenames}}.
SENSITIVE_DATA = set()  # strings to look for: subject ID, grep patterns, collected person names
PNs_occured = dict()  # findings inside DICOM data elements (filled by collect_callback)
G_occured = dict()  # findings from grepping non-DICOM files (filled by handle_other)
F_occured = dict()  # findings inside file and directory names (filled by handle_fnames)
def md5sum_id():
    """Return the hexadecimal MD5 digest of the subject ID.

    The ID is taken from the module-level parsed arguments (``args.id``)
    and encoded as UTF-8 before hashing.
    """
    encoded_id = args.id.encode('utf-8')
    digest = hashlib.md5(encoded_id)
    return str(digest.hexdigest())
def add_entry( category, value, filename, report ):
    """Record that *value* was found under *category* in *filename*.

    :param category: top-level key of the report (e.g. a DICOM element description)
    :param value: the value that was found
    :param filename: file in which the value was found
    :param dict report: nested mapping ``{category: {value: set_of_filenames}}``,
        updated in place

    When ``args.verbose`` is set, the finding is also printed immediately.
    """
    files_by_value = report.setdefault(category, {})
    files_by_value.setdefault(value, set()).add(filename)
    if args.verbose:
        print(f'{filename}-{category}: {value}')
def collect_callback(fname, dataset, data_element):
    """Inspect one DICOM data element and record sensitive content.

    Meant to be used with ``dataset.walk`` after binding *fname* (see
    collect_sensitive_from_dicom).

    * A non-empty element with value representation "PN" (person name) is
      added to SENSITIVE_DATA and reported in PNs_occured.
    * Any other element is reported in PNs_occured when its string value
      contains one of ``args.grep_patterns`` (case-insensitive substring).

    Elements without a value representation are ignored.

    :param fname: name of the file the element belongs to
    :param dataset: dataset being walked (unused here)
    :param data_element: the element to inspect
    """
    if not data_element.VR:
        return

    # DICOM tags every value representation with a code; "PN" carries person names.
    if data_element.VR == "PN" and data_element.value:
        SENSITIVE_DATA.add(str(data_element.value))
        add_entry(str(data_element.description()), str(data_element.value), fname, PNs_occured)
        return

    for needle in args.grep_patterns:
        if needle.lower() in str(data_element.value).lower():
            add_entry(str(data_element.description()), str(data_element.value), fname, PNs_occured)
def remove_person_names_callback(dataset, data_element):
    '''Overwrite a non-empty person-name element with the hashed subject ID.

    Meant to be passed to ``dataset.walk``. Only elements whose value
    representation is "PN" and whose value is non-empty are modified; their
    value is replaced by the result of md5sum_id().

    :param dataset: dataset being walked (unused here)
    :param data_element: the element to inspect and possibly overwrite
    '''
    # DICOM tags every value representation with a code; "PN" carries person names.
    if data_element.VR != "PN" or not data_element.value:
        return
    data_element.value = md5sum_id()
    # TODO check also if sensitive data needs to be replaced, e.g. data in SENSITIVE_DATA
def print_report( report, dir ):
    '''
    Print the findings of a report, grouped by category and value.

    :param dict report: findings in the nested form::

            {
                'Operators Name' : { 'Foo Bar' : {file_a, file_b, ...},
                                     'Bar Foo' : {file_c, file_d},
                                   },
                'Physicians Name' : { ... },
            }

    :param path dir: directory the printed file paths are made relative to

    Categories whose only value is empty are skipped, as are empty values
    inside a category. With ``args.verbose`` every file is listed; otherwise
    the value is truncated to 50 characters and only one file plus the
    number of files is shown.
    '''
    MAX_LEN = 50
    for dicom_tag, values in report.items():
        only_value_is_empty = len(values) == 1 and not next(iter(values))
        if only_value_is_empty:
            continue
        print(f'{dicom_tag}:')
        for person, fileset in values.items():
            if not person:
                continue
            if args.verbose:
                prefix = repr(person) + ' => ['
                # continuation lines are padded so the paths line up under the first one
                separator = ',\n\t' + (' ' * len(prefix))
                relative_paths = [os.path.relpath(f, dir) for f in fileset]
                print('\t' + prefix + separator.join(relative_paths) + ']')
            else:
                text = str(person)
                shortened = (text[:MAX_LEN] + '..') if len(text) > MAX_LEN else text
                prefix = repr(shortened) + ' => ['
                print('\t' + prefix + f'{os.path.relpath(next(iter(fileset)), dir)}, ...] #{len(fileset)}')
def for_each_file( dir, apply ):
    '''Call *apply* for every file below a directory, subdirectories included.

    :param path dir: directory to start from
    :param func apply: called as ``apply(path, filetype)`` where *filetype* is
        the description returned by ``magic.from_file`` for that path
    '''
    for root, _subdirs, files in os.walk(dir):
        for name in files:
            path = os.path.join(root, name)
            # ATTENTION DICOMDIR and DICOM images are both recognized as DICOM files
            filetype = magic.from_file(path)
            apply(path, filetype)
def remove_sensitive_from_dicom( fname, ftype ):
    """Overwrite person names in a DICOM file and save it in place.

    Files are processed only when *ftype* identifies DICOM imaging data and
    the path does not contain 'DICOMDIR'.

    :param fname: path of the file
    :param ftype: file type description as passed by for_each_file
    """
    if "DICOM medical imaging data" not in ftype or 'DICOMDIR' in fname:
        return
    dicom_data = pydicom.dcmread(fname)
    dicom_data.walk(remove_person_names_callback)
    dicom_data.save_as(fname)
def collect_sensitive_from_dicom(fname, ftype):
    """Walk a DICOM file and collect its sensitive values via collect_callback.

    Files are processed only when *ftype* identifies DICOM imaging data and
    the path does not contain 'DICOMDIR'.

    :param fname: path of the file
    :param ftype: file type description as passed by for_each_file
    """
    if "DICOM medical imaging data" not in ftype or 'DICOMDIR' in fname:
        return
    dicom_data = pydicom.dcmread(fname)
    # The dataset handed to the callback during the walk might not carry a
    # filename, so the filename of the file that was read is bound up front
    # as the callback's first argument.
    callback = functools.partial(collect_callback, dicom_data.filename)
    dicom_data.walk(callback)
def grep_file(pattern, file):
    """Return True if *file* contains *pattern* as a literal string, ignoring case.

    The search is delegated to the ``grep`` utility. The pattern is passed as
    a fixed string (``-F``), matching the literal substring tests used for
    DICOM values and file names elsewhere in this file. Without ``-F`` a name
    containing regex metacharacters could produce false hits or make grep
    fail, which would be silently reported as "not found".

    :param str pattern: text to search for; an empty pattern never matches
    :param file: path of the file to search
    :return: True when grep reports a match, False otherwise (including when
        grep itself fails, e.g. because the file is unreadable)
    """
    if not pattern:
        # an empty pattern would match every line of every file
        return False
    exit_code = subprocess.call(
        # -q: stop at the first match; -e/--: keep a pattern or path that
        # starts with '-' from being parsed as an option
        ['grep', '-i', '-q', '-F', '-e', pattern, '--', file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return exit_code == 0
def read_sensitive():
    """Read previously saved sensitive strings for the current subject.

    The strings are read from ``~/.dp.<id>`` (``<id>`` being ``args.id``),
    one entry per line. Blank lines are skipped, because an empty entry
    would match every value and every file name.

    :return: set of the entries read; an empty set when the file cannot be
        read (an error message is then printed to stderr)
    """
    filename = Path.home() / ('.dp.' + args.id)
    try:
        with open(filename, 'r') as f:
            return {line for line in f.read().splitlines() if line.strip()}
    except IOError:
        # report the path that failed, not the boolean command line flag
        print(f'Error open file: {filename}', file=sys.stderr)
        return set()
def handle_other( fname, ftype ):
    """Search a non-DICOM file for every known sensitive string.

    DICOM files (and anything with 'DICOMDIR' in its path) are skipped
    silently. Archives and compressed files are skipped with a warning on
    stderr. Every other file is grepped for each entry of SENSITIVE_DATA and
    hits are recorded in G_occured under 'Grep-Results'.

    :param fname: path of the file
    :param ftype: file type description as passed by for_each_file
    """
    is_dicom = 'DICOMDIR' in fname or 'DICOM medical imaging data' in ftype
    if is_dicom:
        return

    is_packed = 'compressed data' in ftype or 'archive data' in ftype
    if is_packed:
        print(f"Warning: skipping archive or compressed file: {fname}", file=sys.stderr)
        return

    for sdata in SENSITIVE_DATA:
        found = grep_file( sdata, fname)
        if found:
            add_entry('Grep-Results', sdata, fname, G_occured )
def wipe_sdata( fname, sdata ):
    """Return *fname* with every sensitive token overwritten by zeros.

    Each occurrence of a token is replaced by a run of '0' of the same
    length, so the length of the name is preserved.

    Tokens are applied longest first (ties broken alphabetically). If a
    shorter token such as "John" were replaced before "John Doe", the longer
    token would no longer match and " Doe" would remain in the name.

    :param str fname: file or directory name to clean
    :param sdata: iterable of sensitive strings; empty strings are ignored
    :return: the cleaned name
    """
    for token in sorted(sdata, key=lambda t: (-len(t), t)):
        if token:
            fname = fname.replace(token, '0' * len(token))
    return fname
def _handle_sensitive_name(root, name):
    """Rename (when cleaning) or report one file/directory name that contains sensitive data."""
    matches = [token for token in SENSITIVE_DATA if token and token in name]
    if not matches:
        return
    old_path = os.path.join(root, name)
    if not args.clean:
        # report the longest (most specific) token that actually occurs in the name
        sdata = min(matches, key=lambda token: (-len(token), token))
        add_entry('Filenames', sdata, old_path, F_occured)
        return
    new_path = os.path.join(root, wipe_sdata(name, SENSITIVE_DATA))
    if new_path == old_path:
        return
    if os.path.lexists(new_path):
        # os.rename may silently replace an existing file; never lose data
        print(f'Warning: not renaming {old_path}: {new_path} already exists', file=sys.stderr)
        return
    os.rename(old_path, new_path)


def handle_fnames(dir):
    """Find sensitive data in file and directory names below *dir*.

    With ``args.clean`` the affected files and directories are renamed, the
    sensitive parts being overwritten with zeros (see wipe_sdata). Otherwise
    each affected path is recorded in F_occured under 'Filenames'.

    :param path dir: directory to start from
    """
    # Bottom-up walk: a directory is renamed only after everything below it
    # has been visited, so the paths produced by os.walk stay valid.
    for root, dirs, files in os.walk(dir, topdown=False):
        # Attention: Files first, because they reside in directories which might be renamed
        for fname in files:
            _handle_sensitive_name(root, fname)
        for subdir in dirs:
            _handle_sensitive_name(root, subdir)
def process_dir( dir ):
    """
    Scan (and with ``args.clean``, clean) a directory tree for sensitive data.

    Steps performed:

    1. Seed SENSITIVE_DATA with the subject ID, the grep patterns and, with
       ``args.read_sensitive``, the entries saved by a previous run.
    2. Cycle through dir:
       2.1 handle all DICOM files: collect person names and pattern hits,
           then either report them or, when cleaning, overwrite person names
       2.2 handle all other files by grepping for the collected sensitive
           data; archives and compressed files are skipped with a warning
       2.3 search for sensitive data inside file and directory names too
    3. Save SENSITIVE_DATA to ``~/.dp.<id>``.

    NOTE: extracting archives / compressed files (including protection
    against compression bombs and nested compression) is not implemented.

    :param path dir: directory to process
    """
    target = os.path.abspath(dir)
    print('process: ' + target)

    SENSITIVE_DATA.add(args.id)
    if args.grep_patterns:
        SENSITIVE_DATA.update(args.grep_patterns)
    if args.read_sensitive:
        SENSITIVE_DATA.update(read_sensitive())

    # 2.1 collection is skipped only when saved data is read and no extra
    # patterns were given: (not read_sensitive) or grep_patterns
    if not args.read_sensitive or args.grep_patterns:
        for_each_file( target, collect_sensitive_from_dicom )
    if args.clean:
        for_each_file( target, remove_sensitive_from_dicom )
    else:
        print_report( PNs_occured, dir )

    # 2.2 reprocess all non DICOM images, archives and compressed files are ignored and warned
    for_each_file( target, handle_other )
    # NOTE: here removal is non trivial, this needs to be done manually, depending on the filetype
    print_report( G_occured, dir )

    # 2.3 search for sensitive data in subdirectory names and filenames
    handle_fnames( target )
    print_report( F_occured, dir )

    # 3. save the collected data for later runs.
    # NOTE(review): this stores person names in plain text in the home directory.
    filename = Path.home() / ('.dp.' + args.id)
    try:
        with open(filename, 'w') as f:
            f.writelines(f'{e}\n' for e in sorted(SENSITIVE_DATA))
    except IOError:
        # args has no save_sensitive attribute (the option is disabled), so
        # report the path that could not be written instead
        print(f'Error saving file: {filename}', file=sys.stderr)
# Command line interface. `args` is read as a module-level global by the
# functions above, so it has to be assigned before process_dir is called.
parser = argparse.ArgumentParser()
parser.add_argument('dir', help="directory or filename to process")
parser.add_argument('id', help="ID String which identifies a subject")
parser.add_argument('grep_patterns', nargs='*', help="additional strings to search for (case-insensitive)")
parser.add_argument('-c', '--clean', help="clean out all sensitive data (given or collected)", action='store_true')
parser.add_argument('-v', '--verbose', help="Display full information: all matching files", action='store_true')
# This is a flag, not a path option: the data is read from ~/.dp.<id>.
parser.add_argument('-r', '--read-sensitive', help="additionally read sensitive data saved by a previous run from ~/.dp.<id> (one entry per line)", action="store_true")
#parser.add_argument('-s', '--save-sensitive', help="save collected sensitive data to the given path", action='store_true')
args = parser.parse_args()
process_dir( args.dir )
# DICOM fields representing person names (value representation PN), according to the DICOM NEMA standard: DICOM PS3.1 2018e
# http://dicom.nema.org/medical/dicom/current/output/html/part05.html#sect_6.2
# https://www.dicomlibrary.com/dicom/dicom-tags/ (filter Dicom tags on their Value Representation)
# (0008,0090) PN Referring Physician's Name
# (0008,009C) PN Consulting Physician's Name
# (0008,1050) PN Performing Physician's Name
# (0008,1060) PN Name of Physician(s) Reading Study
# (0008,1070) PN Operators' Name
# (0010,0010) PN Patient's Name
# (0010,1001) PN Other Patient Names
# (0010,1005) PN Patient's Birth Name
# (0010,1060) PN Patient's Mother's Birth Name
# (0014,0104) PN Secondary Reviewer Name
# (0014,2006) PN Evaluator Name
# (0040,0006) PN Scheduled Performing Physician's Name
# (0040,1010) PN Names of Intended Recipients of Results
# (0040,4037) PN Human Performer's Name
# (0040,A075) PN Verifying Observer Name
# (0040,A123) PN Person Name
# (0070,0084) PN Content Creator's Name
# (300E,0008) PN Reviewer Name
# https://www.dicomlibrary.com/terms-of-service/
# DICOM Library anonymizes these DICOM tags:
# General:
# TAG Name Value
# 0002, 0012 Implementation Class UID modified
# 0002, 0013 Implementation Version Name "DICOMLIBRARY-100"
# 0002, 0016 Source Application Entity Title "DICOMLIBRARY"
# 0002, 0100 Private Information Creator UID empty
# 0002, 0102 Private Information empty
#
# UID Anonymity:
# TAG Name Value
# 0002, 0003 Media Storage SOP Instance UID unique
# 0008, 0018 SOP Instance UID unique
# 0020, 000D Study Instance UID unique
# 0020, 000E Series Instance UID unique
#
# Patient Anonymization:
# TAG Name Value
# 0010, 0010 Patient's Name "Anonymized^^"
# 0010, 0020 Patient ID "0"
# 0010, 0021 Issuer of Patient ID empty
# 0010, 0022 Type of Patient ID empty
# 0010, 0030 Patient's Birth Date empty
# 0010, 0032 Patient's Birth Time empty
# 0010, 0050 Patient's Insurance Plan Code Sequence removed
# 0010, 0101 Patient's Primary Language Code Sequence removed
# 0010, 0102 Patient's Primary Language Code Modifier Sequence removed
# 0010, 1000 Other Patient IDs empty
# 0010, 1001 Other Patient Names empty
# 0010, 1002 Other Patient IDs Sequence removed
# 0010, 1005 Patient's Birth Name empty
# 0010, 1040 Patient's Address empty
# 0010, 1050 Insurance Plan Identification (RET) empty
# 0010, 1060 Patient's Mother's Birth Name empty
# 0010, 1080 Military Rank empty
# 0010, 1081 Branch of Service empty
# 0010, 1090 Medical Record Locator empty
# 0010, 2150 Country of Residence empty
# 0010, 2152 Region of Residence empty
# 0010, 2154 Patient's Telephone Numbers empty
# 0010, 2160 Ethnic Group empty
# 0010, 2180 Occupation empty
# 0010, 21b0 Additional Patient History empty
# 0010, 21c0 Pregnancy Status empty
# 0010, 21d0 Last Menstrual Date empty
# 0010, 21f0 Patient's Religious Preference removed
# 0010, 2202 Patient Species Code Sequence removed
# 0010, 2203 Patient's Sex Neutered empty
# 0010, 2293 Patient Breed Code Sequence removed
# 0010, 2294 Breed Registration Sequence removed
# 0010, 2295 Breed Registration Number empty
# 0010, 2296 Breed Registry Code Sequence removed
# 0010, 2297 Responsible Person empty
# 0010, 2298 Responsible Person Role empty
# 0010, 2299 Responsible Organization empty
#
# Visit Anonymity:
# TAG Name Value
# 0008, 0080 Institution Name empty
# 0008, 0081 Institution Address empty
# 0008, 0082 Institution Code Sequence removed
# 0008, 0090 Referring Physician's Name empty
# 0008, 0092 Referring Physician's Address empty
# 0008, 0094 Referring Physician's Telephone Numbers empty
# 0008, 0096 Referring Physician Identification Sequence removed
# 0008, 0116 Responsible Organization empty
# 0008, 1048 Physician(s) of Record empty
# 0008, 1049 Physician(s) of Record Identification Sequence removed
# 0008, 1050 Performing Physician's Name empty
# 0008, 1052 Performing Physician Identification Sequence removed
# 0008, 1060 Name of Physician(s) Reading Study empty
# 0008, 1062 Physician(s) Reading Study Identification Sequence removed
# 0008, 1070 Operators' Name empty
# 0008, 1072 Operator Identification Sequence removed
# 0038, 0010 Admission ID empty
# 0038, 0011 Issuer of Admission ID empty
# 0038, 0016 Route of Admissions empty
# 0038, 001a Scheduled Admission Date (RET) empty
# 0038, 001b Scheduled Admission Time (RET) empty
# 0038, 001c Scheduled Discharge Date (RET) empty
# 0038, 001d Scheduled Discharge Time (RET) empty
# 0038, 001e Scheduled Patient Institution Residence (RET) empty
# 0038, 0020 Admitting Date empty
# 0038, 0021 Admitting Time empty
# 0038, 0030 Discharge Date (RET) empty
# 0038, 0032 Discharge Time (RET) empty
# 0038, 0040 Discharge Diagnosis Description (RET) empty
# 0038, 0044 Discharge Diagnosis Code Sequence (RET) removed
# 0038, 0300 Current Patient Location empty
# 0038, 0400 Patient's Institution Residence empty
# 0038, 0500 Patient State empty
#
# Study Anonymity:
# TAG Name Value
# 0020, 0010 Study ID "1"
# 0008, 0050 Accession Number empty
# 0032, 000a Study Status ID (RET) empty
# 0032, 000c Study Priority ID (RET) empty
# 0032, 0012 Study ID Issuer (RET) empty
# 0032, 0032 Study Verified Date (RET) empty
# 0032, 0033 Study Verified Time (RET) empty
# 0032, 0034 Study Read Date (RET) empty
# 0032, 0035 Study Read Time (RET) empty
# 0032, 1000 Scheduled Study Start Date (RET) empty
# 0032, 1001 Scheduled Study Start Time (RET) empty
# 0032, 1010 Scheduled Study Stop Date (RET) empty
# 0032, 1011 Scheduled Study Stop Time (RET) empty
# 0032, 1020 Scheduled Study Location (RET) empty
# 0032, 1021 Scheduled Study Location AE Title (RET) empty
# 0032, 1030 Reason for Study (RET) empty
# 0032, 1031 Requesting Physician Identification Sequence removed
# 0032, 1032 Requesting Physician empty
# 0032, 1033 Requesting Service empty
# 0032, 1040 Study Arrival Date (RET) empty
# 0032, 1041 Study Arrival Time (RET) empty
# 0032, 1050 Study Completion Date (RET) empty
# 0032, 1051 Study Completion Time (RET) empty
# 0032, 1055 Study Component Status ID (RET) empty
# 0032, 1060 Requested Procedure Description empty
# 0032, 1064 Requested Procedure Code Sequence removed
# 0032, 1070 Requested Contrast Agent empty
# 0032, 4000 Study Comments empty
# 0040, 2008 Order Entered By empty
# 0040, 2009 Order Enterer's Location empty
# 0040, 2010 Order Callback Phone Number empty
#
# Procedure Anonymity:
# TAG Name Value
# 0040, 0001 Scheduled Station AE Title empty
# 0040, 0006 Scheduled Performing Physician's Name empty
# 0040, 000b Scheduled Performing Physician Identification Sequence removed
# 0040, 0010 Scheduled Station Name empty
# 0040, 0011 Scheduled Procedure Step Location empty
# 0040, 0012 Pre-Medication empty
# 0040, 0241 Performed Station AE Title empty
# 0040, 0242 Performed Station Name empty
# 0040, 0243 Performed Location empty
# 0040, 0296 Billing Item Sequence removed
#
# Results Anonymity:
# TAG Name Value
# 4008, 0042 Results ID Issuer (RET) empty
#
# Interpretation Anonymity:
# TAG Name Value
# 4008, 010c Interpretation Author (RET) empty
# 4008, 0114 Physician Approving Interpretation (RET) empty
# 4008, 0119 Distribution Name (RET) empty
# 4008, 011a Distribution Address (RET) empty
# 4008, 0202 Interpretation ID Issuer (RET) empty
#
# Equipment Anonymity:
# TAG Name Value
# 0008, 0070 Manufacturer empty
# 0008, 1010 Station Name empty
# 0008, 1040 Institutional Department Name empty
# 0008, 1090 Manufacturer's Model Name empty
# 0018, 1000 Device Serial Number empty
# 0018, 1016 Secondary Capture Device Manufacturer empty
# 0018, 1017 Hardcopy Device Manufacturer empty
# 0018, 1018 Secondary Capture Device Manufacturer's Model Name empty
# 0018, 1019 Secondary Capture Device Software Version(s) empty
# 0018, 101a Hardcopy Device Software Version empty
# 0018, 101b Hardcopy Device Manufacturer's Model Name empty
# 0018, 1020 Software Version(s) empty
# 0018, 1200 Date of Last Calibration empty
# 0018, 1201 Time of Last Calibration empty
# 0018, 700c Date of Last Detector Calibration empty
# 0018, 700e Time of Last Detector Calibration empty
# 0018, 7010 Exposures on Detector Since Last Calibration empty
# 0018, 7011 Exposures on Detector Since Manufactured empty