# dp.py

import argparse
import os # for file walks
import sys # for error handling
import subprocess # to call grep utility
import functools # to bind an argument to function (neccessarry for collect_callback to get a valid filename)

import hashlib # to compute the md5sum for the given ID
import magic # to determine files type not be extension but by their content. mimetype won't work, cannot recognize DICOM files
import pydicom # to process dicom images
from pathlib import Path

# global variables
# The three report dictionaries share one layout, built by add_entry:
#   { category : { value : set(filenames) } }
PNs_occured = {} # findings inside DICOM files, filled by collect_callback; see print_report for format
G_occured = {} # grep hits in non-DICOM files, filled by handle_other
F_occured = {} # sensitive tokens found in file and directory names, filled by handle_fnames
SENSITIVE_DATA = set() # strings treated as sensitive: the ID, grep patterns, stored entries and collected PN values

def md5sum_id():
    '''Return the hexadecimal MD5 digest of the subject ID from the command line.

    The ID is taken from the global ``args.id`` and encoded as UTF-8 before hashing.

    :return: hex digest string
    '''
    id_bytes = args.id.encode('utf-8')
    digest = hashlib.md5(id_bytes)
    return str(digest.hexdigest())

def add_entry( category, value, filename, report ):
    '''Record in a report that a value was found under a category in a file.

    :param str category: first-level key of the report (e.g. a DICOM tag description)
    :param str value: the value that was found
    :param str filename: file in which the value was found
    :param dict report: report dictionary, updated in place; layout is
        ``{ category : { value : set(filenames) } }``
    '''
    files_by_value = report.setdefault(category, {})
    files_by_value.setdefault(value, set()).add( filename )
    # In verbose mode every single finding is echoed as it is recorded.
    if args.verbose:
        print(f'{filename}-{category}: {value}')

def collect_callback(fname, dataset, data_element):
    '''Callback for ``dataset.walk`` that collects sensitive values from one data element.

    Elements without a value representation (VR) are ignored. A non-empty element with
    VR "PN" is added to SENSITIVE_DATA and recorded in PNs_occured. Every other element is
    compared, case-insensitively, against each entry of ``args.grep_patterns`` and recorded
    in PNs_occured once per matching pattern.

    :param str fname: name of the file the element belongs to (bound by the caller)
    :param dataset: dataset currently walked (unused)
    :param data_element: the element to inspect
    '''
    if not data_element.VR:
        return

    # DICOM specifies all ValueRepresentations with a code, and all with code PN are carrying PersonalNames
    holds_person_name = data_element.VR == "PN" and data_element.value
    if holds_person_name:
        SENSITIVE_DATA.add(str(data_element.value))
        add_entry(str(data_element.description()), str(data_element.value), fname, PNs_occured )
        return

    for pattern in args.grep_patterns:
        if pattern.lower() not in str(data_element.value).lower():
            continue
        add_entry(str(data_element.description()), str(data_element.value), fname, PNs_occured )

def remove_person_names_callback(dataset, data_element):
    '''Callback for ``dataset.walk`` that overwrites person names.

    A data element with value representation "PN" (PersonalName) and a non-empty value gets
    its value replaced by the result of md5sum_id(). All other elements are left untouched.

    :param dataset: dataset currently walked (unused)
    :param data_element: the element to inspect and possibly modify in place
    '''
    # DICOM specifies all ValueRepresentations with a code, and all with code PN are carrying PersonalNames
    needs_replacement = data_element.VR == "PN" and data_element.value
    if not needs_replacement:
        return
    data_element.value = md5sum_id()

    # TODO check also if sensitive data needs to be replaced, e.g. data in SENSITIVE_DATA

def print_report( report, dir ):
    '''
    Prints a report of found values and the files in which they were found.

    Categories whose only value is empty are skipped, as are empty values inside a category.
    In verbose mode every file is listed; otherwise the value is truncated to 50 characters
    and only one file plus the total number of files is shown.

    :param dict report: dictionary of categories, their values and the files containing them, in the following format:: text
        PNs_occured = {
              'Operators Name'  : { 'Foo Bar' : {file_a, file_b, ...},
                                    'Bar Foo' : {file_c, file_d},
                                    ...,
                                  },
              'Physicians Name' : { ... },
               ...
        }
    :param path dir: current directory used to strip this path from filepaths, so all file names are relative to that directory
    '''
    max_len = 50
    for dicom_tag, values in report.items():
        # skip those categories only containing '' values
        only_empty_value = len(values) == 1 and not next(iter(values))
        if only_empty_value:
            continue

        print(f'{dicom_tag}:')
        for person, fileset in values.items():
            if not person:
                continue

            if args.verbose:
                prefix = repr(person) + ' => ['
                # continuation lines are indented to line up behind the prefix
                separator = ',\n\t' + (' ' * len(prefix))
                relative_paths = [os.path.relpath(f, dir) for f in fileset]
                print('\t' + prefix + separator.join(relative_paths) + ']')
            else:
                text = str(person)
                shortened = (text[:max_len] + '..') if len(text) > max_len else text
                prefix = repr(shortened) + ' => ['
                example = os.path.relpath(next(iter(fileset)), dir)
                print('\t' + prefix + f'{example}, ...] #{len(fileset)}')

def for_each_file( dir, apply ):
    '''Visit every file below a directory, subdirectories included, and hand it to a callback.

    :param path dir: Directory to start from
    :param func apply: called as ``apply(path, filetype)``, where filetype is the description
        returned by ``magic.from_file`` for that path
    '''
    for root, _subdirs, files in os.walk(dir):
        for name in files:
            path = os.path.join(root, name)
            # ATTENTION DICOMDIR and DICOM images are both recognized as DICOM files
            apply( path, magic.from_file(path) )


def remove_sensitive_from_dicom( fname, ftype ):
    '''Overwrite the person names of one DICOM image file in place.

    Files whose type description does not mention DICOM, and paths containing 'DICOMDIR',
    are left alone. Otherwise the file is read, walked with remove_person_names_callback
    and saved under the same name.

    :param str fname: path of the file
    :param str ftype: file type description of the file
    '''
    if "DICOM medical imaging data" not in ftype or 'DICOMDIR' in fname:
        return
    image = pydicom.dcmread(fname)
    image.walk(remove_person_names_callback)
    image.save_as(fname)

def collect_sensitive_from_dicom(fname, ftype):
    '''Collect sensitive values from one DICOM image file.

    Files whose type description does not mention DICOM, and paths containing 'DICOMDIR',
    are skipped. Otherwise the file is read and every data element is passed to
    collect_callback.

    :param str fname: path of the file
    :param str ftype: file type description of the file
    '''
    if "DICOM medical imaging data" in ftype and 'DICOMDIR' not in fname:
        dataset = pydicom.dcmread(fname)
        # dataset.filename might not exist, so the path we were given is bound as the
        # first parameter of collect_callback instead of relying on that attribute.
        dataset.walk(functools.partial(collect_callback, fname))

def grep_file(pattern, file):
    '''Tell whether a file contains the given text, ignoring case.

    The text is passed to grep as a fixed string (-F), so characters such as '.', '*' or
    '[' in a name are matched literally, the same way the substring checks elsewhere in
    this script treat them. '-e' and '--' keep a pattern or path starting with '-' from
    being parsed as a grep option, and '-q' lets grep stop at the first match.

    :param str pattern: text to look for
    :param str file: path of the file to search
    :return: True if grep reports a match, False otherwise (including grep errors)
    '''
    if not pattern:
        # An empty pattern matches every line and would flag every file.
        return False
    command = ['grep', '-i', '-F', '-q', '-e', pattern, '--', file]
    exit_code = subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return exit_code == 0

def read_sensitive():
    '''Load the stored sensitive strings of the current subject.

    The entries are read from ``~/.dp.<args.id>``, one entry per line. Blank lines are
    dropped: an empty string is contained in every file name and matches every line grep
    looks at, so it would flag everything.

    :return: set of stored entries; an empty set if the file cannot be read
    '''
    filename = Path.home() / ('.dp.' + args.id )
    try:
        with open(filename, 'r') as f:
            return {line for line in f.read().splitlines() if line}
    except IOError:
        # Report the path that failed, not the boolean command line flag.
        print(f'Error open file: {filename}', file=sys.stderr)
        return set()

def handle_other( fname, ftype ):
    '''Search one non-DICOM file for every known sensitive string.

    DICOM files (by path containing 'DICOMDIR' or by type description) are ignored here.
    Archives and compressed files are skipped with a warning on stderr. Any other file is
    searched with grep_file for each entry of SENSITIVE_DATA; hits are recorded in
    G_occured under the category 'Grep-Results'.

    :param str fname: path of the file
    :param str ftype: file type description of the file
    '''
    is_dicom = 'DICOMDIR' in fname or 'DICOM medical imaging data' in ftype
    if is_dicom:
        return

    is_packed = 'compressed data' in ftype or 'archive data' in ftype
    if is_packed:
        print(f"Warning: skipping archive or compressed file: {fname}", file=sys.stderr)
        return

    for secret in SENSITIVE_DATA:
        if not grep_file( secret, fname):
            continue
        add_entry('Grep-Results', secret, fname, G_occured )

def wipe_sdata( fname, sdata ):
    '''Return fname with every occurrence of a sensitive token overwritten by zeros.

    Each token is replaced by as many '0' characters as it is long. Tokens are applied
    longest first: if a short token (e.g. 'Doe') were replaced before a longer one that
    contains it (e.g. 'Doe^John'), the longer one would no longer match and the rest of it
    would stay in the name. Empty tokens are ignored.

    :param str fname: file or directory name to clean
    :param sdata: iterable of sensitive strings
    :return: the cleaned name
    '''
    # Secondary sort key makes the order independent of set iteration order.
    tokens = sorted((token for token in sdata if token), key=lambda token: (-len(token), token))
    for token in tokens:
        fname = fname.replace(token, '0' * len(token))
    return fname

def handle_fnames(dir):
    '''Look for sensitive strings in the names of all files and directories below dir.

    With ``args.clean`` set, every affected file or directory is renamed to the result of
    wipe_sdata. Otherwise each sensitive token contained in a name is recorded in F_occured
    under the category 'Filenames', together with the full path.

    :param path dir: Directory to start from
    '''
    # Bottom-up walk: the contents of a directory are handled before the directory itself
    # is renamed by the walk of its parent.
    for root, dirs, files in os.walk(dir, topdown=False):
        # Attention: Files first, because they reside in directories which might be renamed
        for name in files + dirs:
            # Only tokens that really occur in the name; empty tokens occur in every name
            # and carry no information.
            found = sorted(token for token in SENSITIVE_DATA if token and token in name)
            if not found:
                continue
            path = os.path.join(root, name)
            if args.clean:
                os.rename(path, os.path.join(root, wipe_sdata(name, SENSITIVE_DATA)))
            else:
                for token in found:
                    add_entry('Filenames', token, path, F_occured)

def process_dir( dir ):
    """
    Collect, report and optionally clean sensitive data below a directory.

     1. if there are archives, or compressed data files, extract them, and store them for recompression afterwards
         ATTENTION: handle compression bombs, nested compression
         (not implemented here: archives and compressed files are only skipped with a warning)
     2. cycle through dir (no compression should occur):
     2.1 handle all DICOM files
     2.2 handle all other files, by looking for DICOM collected PNs
     2.3 search sensitive data inside of filenames too

    Finally all known sensitive strings are written to ``~/.dp.<args.id>``, one per line.

    :param path dir: directory to process; reported file names are relative to it
    """
    root = os.path.abspath(dir)
    print('process: ' + root)

    SENSITIVE_DATA.add(args.id)
    if args.grep_patterns:
        SENSITIVE_DATA.update(args.grep_patterns)
    if args.read_sensitive:
        SENSITIVE_DATA.update(read_sensitive())

    # 2.1 collect from the DICOM files, unless stored data was read and no grep patterns were given
    if not args.read_sensitive or args.grep_patterns:
        for_each_file( root, collect_sensitive_from_dicom )
        if not args.clean:
            print_report( PNs_occured, dir )

    if args.clean:
        for_each_file( root, remove_sensitive_from_dicom )

    # 2.2 reprocess all non DICOM images, archives and compressed files are ignored and warned
    for_each_file( root, handle_other )
    # NOTE: here removal is non trivial, this needs to be done manually, depending on the filetype
    print_report( G_occured, dir )

    # 2.3 search for sensitive data in subdirectory names and filenames
    handle_fnames(root)
    print_report( F_occured, dir )

    # The collected data is always saved; the --save-sensitive option is currently disabled.
    filename = Path.home() / ('.dp.' + args.id)
    try:
        with open(filename, 'w') as f:
            # sorted, so the stored file does not depend on set iteration order
            f.writelines(f'{entry}\n' for entry in sorted(SENSITIVE_DATA))
    except IOError:
        # args has no save_sensitive attribute any more; report the path that failed.
        print(f'Error saving file: {filename}', file=sys.stderr)


# Command line interface. The parsed `args` object is read as a module-level global by the
# functions above, so it has to exist before process_dir() is called.
parser = argparse.ArgumentParser()
parser.add_argument('dir', help="directory or filename to process")
parser.add_argument('id', help="ID String which identifies a subject")
parser.add_argument('grep_patterns', nargs='*', help="additional strings to treat as sensitive data")
parser.add_argument('-c', '--clean', help="clean out all sensitive data (given or collected)", action='store_true')
parser.add_argument('-v', '--verbose', help="Display full information: all matching files", action='store_true')
# The flag takes no path: the data is always read from the per-ID file in the home directory.
parser.add_argument('-r', '--read-sensitive', help="read previously stored sensitive data for this ID from ~/.dp.<id>, one entry per line", action="store_true")
#parser.add_argument('-s', '--save-sensitive', help="save collected sensitive data to the given path", action='store_true')

args = parser.parse_args()

process_dir( args.dir )

# DICOM fields representing person names, according to the DICOM NEMA standard: DICOM PS3.1 2018e
# http://dicom.nema.org/medical/dicom/current/output/html/part05.html#sect_6.2
# https://www.dicomlibrary.com/dicom/dicom-tags/ (filter Dicom tags on their Value Representation)
# (0008,0090)	PN	Referring Physician's Name	
# (0008,009C)	PN	Consulting Physician's Name	
# (0008,1050)	PN	Performing Physician's Name	
# (0008,1060)	PN	Name of Physician(s) Reading Study	
# (0008,1070)	PN	Operators' Name	
# (0010,0010)	PN	Patient's Name	
# (0010,1001)	PN	Other Patient Names	
# (0010,1005)	PN	Patient's Birth Name	
# (0010,1060)	PN	Patient's Mother's Birth Name	
# (0014,0104)	PN	Secondary Reviewer Name	
# (0014,2006)	PN	Evaluator Name	
# (0040,0006)	PN	Scheduled Performing Physician's Name	
# (0040,1010)	PN	Names of Intended Recipients of Results	
# (0040,4037)	PN	Human Performer's Name	
# (0040,A075)	PN	Verifying Observer Name	
# (0040,A123)	PN	Person Name	
# (0070,0084)	PN	Content Creator's Name	
# (300E,0008)	PN	Reviewer Name


# https://www.dicomlibrary.com/terms-of-service/
# DICOM Library anonymizes these DICOM tags:
#        General:
#        TAG 	Name 	Value
#        0002, 0012 	Implementation Class UID 	modified
#        0002, 0013 	Implementation Version Name 	"DICOMLIBRARY-100"
#        0002, 0016 	Source Application Entity Title 	"DICOMLIBRARY"
#        0002, 0100 	Private Information Creator UID 	empty
#        0002, 0102 	Private Information 	empty
#
#        UID Anonymity:
#        TAG 	Name 	Value
#        0002, 0003 	Media Storage SOP Instance UID 	unique
#        0008, 0018 	SOP Instance UID 	unique
#        0020, 000D 	Study Instance UID 	unique
#        0020, 000E 	Series Instance UID 	unique
#
#        Patient Anonymization:
#        TAG 	Name 	Value
#        0010, 0010 	Patient's Name 	"Anonymized^^"
#        0010, 0020 	Patient ID 	"0"
#        0010, 0021 	Issuer of Patient ID 	empty
#        0010, 0022 	Type of Patient ID 	empty
#        0010, 0030 	Patient's Birth Date 	empty
#        0010, 0032 	Patient's Birth Time 	empty
#        0010, 0050 	Patient's Insurance Plan Code Sequence 	removed
#        0010, 0101 	Patient's Primary Language Code Sequence 	removed
#        0010, 0102 	Patient's Primary Language Code Modifier Sequence 	removed
#        0010, 1000 	Other Patient IDs 	empty
#        0010, 1001 	Other Patient Names 	empty
#        0010, 1002 	Other Patient IDs Sequence 	removed
#        0010, 1005 	Patient's Birth Name 	empty
#        0010, 1040 	Patient's Address 	empty
#        0010, 1050 	Insurance Plan Identification (RET) 	empty
#        0010, 1060 	Patient's Mother's Birth Name 	empty
#        0010, 1080 	Military Rank 	empty
#        0010, 1081 	Branch of Service 	empty
#        0010, 1090 	Medical Record Locator 	empty
#        0010, 2150 	Country of Residence 	empty
#        0010, 2152 	Region of Residence 	empty
#        0010, 2154 	Patient's Telephone Numbers 	empty
#        0010, 2160 	Ethnic Group 	empty
#        0010, 2180 	Occupation 	empty
#        0010, 21b0 	Additional Patient History 	empty
#        0010, 21c0 	Pregnancy Status 	empty
#        0010, 21d0 	Last Menstrual Date 	empty
#        0010, 21f0 	Patient's Religious Preference 	removed
#        0010, 2202 	Patient Species Code Sequence 	removed
#        0010, 2203 	Patient's Sex Neutered 	empty
#        0010, 2293 	Patient Breed Code Sequence 	removed
#        0010, 2294 	Breed Registration Sequence 	removed
#        0010, 2295 	Breed Registration Number 	empty
#        0010, 2296 	Breed Registry Code Sequence 	removed
#        0010, 2297 	Responsible Person 	empty
#        0010, 2298 	Responsible Person Role 	empty
#        0010, 2299 	Responsible Organization 	empty
#
#        Visit Anonymity:
#        TAG 	Name 	Value
#        0008, 0080 	Institution Name 	empty
#        0008, 0081 	Institution Address 	empty
#        0008, 0082 	Institution Code Sequence 	removed
#        0008, 0090 	Referring Physician's Name 	empty
#        0008, 0092 	Referring Physician's Address 	empty
#        0008, 0094 	Referring Physician's Telephone Numbers 	empty
#        0008, 0096 	Referring Physician Identification Sequence 	removed
#        0008, 0116 	Responsible Organization 	empty
#        0008, 1048 	Physician(s) of Record 	empty
#        0008, 1049 	Physician(s) of Record Identification Sequence 	removed
#        0008, 1050 	Performing Physician's Name 	empty
#        0008, 1052 	Performing Physician Identification Sequence 	removed
#        0008, 1060 	Name of Physician(s) Reading Study 	empty
#        0008, 1062 	Physician(s) Reading Study Identification Sequence 	removed
#        0008, 1070 	Operators' Name 	empty
#        0008, 1072 	Operator Identification Sequence 	removed
#        0038, 0010 	Admission ID 	empty
#        0038, 0011 	Issuer of Admission ID 	empty
#        0038, 0016 	Route of Admissions 	empty
#        0038, 001a 	Scheduled Admission Date (RET) 	empty
#        0038, 001b 	Scheduled Admission Time (RET) 	empty
#        0038, 001c 	Scheduled Discharge Date (RET) 	empty
#        0038, 001d 	Scheduled Discharge Time (RET) 	empty
#        0038, 001e 	Scheduled Patient Institution Residence (RET) 	empty
#        0038, 0020 	Admitting Date 	empty
#        0038, 0021 	Admitting Time 	empty
#        0038, 0030 	Discharge Date (RET) 	empty
#        0038, 0032 	Discharge Time (RET) 	empty
#        0038, 0040 	Discharge Diagnosis Description (RET) 	empty
#        0038, 0044 	Discharge Diagnosis Code Sequence (RET) 	removed
#        0038, 0300 	Current Patient Location 	empty
#        0038, 0400 	Patient's Institution Residence 	empty
#        0038, 0500 	Patient State 	empty
#
#        Study Anonymity:
#        TAG 	Name 	Value
#        0020, 0010 	Study ID 	"1"
#        0008, 0050 	Accession Number 	empty
#        0032, 000a 	Study Status ID (RET) 	empty
#        0032, 000c 	Study Priority ID (RET) 	empty
#        0032, 0012 	Study ID Issuer (RET) 	empty
#        0032, 0032 	Study Verified Date (RET) 	empty
#        0032, 0033 	Study Verified Time (RET) 	empty
#        0032, 0034 	Study Read Date (RET) 	empty
#        0032, 0035 	Study Read Time (RET) 	empty
#        0032, 1000 	Scheduled Study Start Date (RET) 	empty
#        0032, 1001 	Scheduled Study Start Time (RET) 	empty
#        0032, 1010 	Scheduled Study Stop Date (RET) 	empty
#        0032, 1011 	Scheduled Study Stop Time (RET) 	empty
#        0032, 1020 	Scheduled Study Location (RET) 	empty
#        0032, 1021 	Scheduled Study Location AE Title (RET) 	empty
#        0032, 1030 	Reason for Study (RET) 	empty
#        0032, 1031 	Requesting Physician Identification Sequence 	removed
#        0032, 1032 	Requesting Physician 	empty
#        0032, 1033 	Requesting Service 	empty
#        0032, 1040 	Study Arrival Date (RET) 	empty
#        0032, 1041 	Study Arrival Time (RET) 	empty
#        0032, 1050 	Study Completion Date (RET) 	empty
#        0032, 1051 	Study Completion Time (RET) 	empty
#        0032, 1055 	Study Component Status ID (RET) 	empty
#        0032, 1060 	Requested Procedure Description 	empty
#        0032, 1064 	Requested Procedure Code Sequence 	removed
#        0032, 1070 	Requested Contrast Agent 	empty
#        0032, 4000 	Study Comments 	empty
#        0040, 2008 	Order Entered By 	empty
#        0040, 2009 	Order Enterer's Location 	empty
#        0040, 2010 	Order Callback Phone Number 	empty
#
#        Procedure Anonymity:
#        TAG 	Name 	Value
#        0040, 0001 	Scheduled Station AE Title 	empty
#        0040, 0006 	Scheduled Performing Physician's Name 	empty
#        0040, 000b 	Scheduled Performing Physician Identification Sequence 	removed
#        0040, 0010 	Scheduled Station Name 	empty
#        0040, 0011 	Scheduled Procedure Step Location 	empty
#        0040, 0012 	Pre-Medication 	empty
#        0040, 0241 	Performed Station AE Title 	empty
#        0040, 0242 	Performed Station Name 	empty
#        0040, 0243 	Performed Location 	empty
#        0040, 0296 	Billing Item Sequence 	removed
#
#        Results Anonymity:
#        TAG 	Name 	Value
#        4008, 0042 	Results ID Issuer (RET) 	empty
#
#        Interpretation Anonymity:
#        TAG 	Name 	Value
#        4008, 010c 	Interpretation Author (RET) 	empty
#        4008, 0114 	Physician Approving Interpretation (RET) 	empty
#        4008, 0119 	Distribution Name (RET) 	empty
#        4008, 011a 	Distribution Address (RET) 	empty
#        4008, 0202 	Interpretation ID Issuer (RET) 	empty
#
#        Equipment Anonymity:
#        TAG 	Name 	Value
#        0008, 0070 	Manufacturer 	empty
#        0008, 1010 	Station Name 	empty
#        0008, 1040 	Institutional Department Name 	empty
#        0008, 1090 	Manufacturer's Model Name 	empty
#        0018, 1000 	Device Serial Number 	empty
#        0018, 1016 	Secondary Capture Device Manufacturer 	empty
#        0018, 1017 	Hardcopy Device Manufacturer 	empty
#        0018, 1018 	Secondary Capture Device Manufacturer's Model Name 	empty
#        0018, 1019 	Secondary Capture Device Software Version(s) 	empty
#        0018, 101a 	Hardcopy Device Software Version 	empty
#        0018, 101b 	Hardcopy Device Manufacturer's Model Name 	empty
#        0018, 1020 	Software Version(s) 	empty
#        0018, 1200 	Date of Last Calibration 	empty
#        0018, 1201 	Time of Last Calibration 	empty
#        0018, 700c 	Date of Last Detector Calibration 	empty
#        0018, 700e 	Time of Last Detector Calibration 	empty
#        0018, 7010 	Exposures on Detector Since Last Calibration 	empty
#        0018, 7011 	Exposures on Detector Since Manufactured 	empty