Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,11 @@

# Ignore virtualenv
sharedb/
.idea/

__pycache__/
classification/__pycache__/
datahub/__pycache__/
filter/__pycache__/
pipeline/__pycache__/

8 changes: 2 additions & 6 deletions notes.md → README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# Notes

# ShareDB
## HIPAA PII removal

Our main starting point for PII removal is the [HIPAA guidelines](https://www.hhs.gov/hipaa/for-professionals/privacy/special-topics/de-identification/).
Expand Down Expand Up @@ -42,8 +41,6 @@ If we want to make the dashes optional, we can either just use `-?` or allow for

## Environment
My work here so far has been done using a virtual environment in the `sharedb` folder.
It's somewhat easier to collect all of my packages into a virtualenv so that future users don't have to worry about requirements.
I may or may not actually commit this to the git repo due to its size, but I will always have a `requirements.txt`.

--

Expand All @@ -52,5 +49,4 @@ names w/ bloom filter
glove word vec

--
http://www.cs.cmu.edu/Groups/AI/util/areas/nlp/corpora/names/other/

http://www.cs.cmu.edu/Groups/AI/util/areas/nlp/corpora/names/other/
Binary file added db.sqlite3
Binary file not shown.
170 changes: 0 additions & 170 deletions frontend.py

This file was deleted.

Empty file added iid/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions iid/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.
5 changes: 5 additions & 0 deletions iid/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from django.apps import AppConfig


class IidConfig(AppConfig):
    """Application configuration for the ``iid`` app."""

    # Name of the application this configuration applies to.
    name = 'iid'
16 changes: 12 additions & 4 deletions bundle.py → iid/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
Pre-defined Pipelines to match certain deidentification standards,
like HIPAA.
"""
from pipeline import Pipe, Pipeline
from classification import SSNClassifier, EmailClassifier, PhoneNumberClassifier, MACAddressClassifier, IPAddressClassifier, URLClassifier, ZipCodeClassifier, NumberClassifier, AddressClassifier, Lookup, DateClassifier
from filter import Drop, ZipCodeFilter, AddressFilter, DateFilter
from iid.pipeline import Pipe, Pipeline
from iid.classification import SSNClassifier, EmailClassifier, PhoneNumberClassifier, MACAddressClassifier, IPAddressClassifier, URLClassifier, ZipCodeClassifier, NumberClassifier, AddressClassifier, Lookup, DateClassifier, FaceClassifier
from iid.filter import Drop, ZipCodeFilter, AddressFilter, DateFilter
import os

# Absolute path of the directory containing this module; bundled data files
# are resolved against it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


class Bundle:
def __init__(self, pipeline, description):
Expand All @@ -14,13 +18,15 @@ def __init__(self, pipeline, description):
self.pipeline = pipeline
self.description = description


def read_names():
    """Load the list of known names bundled with this package.

    Reads ``data/names.dat`` from the directory given by ``BASE_DIR``,
    treating every line of the file as one name.

    Returns:
        set: each line of the file, stripped of surrounding whitespace and
        lowercased.

    Raises:
        OSError: if the names file cannot be opened.
    """
    # Resolve the file against BASE_DIR rather than the current working
    # directory, so the lookup works no matter where the process was started.
    with open(os.path.join(BASE_DIR, 'data/names.dat')) as f:
        return {name.strip().lower() for name in f}


def build_hipaa():
pipeline = Pipeline()
pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
Expand All @@ -33,12 +39,14 @@ def build_hipaa():
pipeline.add_pipe('ssn', Pipe(SSNClassifier, Drop()))
pipeline.add_pipe('ip_address', Pipe(IPAddressClassifier, Drop()))
pipeline.add_pipe('mac_address', Pipe(MACAddressClassifier, Drop()))
pipeline.add_pipe('face', Pipe(FaceClassifier(), Drop()))
# TODO: This is far too sensitive
# pipeline.add_pipe('number', Pipe(NumberClassifier, Drop()))
return pipeline

HIPAABundle = Bundle(build_hipaa(), 'HIPAA PII Removal')


def build_ferpa():
pipeline = Pipeline()
pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
from .regex import Regex, SSNClassifier, EmailClassifier, PhoneNumberClassifier, MACAddressClassifier, IPAddressClassifier, URLClassifier, ZipCodeClassifier, NumberClassifier
from .address import AddressClassifier
from .date import DateClassifier
from .lookup import Lookup
from .lookup import Lookup
from .face import FaceClassifier

File renamed without changes.
File renamed without changes.
File renamed without changes.
30 changes: 30 additions & 0 deletions iid/classification/face.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Face identifier using the face_recognition library."""
import face_recognition
from .classifier import Classifier
import os

# Absolute path of the parent of the directory containing this module; the
# file paths found in a column are resolved against it.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


class FaceClassifier(Classifier):
    """Rate how likely a column is to hold paths to images that show faces.

    Cell values are treated as file paths relative to ``BASE_DIR`` and the
    referenced images are inspected with the ``face_recognition`` library.
    """

    def __init__(self):
        # The classifier carries no configuration or state.
        pass

    def rate(self, column):
        """Return the fraction of rows in ``column`` that are images with a face.

        Args:
            column: Sized, indexable collection of cell values; ``len(column)``
                and ``column[0]`` are both used.

        Returns:
            ``0`` when the column is empty or its first value cannot be
            opened as a file; otherwise the number of rows whose image
            contains at least one detected face divided by ``len(column)``.
        """
        # An empty column has nothing to probe and would divide by zero below.
        if len(column) == 0:
            return 0

        # Cheap probe: if the first value is not an openable file path, assume
        # this is not a column of file paths and skip the expensive image work.
        # TypeError/ValueError cover non-string cells and embedded NUL bytes,
        # which are not paths either. The handle is closed immediately.
        try:
            with open(os.path.join(BASE_DIR, column[0])):
                pass
        except (OSError, TypeError, ValueError):
            return 0

        score = 0
        for row in column:
            try:
                image = face_recognition.load_image_file(os.path.join(BASE_DIR, row))
                face_locations = face_recognition.face_locations(image)
                if face_locations:
                    score += 1
            except (OSError, TypeError, ValueError):
                # Unreadable or non-path rows simply do not count as faces.
                continue
        return score / len(column)

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading