Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: transmart/tranSMART-ETL
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: master
Choose a base ref
...
head repository: hms-dbmi/tranSMART-ETL
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: master
Choose a head ref
Able to merge. These branches can be automatically merged.
  • 9 commits
  • 14 files changed
  • 2 contributors

Commits on Apr 7, 2015

  1. AVL-189 - Forking a repository and catching it up to our current ETL …

    …code so we can correct the mRNA load.
    mtmcduffie committed Apr 7, 2015
    Copy the full SHA
    d5ee3b5 View commit details
  2. Copy the full SHA
    5d32eb5 View commit details

Commits on Apr 8, 2015

  1. Copy the full SHA
    5de15d7 View commit details

Commits on Mar 8, 2016

  1. Default properties files.

    Tom authored and Tom committed Mar 8, 2016
    Copy the full SHA
    fa4b59c View commit details

Commits on Mar 9, 2016

  1. Adding ETL Scripts

    Tom authored and Tom committed Mar 9, 2016
    Copy the full SHA
    6dbfb42 View commit details
  2. Update for scripts

    Tom authored and Tom committed Mar 9, 2016
    Copy the full SHA
    bbf4b63 View commit details

Commits on Mar 10, 2016

  1. Update for Clinical Load example

    Tom authored and Tom committed Mar 10, 2016
    Copy the full SHA
    5e5f118 View commit details

Commits on Mar 18, 2016

  1. ADDED UMLS CODE TO TEST DATA

    Tom authored and Tom committed Mar 18, 2016
    Copy the full SHA
    0c4615f View commit details

Commits on Mar 28, 2016

  1. Update to connection

    Tom authored and Tom committed Mar 28, 2016
    Copy the full SHA
    86fbd83 View commit details
59 changes: 59 additions & 0 deletions Kettle-GPL/ETL/Loading/ScriptRepo/Load_Study.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash

MAPPING_FILE=''
RAW_MAPPING=''
STUDY_ID=''
TOP_NODE=''
FACT_SET=''
LOAD_REGISTRY='Y'
LOAD_RAW='N'

ENVIRONMENT=$KETTLE_DEVELOPMENT

















## ADMIN Variables do not change unless specified
## '/scratchg/data/NAMHR4/data/datafiles/'
DATA_LOCATION=$KETTLE_LOADING_DIR'Registries/'$STUDY_ID'/data/datafiles'
RAW_DATA=$KETTLE_LOADING_DIR'/Registries/'$STUDY_ID'/data/raw_datafiles'

LOGFILE=$KETTLE_LOADING_DIR'logs/'$STUDY_ID'.log'
DEBUG='Basic'
LOAD_TYPE='I'
SORT_DIR=$KETTLE_LOADING_DIR

## DO NOT CHANGE ANYTHING BELOW!!!!!
export KETTLE_HOME=$ENVIRONMENT

## Validation


## Load Study's Mapped data and GRDR Concepts
if [ $LOAD_REGISTRY == 'Y' ];
then
/tranSMART-ETL/Kettle-GPL/ETL/Loading/ScriptRepo/create_clinical_data.sh ${MAPPING_FILE} ${STUDY_ID} "'"${SORT_DIR}"'" ${FACT_SET} ${DATA_LOCATION} ${RAW_DATA} ${LOGFILE} ${DEBUG} ${LOAD_TYPE} "${TOP_NODE}" >> "${LOGFILE}" 2>>"${LOGFILE}_ERROR.log"
fi

MAPPING_FILE=$RAW_MAPPING
DATA_LOCATION=$RAW_DATA
## Load Study's Raw Data
if [ $LOAD_RAW == 'Y' ];
then
/tranSMART-ETL/Kettle-GPL/ETL/Loading/ScriptRepo/create_clinical_data.sh ${MAPPING_FILE} ${STUDY_ID} "'"${SORT_DIR}"'" ${FACT_SET} ${DATA_LOCATION} ${RAW_DATA} ${LOGFILE} ${DEBUG} ${LOAD_TYPE} "${TOP_NODE}/__RAW" >> "${LOGFILE}" 2>>"${LOGFILE}_ERROR.log"
fi


23 changes: 23 additions & 0 deletions Kettle-GPL/ETL/Loading/ScriptRepo/create_clinical_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
echo "$2 Registry Loading - $0"
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"

echo "'""${10}""'"


/opt/etl/src/data-integration/kitchen.sh \
-norep=Y \
-file=/opt/etl/src/tranSMART-ETL/Kettle-GPL/Kettle-ETL/create_clinical_data.kjb \
-level=Basic \
-param:COLUMN_MAP_FILE=$1 \
-param:DATA_LOCATION=$5 \
-param:STUDY_ID=$2 \
-param:TOP_NODE="${10}" \
-param:SORT_DIR='/scratch/ETL' \
-param:LOAD_TYPE=$9 \
-param:LOADER_PATH=/scratchg/tmp/11.2.0/client_2/bin/sqlldr \
-param:FACT_SET=$4

echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
echo "$2 Registry Load Completed - $0"
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
11 changes: 11 additions & 0 deletions Kettle-GPL/ETL/Loading/ScriptRepo/generate_load_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## Generate Study
STUDY=''



## Adming Configs
LOADING_DIR=$KETTLE_LOADING_DIR

mkdir -p ${LOADING_DIR}/'Registries'/${STUDY}/{'data'/{'datafiles','raw_datafiles'},'logs'}
cp "ScriptRepo/Load_Study.sh" "${LOADING_DIR}/${STUDY}_Load.sh"
cp "ScriptRepo/resolve_db_issues.sh" "${LOADING_DIR}/${STUDY}_resolve_db_issues.sh"
71 changes: 71 additions & 0 deletions Kettle-GPL/ETL/Loading/ScriptRepo/resolve_db_issues.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# The purpose of this script is to execute a series of cleanup actions that have to be
# performed after the tranSMART database has been loaded with the create_clinical_data.sh
# script. The first action is to rebuild the indexes. The second action is related to a
# problem with numerical values. When selecting a patient cohort via a numerical value
# range (like select all patients older than 50), no statistics are generated when
# c_columndatatype contains 'N' instead of only 'T'. So this has to be set to 'T'. The
# third action is cleaning up the database where the value '\\' appears where it should
# not. If this is not done, the tranSMART interface does not show up.
#
# Change everything between **, or simply remove the ** (3 occurrences: lines 17, 58, 63)
#
# On 2015/11/03, the TM_CZ user does not have the right permissions to operate on the
# database. This should be changed as soon as time permits. As long as this is not done,
# use the root user and the root password.

sqlplus <username>/<password>@server:<port>/<instance> << EOF
set serveroutput on format wrapped;
begin
DBMS_OUTPUT.put_line('Rebuild the indexes');
end;
/
drop INDEX I2B2METADATA.i2b2_fullname_text;
drop INDEX I2B2DEMODATA.cd_concept_path_text;
GRANT CTXAPP TO i2b2metadata;
GRANT CTXAPP TO i2b2demodata;
CREATE INDEX I2B2METADATA.i2b2_fullname_text ON i2b2metadata.i2b2(c_fullname) INDEXTYPE IS CTXSYS.CONTEXT;
CREATE INDEX I2B2DEMODATA.cd_concept_path_text ON i2b2demodata.concept_dimension(concept_path) INDEXTYPE IS CTXSYS.CONTEXT;
REVOKE CTXAPP FROM i2b2metadata;
REVOKE CTXAPP FROM i2b2demodata;
alter INDEX I2B2METADATA.I2B2_IDX1_PART rebuild;
ALTER INDEX I2B2METADATA.I2B2_INDEX1_PART REBUILD;
ALTER INDEX I2B2METADATA.I2B2_INDEX2_PART REBUILD;
ALTER INDEX I2B2METADATA.I2B2_INDEX3_PART REBUILD;
ALTER INDEX I2B2METADATA.I2B2_INDEX4_PART REBUILD;
ALTER INDEX I2B2METADATA.I2B2_C_HLEVEL_BASECODE_PART REBUILD;
ALTER INDEX I2B2METADATA.META_APPLIED_PATH_I2B2_PART REBUILD;
ALTER INDEX I2B2DEMODATA.CONCEPT_COUNTS_INDEX1 REBUILD;
ALTER INDEX I2B2DEMODATA.IDX_OB_FACT_1 REBUILD;
ALTER INDEX I2B2DEMODATA.IDX_OB_FACT_2 REBUILD;
ALTER INDEX I2B2DEMODATA.FACT_MOD_PAT_ENC REBUILD;
ALTER INDEX I2B2DEMODATA.FACT_CNPT_PAT_ENCT_IDX REBUILD;
ALTER INDEX I2B2DEMODATA.PD_IDX_ALLPATIENTDIM REBUILD;
ALTER INDEX I2B2DEMODATA.PATIENT_DIMENSION_INDEX1 REBUILD;
ALTER INDEX I2B2DEMODATA.PATIENT_TRIAL_INDEX1 REBUILD;
ALTER INDEX I2B2DEMODATA.IDX_CONCEPT_DIM3 REBUILD;
ALTER INDEX I2B2DEMODATA.IDX_CONCEPT_DIM_1 REBUILD;
ALTER INDEX I2B2METADATA.I2B2_S_IDX1 REBUILD;
commit;
begin
DBMS_OUTPUT.put_line('Make sure the numerical values are treated correctly for top node $1');
end;
/
update i2b2metadata.i2b2
set c_columndatatype='T'
where c_fullname like '$1%';
commit;
begin
DBMS_OUTPUT.put_line('Avoid a bug with double slashes');
end;
/
delete from i2b2metadata.i2b2 where c_fullname='\\';
delete from i2b2metadata.i2b2_secure where c_fullname='\\';
delete from I2B2METADATA.TABLE_ACCESS where c_fullname='\\';
commit;
exit;
EOF

11 changes: 11 additions & 0 deletions Kettle-GPL/ETL/Loading/generate_load_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## Generate Study
STUDY=''



## Adming Configs
LOADING_DIR=$KETTLE_LOADING_DIR

mkdir -p ${LOADING_DIR}/'Registries'/${STUDY}/{'data'/{'datafiles','raw_datafiles'},'logs'}
cp "ScriptRepo/Load_Study.sh" "${LOADING_DIR}/${STUDY}_Load.sh"
cp "ScriptRepo/resolve_db_issues.sh" "${LOADING_DIR}/${STUDY}_resolve_db_issues.sh"
50 changes: 36 additions & 14 deletions Kettle-GPL/Kettle-ETL/check_gene_expression_filenames.ktr
Original file line number Diff line number Diff line change
@@ -104,44 +104,44 @@
</notepad>
</notepads>
<connection>
<name>BIOMART</name>
<server>${BIOMART_DB_SERVER}</server>
<name>SQLLDR_LZ</name>
<server>${TM_LZ_DB_SERVER}</server>
<type>ORACLE</type>
<access>Native</access>
<database>${BIOMART_DB_NAME}</database>
<port>${BIOMART_DB_PORT}</port>
<username>${BIOMART_DB_USER}</username>
<password>${BIOMART_DB_PWD}</password>
<database>${TM_LZ_DB_NAME}</database>
<port>${TM_LZ_DB_PORT}</port>
<username>${TM_LZ_DB_USER}</username>
<password>${TM_LZ_DB_PWD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${BIOMART_DB_PORT}</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${TM_LZ_DB_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>SUPPORTS_BOOLEAN_DATA_TYPE</code><attribute>N</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>DEAPP</name>
<server>${DEAPP_DB_SERVER}</server>
<name>TM_BIOMART</name>
<server>${TM_BIOMART_DB_SERVER}</server>
<type>ORACLE</type>
<access>Native</access>
<database>${DEAPP_DB_NAME}</database>
<port>${DEAPP_DB_PORT}</port>
<username>${DEAPP_DB_USER}</username>
<password>${DEAPP_DB_PWD}</password>
<database>${TM_BIOMART_DB_NAME}</database>
<port>${TM_BIOMART_DB_PORT}</port>
<username>${TM_BIOMART_DB_USER}</username>
<password>${TM_BIOMART_DB_PWD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${DEAPP_DB_PORT}</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${TM_BIOMART_DB_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>SUPPORTS_BOOLEAN_DATA_TYPE</code><attribute>N</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
@@ -191,6 +191,28 @@
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>Transmart</name>
<server>${TRANSMART_DB_SERVER}</server>
<type>ORACLE</type>
<access>Native</access>
<database>${TRANSMART_DB_NAME}</database>
<port>${TRANSMART_DB_PORT}</port>
<username>${TRANSMART_DB_USER}</username>
<password>${TRANSMART_DB_PWD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${TRANSMART_DB_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>SUPPORTS_BOOLEAN_DATA_TYPE</code><attribute>N</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<order>
<hop> <from>Group by</from><to>Write to log</to><enabled>Y</enabled> </hop> <hop> <from>Write to log</from><to>Set Variables</to><enabled>Y</enabled> </hop> <hop> <from>Get File Names</from><to>Select values</to><enabled>Y</enabled> </hop> <hop> <from>Select values</from><to>Group by</to><enabled>Y</enabled> </hop> </order>
<step>
5 changes: 5 additions & 0 deletions Kettle-GPL/Kettle-ETL/create_clinical_data.kjb
Original file line number Diff line number Diff line change
@@ -66,6 +66,11 @@
<default_value>x</default_value>
<description>Name of word mapping file (optional)</description>
</parameter>
<parameter>
<name>FACT_SET</name>
<default_value>x</default_value>
<description>Required, The name that gets put into Observation Fact. This will be used to partition the data for easy loading/reloading.</description>
</parameter>
</parameters>
<connection>
<name>BIOMART</name>
5 changes: 5 additions & 0 deletions Kettle-GPL/Kettle-ETL/load_gene_expression_data.kjb
Original file line number Diff line number Diff line change
@@ -96,6 +96,11 @@
<default_value/>
<description/>
</parameter>
<parameter>
<name>FACT_SET</name>
<default_value>DEFAULT</default_value>
<description>Required, The name that gets put into Observation Fact. This will be used to partition the data for easy loading/reloading.</description>
</parameter>
</parameters>
<connection>
<name>BIOMART</name>
Loading