Different methods of normalizing data:

import numpy as np

# Example data: random floats in [0, 1) with shape (3, 2).
a = np.random.rand(3, 2)

# Peak-to-peak range (same value as np.ptp(a)).
span = a.max() - a.min()

# Normalised to [0, 1]: minimum maps to 0, maximum to 1.
b = (a - a.min()) / span

# Normalised to [0, 255] as integers. The outer parentheses matter:
# .astype(int) must apply to the whole scaled expression, not just np.ptp.
c = (255 * (a - a.min()) / span).astype(int)

# Normalised to [-1, 1].
d = 2.0 * (a - a.min()) / span - 1

For scaling that also normalises the standard deviation (z-score standardisation), use sklearn's StandardScaler rather than the min/max formulas above.

Write a function:

def normalize_columns(arr):
    """Scale each column of *arr* in place so its max absolute value is 1.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D float array, modified in place (an integer dtype would fail the
        in-place true division). All-zero columns are left untouched to
        avoid a 0/0 that would fill the column with NaN.

    Returns
    -------
    None
    """
    _, cols = arr.shape
    # range(), not Python-2 xrange(); the note originally lost its indentation.
    for col in range(cols):
        peak = abs(arr[:, col]).max()
        if peak:  # skip all-zero columns instead of dividing by zero
            arr[:, col] /= peak

--

--

Merging is simple enough with two DataFrames; with several, concat is simpler.

Remember to wrap the DataFrames in a list [], and pass axis = 1 so they are bound column-wise rather than stacked as rows.

# Column-bind the four CEA DataFrames: the list [] is required by pd.concat,
# and axis=1 joins them side by side on the aligned index instead of
# stacking rows.
df = pd.concat([cea_sample_97, cea_b_103, cea_model_only_90, cea_model_MV_sites_102], axis = 1)

--

--

Read in SQL db from the Audit pack:

# Create a SQL connection to our SQLite database.
# NOTE(review): curly "smart quotes" in the original notes were replaced with
# straight quotes so the snippet is valid Python.
con = sqlite3.connect('Audit_Pack_DataAggregation_220705.db')

# Select one of the tables from the db. The table name starts with a digit,
# so it must be double-quoted as a SQL identifier. No explicit cursor is
# needed — pandas manages its own (the original's unused cursor was removed).
df = pd.read_sql_query('SELECT * FROM "0-sites_data";', con)

# Close the connection once the data is in the DataFrame.
con.close()

df

--

--

# Join cores & lats & longs for 7 properties.
# Note: check whether the data is read from the 30 cm layer or the 120 cm layer.
# NOTE(review): curly quotes and em dashes from the original notes were
# replaced with straight quotes/hyphens. Confirm the OneDrive folder name
# ("OneDrive - McCosker Contracting Pty Ltd") against the real path.
BASE = 'C:/Users/MelZeppel/OneDrive - McCosker Contracting Pty Ltd/ML_development/'

for p in range(len(property)):

    # Read in the covariate file and correct the truncated column heading.
    cov_df = pd.read_csv(BASE + '2022_covariates_QGIS/' + property.iloc[p] + '_ML_covariates_2208.csv')
    cov_df = cov_df.rename(columns={'core_numbe': 'core_number'})

    property_name = cov_df['property_n'].iloc[0]
    print(property_name)

    layer_30 = pd.read_csv(BASE + 'carbon_point_join_files/point_join_' + property_name + '_30_no_nulls.csv')

    # Join and export files.
    df = cov_df.merge(layer_30, on='core_number', how='left')
    df = df.rename(columns={'cea_name_x': 'cea_name', 'sampling_r': 'sampling_round'})
    df = df.drop(columns=['cea_name_y'])  # a list is clearer than the original set literal

    # str() works for both numpy scalars and plain Python numbers,
    # unlike the original scalar .astype(str).
    lower_depth = str(df['lower_depth'].iloc[0])
    print(df.head())

    # df = df[['property_name', 'lower_depth', 'cea_name', 'strata_name', 'sampling_round',
    #          'core_number', 'actual_latitude', 'actual_longitude', 'core_carbon_mass']]

    df.to_csv(BASE + '2022_covariates_QGIS/' + property.iloc[p] + lower_depth + '_ML_covariates_2208.csv')

--

--

# Read in the Mullion FlintPro sqlite db.
# NOTE(review): the original opened TWO connections to the same file ("conn"
# was never used or closed — a resource leak); one connection suffices.
con = sqlite3.connect("Simplified_RothC_DB.sqlite")

# Pull one of the tables into a DataFrame (pandas manages its own cursor,
# so the explicit cur = con.cursor() was unnecessary).
df = pd.read_sql_query('SELECT * FROM soil_inputs;', con)

# Close the connection.
con.close()

df

--

--

Column headings and things to be aware of

File 1 — Strata

# File 1 (Strata): standardise the column headings.
# NOTE(review): curly quotes restored to straight quotes so this is valid Python.
df = df.rename(columns={
    "name": "strata_name",
    "round.area.name": "cea",
    "round_name": "period",
    "area_ha": "strata_area_ha",
    "round.area.area_ha": "cea_area_ha",
})

File 6 — core sublayers

# File 6 (core sublayers): read and rename.
# NOTE(review): the original was missing the "=" in rename(columns=...) and
# used curly quotes; both fixed.
df = pd.read_excel('6-core-sublayer-' + property.iloc[4] + '.xlsx')
df = df.rename(columns={
    "s6_socl_core_sublayer_carbon_mass": "sub_lay_carbon_mass",
    "cum_sublayer_carbon_mass": "cum_sub_lay_carbon_mass",
    # The source column really does appear misspelt as "stata_name" — TODO confirm.
    "stata_name": "strata_name",
})
# NOTE: sublayers look like 1 & 2 in the initial view, but T1 has 23 layers.
# Selecting 'layer' == 2 will not mean what one expects for T1.

File 11 — core level

# File 11 (core level): read the per-core report layer and rename
# (curly quotes from the original notes replaced with straight quotes).
df = pd.read_csv('11-core-report-layer-' + property.iloc[4] + '.csv')
df = df.rename(columns={"e53_soci_core_report_layer_esm_carbon_mass": "carbon_mass",
                        "round_name": "period"})

File 13 — CEA round comparison

# File 13 (CEA round comparison).
# NOTE(review): the original relied on implicit adjacent-string concatenation
# ('-' 'ALBE'); an explicit "+" is clearer, and curly quotes were fixed.
df = pd.read_excel('13-report-layer-cea-round' + '-' + 'ALBE' + '.xlsx')
df = df.rename(columns={'e60_soccea_mean_cea_soc_stock': 'carbon_mass',
                        'cea_name': 'cea',
                        'round_name': 'period'})

--

--

# Copy each property's core-sublayer workbook from the audited pack into the
# unsplining folder, tagging the output filename with the property name.
# (Curly quotes and lost indentation from the original notes repaired.)
for p in range(len(property)):
    src = ('C:/Python39/Projects2/Audit_pack/Property_Standard_Mass_3000Tha/'
           + property.iloc[p] + '-auditing/6-core-sublayer.xlsx')
    dst = ('C:/Python39/Projects2/Audit_pack/Unsplining_T1_3000Tha/6-core-sublayer-'
           + property.iloc[p] + '.xlsx')
    df = pd.read_excel(src)
    df.to_excel(dst)

--

--

# Split the T0 topsoil sublayer of core 15353 into six 5 cm layers, with the
# carbon mass divided equally across the 30 cm depth.
# NOTE(review): rewritten from DataFrame.append (removed in pandas 2.x) to
# list accumulation + pd.concat. The original prepended each row and then
# sorted, so the sorted result here is identical.

pieces = []
for i in range(6):
    # One copy of the 0-30 cm sublayer row for this core; .copy() avoids
    # SettingWithCopy warnings when the new columns are assigned below.
    row_15 = df_T0[(df_T0.core_number == 15353) & (df_T0.sublayer_number == 1)].copy()

    # Equal share of the sublayer carbon mass for each 5 cm slice.
    # NOTE(review): the column is named "...number" but holds a mass — confirm.
    row_15['5cm_sublayer_number'] = row_15['sub_lay_carbon_mass'] / 6
    row_15['sublayer_number'] = row_15['sublayer_number'] + i
    row_15['T0_layer'] = 1  # flag so later filters keep only the topsoil

    print('loop', i + 1)  # plain print: the original f-string prefix did nothing
    # print(row_15)
    pieces.append(row_15)

final_df = pd.concat(pieces)
# Assign the sorted result — the original discarded sort_values' return value.
final_df = final_df.sort_values(['sublayer_number'])

--

--