Cross-Read & -Write R, Py, Matlab, Binary Files
Note: feather-format is designed to transfer data between Py & R [stackoverflow, feather-doc].
.FE #
OBS: only for data.frame type, not even arrays.
py (feather-format) #
Requires: pip install feather-format
(OBS: the package name is feather-format, NOT feather.)
write:
# Write a pandas DataFrame to a feather file with the `feather` package.
import numpy as np
import pandas as pd
# Example frame with a string-based index ('one', 'two', 'three').
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, index=['one', 'two', 'three'])
import feather
# reset_index(drop=True) discards the string index and replaces it with the
# default integer index before writing.
feather.write_dataframe(df.reset_index(drop=True), 'df.fe')
(though the df
is created by pandas)
read:
# Read the feather file back into a DataFrame.
import feather
df = feather.read_dataframe('df.fe')
py-pandas (canNOT read)
#
write: df.reset_index(drop=True).to_feather('df.fe')
. Note that feather canNOT handle string-based index names; another solution is drop=False, in which case the index becomes a regular column.
read: Not working: pd.read_feather('df.fe')
triggers an error: "read_feather() got an unexpected keyword argument 'nthreads'".
r #
Requires: install.packages('feather')
(better to install outside an IDE to avoid libs in use). Note: seems to also require R >= 3.5.
write:
# Write the data.frame `df` to a feather file.
library(feather)
write_feather(df, 'df.fe')
read:
# Read the feather file back into `df`.
library(feather)
df = read_feather('df.fe')
.BIN #
py-numpy #
# Write and read a raw binary file with numpy. tofile() stores only the raw
# values (no header), so the reader must supply the dtype and byte order.
import numpy as np
data = np.array([1.1, 2.2, -3.14]) # default, float64
data
data.tofile('test.bin') # default, float64
# or (overwrites the same file with 4-byte values):
data.astype(np.float32).tofile('test.bin') # convert to float32
# NOTE(review): this reads 'data.bak/test.bin', not the 'test.bin' written
# above -- presumably a float64 file produced elsewhere; confirm the path.
data = np.fromfile('data.bak/test.bin', '<f8') # little-endian float64
data
Meaning of dtype = '<f8'
:
<
: little-endian, win-tel default.
f
: float (default); i
for int.
8
: 4/8 bytes == 32/64 bits.
Endian and 32/64 bits default values depend on platforms.
ref: numpy.ndarray.tofile(), numpy.fromfile()
tip: part of .bin file:
My combined instructions in GitHub.
Separated instructions below:
read part of a file by seek()
# Read a slice of a raw binary file: skip a number of items, then read a
# number of items, without loading the whole file.
import os
import numpy as np

# Derive the item size from the dtype so the byte offset always matches the
# element type that is actually read (4 bytes for float32, 8 for float64).
item_dtype = np.dtype(np.float32)
item_size_bytes = item_dtype.itemsize
seek_nr = the_number_of_floats_or_integeters_to_skip * item_size_bytes
item_nr = the_number_of_floats_or_integeters_to_read
file_read = '/dev/shm/ttt.bin'
# The context manager closes the file even if seek/fromfile raises.
with open(file_read, 'rb') as p_file:
    p_file.seek(seek_nr, os.SEEK_SET)  # absolute offset from file start
    t = np.fromfile(p_file, dtype=item_dtype, count=item_nr)
t
append to a .bin file: ref
r #
# Round-trip 18 integers (2^1 .. 2^18) through a raw little-endian binary file.
tData = as.integer(2^(1:18))
# make sure what you are writing is the same format (e.g. integer) as reading.
pFileWrite = file('test_data/test.bin', 'wb')
writeBin(tData, pFileWrite, size=4, endian='little')
close(pFileWrite)
# Read back: n is the number of items to read, size is bytes per item.
pFileRead = file('test_data/test.bin', 'rb')
t = readBin(pFileRead, n=18, what='integer', size=4, endian='little')
close(pFileRead)
# or use file.size() to compute the item count (bytes / 4 bytes per item):
file_input = 'test_data/test.bin'
pFileRead = file(file_input, 'rb')
t = readBin(pFileRead, n=file.size(file_input)/4, integer(), size=4, endian='little')
close(pFileRead)
For float32, use 'numeric' (with size=4):
# Round-trip three values as 4-byte (float32) little-endian numbers.
tData = as.vector(as.single(c(1.1, 2.2, -3.45)))
pFileWrite = file('data.bak/test.bin', 'wb')
writeBin(tData, pFileWrite, size=4, endian='little')
close(pFileWrite)
pFileRead = file('data.bak/test.bin', 'rb')
# n is only the maximum number of items to read; only 3 values were written
# above, so fewer than 18 are returned.
t = readBin(pFileRead, n=18, what='numeric', size=4, endian='little')
close(pFileRead)
For float64, use ‘size=8’:
# Round-trip three values as 8-byte (float64) little-endian numbers.
tData = as.vector(c(1.1, 2.2, -3.45))
pFileWrite = file('data.bak/test.bin', 'wb')
writeBin(tData, pFileWrite, size=8, endian='little')
close(pFileWrite)
pFileRead = file('data.bak/test.bin', 'rb')
# n is only the maximum number of items to read; only 3 values were written
# above, so fewer than 18 are returned.
t = readBin(pFileRead, n=18, what='numeric', size=8, endian='little')
close(pFileRead)
It seems that “as.single” is not necessary, we can use “size” to control 32/64bits.
matlab #
‘b’ argument of ‘fopen’ means big-endian.
c/cpp/c++ #
#include<iostream>
#include<fstream>
#include<string>
using namespace std;
int main() {
int count_out, * p_out_array, out_file_bytes;
cout << "多少个数据:" << endl;
cin >> count_out;
string filename = to_string(count_out) + ".bin";
p_out_array = new int[count_out];
for (int index = 0; index < count_out; index++) {
cout << "输入数据" << to_string(index) << ": ";
cin >> p_out_array[index];
}
cout << "正在写文件 ..." << endl;
ofstream fd_outfile;
fd_outfile.open(filename, ios::out | ios::binary);
out_file_bytes = count_out * sizeof(p_out_array[0]);
fd_outfile.write((char*)p_out_array, out_file_bytes);
fd_outfile.close();
int count_in, * p_in_array, in_file_bytes;
cout << "读文件内容:" << endl;
ifstream fd_infile;
fd_infile.open(filename, ios::in | ios::binary);
fd_infile.seekg(0, ios::end);
in_file_bytes = fd_infile.tellg();
fd_infile.clear(); //unset eof flag
fd_infile.seekg(0);
cout << "文件bytes:" << in_file_bytes << endl;
count_in = in_file_bytes / sizeof(int);
cout << "文件count:" << count_in << endl;
p_in_array = new int[count_in];
fd_infile.read((char*)p_in_array, in_file_bytes);
for (int index = 0; index < count_in; index++) {
cout << p_out_array[index] << endl;
}
fd_infile.close();
return 0;
}
.NPY #
py-numpy #
or .npz for compressed
# binary: save an array to .npy and load it back
np.save('var_name.npy', var_name)
t = np.load('var_name.npy')
# txt: fmt='%d' writes the values as integers; dtype=int reads them back as integers
np.savetxt('var_name.txt', var_name, fmt='%d')
t = np.loadtxt('var_name.txt', dtype=int)
.PKL #
py-pandas #
data_frame: #
# write: pickle the DataFrame to disk
my_data_df.to_pickle('my_data_df.pkl')
# read: load the pickled DataFrame back
my_data_df = pd.read_pickle('my_data_df.pkl')
general: #
# Pickle an arbitrary Python object to disk and load it back.
# SECURITY: unpickling can execute arbitrary code; only load files you trust.
import pickle

# One-liner form. NOTE: the file objects opened here are never closed
# explicitly; prefer the `with` form below, which closes them deterministically.
pickle.dump(model_fitted, open('my_model.pkl', 'wb'))
loaded_model = pickle.load(open('my_model.pkl', 'rb'))
# OR:
with open('test.pkl', 'wb') as p_file:
    pickle.dump(my_var, p_file)
with open('test.pkl', 'rb') as p_file:
    my_var = pickle.load(p_file)
.MAT #
r #
# Read/write MATLAB .mat files from R with the R.matlab package.
install.packages('R.matlab')
library('R.matlab')
# write: each named argument becomes a variable of that name in the .mat file
writeMat('fileName.mat',
variableNameInMat01 = variable.1.in.r,
variableNameInMat02 = variable.2.in.r
)
# read: Returns a named "list" structure containing all variables
data = readMat('fileName.mat')
# inspect the structure of the returned list
str(data)
cite: Henrik Bengtsson (2016). R.matlab: Read and Write MAT Files and Call MATLAB from
Within R. R package version 3.6.0-9000. https://github.com/HenrikBengtsson/R.matlab
py #
// see solo-cuda: http://localhost:8889/notebooks/impute-5-convert-to-r.ipynb
TODO.
Suggest: use .csv as a middle file.
.CSV #
r #
# write: comma-separated, without row names
write.table(MyDataDf, file='MyData.csv', sep=',', row.names=FALSE)
# same, but NA written as empty string and no header row
write.table(MyDataDf, file='MyData.csv', sep=',', row.names=FALSE, na='', col.names=FALSE)
#write.csv(MyDataDf, file='MyData.csv', sep=' ', row.names=FALSE) # do NOT use !!! write.csv always uses ',' as sep; a custom sep is ignored with a warning.
# read:
read.csv('MyData.csv') # WARN !!! OBS: in R < 4.0 strings/chars are read as factors (levels, internally int) unless stringsAsFactors=FALSE.
py-pandas #
# write:
df.to_csv('my_data_df.csv', index=False) # do NOT save row indexes
# alternative: tab-separated, explicit UTF-8 encoding (overwrites the same file)
df.to_csv('my_data_df.csv', index=False, sep='\t', encoding='utf-8')
# read (pass sep='\t' instead if the file was written tab-separated):
pd.read_csv('my_data_df.csv') # sep is comma by default
W/R .NPY in R
#
Suggested: feather-format.
see also: R 和 Python 能做出什么新花样?
# Attempt to read a NumPy .npy file from R with RcppCNPy.
install.packages('RcppCNPy')
library(RcppCNPy)
# NOTE(review): npyLoad presumably needs a type argument matching the array's
# dtype (e.g. type='integer' for integer arrays); a mismatch may explain the
# garbage values -- confirm against the RcppCNPy documentation.
variableFoo = npyLoad('fileName.npy') # read crap, NOT working !!!
variableFoo
W/R .RDATA in Py
#
Dated content below, see feather-format above.
see also: R 和 Python 能做出什么新花样?
Write:
rpy2 ( > v3.0 )
TODO.
rpy2 < v3.0
(Dated API)
# Save a pandas DataFrame as an .rdata file through rpy2 (legacy < v3.0 API).
from rpy2.robjects import r, pandas2ri
pandas2ri.activate() # important !!!
import numpy as np
import pandas as pd
# create
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, index=['one', 'two', 'three'])
# convert the pandas DataFrame to an R data.frame
r_dataframe = pandas2ri.py2ri(df)
print(type(r_dataframe))
print(r_dataframe)
# save
# NOTE: r_variable_name_string is not defined in this snippet; set it to the
# desired R variable name (a str) before running.
r.assign(r_variable_name_string, r_dataframe)
# Build and evaluate an R save() call that writes that variable to disk.
r("save(" + r_variable_name_string + ", file='foo.bar.rdata', compress=TRUE)")
// see solo-cuda: http://localhost:8889/notebooks/impute-5.2-newAPI-convert-to-r.ipynb
Read:
TODO.
HDFS in Py-Pandas #
(not tested, and should be deprecated by Ceph soon)
better ref2