Cross-Read & -Write R, Py, Matlab, Binary Files

2016-06-01. Category & Tags: R, Binary, Mat, Matlab, Python, NumPy, Pandas

Note: feather-format is designed to transfer data between Py & R [stackoverflow, feather-doc].

.FE #

OBS: only for data.frame type, not even arrays.

py (feather-format) #

Requires: pip install feather-format. (OBS: feather-format NOT feather.)

write:

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, index=['one', 'two', 'three'])

import feather
feather.write_dataframe(df.reset_index(drop=True), 'df.fe')

(though the df is created by pandas)

read:

import feather
df = feather.read_dataframe('df.fe')

py-pandas (canNOT read) #

write: df.reset_index(drop=True).to_feather('df.fe'). Note that feather canNOT handle string-based index names; another solution is drop=False, so that the index becomes a regular column.

read: Not working: pd.read_feather('df.fe') triggers an error: “read_feather() got an unexpected keyword argument 'nthreads'”.

r #

Requires: install.packages('feather') (better to use non-IDE to install to avoid libs in use). Note: seems to also require R >= 3.5.

write:

library(feather)
write_feather(df, 'df.fe')

read:

library(feather)
df = read_feather('df.fe')

.BIN #

py-numpy #

import numpy as np

data = np.array([1.1, 2.2, -3.14])  # default dtype: float64
data

# tofile() dumps raw bytes only: dtype, shape and byte order are NOT stored,
# so the reader must be told the same dtype that was written.
data.tofile('test.bin')  # float64
# or, as float32 (separate file, so the float64 file above is not overwritten):
data.astype(np.float32).tofile('test_f32.bin')

# Read back from the same path with the matching dtype.
data = np.fromfile('test.bin', dtype='<f8')  # little-endian float64
data
data_f32 = np.fromfile('test_f32.bin', dtype='<f4')  # little-endian float32
data_f32

Meaning of dtype = '<f8':
<: little-endian, win-tel default.
f: float (default); i for int.
8: 4/8 bytes == 32/64 bits.
Endian and 32/64 bits default values depend on platforms.
ref: numpy.ndarray.tofile(), numpy.fromfile()

tip: part of .bin file:
My combined instructions in GitHub.
Separated instructions below:
read part of a file by seek()

import os

import numpy as np

# Read only a slice of a raw binary file: skip some items, then read `item_nr` items.
dtype = np.dtype(np.float32)  # element type stored in the file
item_size_bytes = dtype.itemsize  # 4: 32bits, 8: 64bit (derived from dtype so they cannot disagree)
seek_nr = the_number_of_floats_or_integeters_to_skip * item_size_bytes  # offset in BYTES, not items
item_nr = the_number_of_floats_or_integeters_to_read
file_read = '/dev/shm/ttt.bin'
# 'with' closes the file even if seek()/fromfile() raises, so no explicit close() is needed.
with open(file_read, 'rb') as p_file:
    p_file.seek(seek_nr, os.SEEK_SET)  # position relative to the start of the file
    t = np.fromfile(p_file, dtype=dtype, count=item_nr)
t

ref

append to a .bin file: ref

r #

tData = as.integer(2^(1:18))

# make sure what you are writing is the same format (e.g. integer) as reading.
pFileWrite = file('test_data/test.bin', 'wb')
writeBin(tData, pFileWrite, size=4, endian='little')
close(pFileWrite)

pFileRead = file('test_data/test.bin', 'rb')
t = readBin(pFileRead, n=18, what='integer', size=4, endian='little')
close(pFileRead)

# or use file.size():
file_input = 'test_data/test.bin'
pFileRead = file(file_input, 'rb')
t = readBin(pFileRead, n=file.size(file_input)/4, integer(), size=4, endian='little')
close(pFileRead)

For float32, use 'numeric' (with size=4):

tData = as.vector(as.single(c(1.1, 2.2, -3.45)))

pFileWrite = file('data.bak/test.bin', 'wb')
writeBin(tData, pFileWrite, size=4, endian='little')
close(pFileWrite)

pFileRead = file('data.bak/test.bin', 'rb')
t = readBin(pFileRead, n=18, what='numeric', size=4, endian='little')
close(pFileRead)

For float64, use ‘size=8’:

tData = as.vector(c(1.1, 2.2, -3.45))

pFileWrite = file('data.bak/test.bin', 'wb')
writeBin(tData, pFileWrite, size=8, endian='little')
close(pFileWrite)

pFileRead = file('data.bak/test.bin', 'rb')
t = readBin(pFileRead, n=18, what='numeric', size=8, endian='little')
close(pFileRead)

It seems that “as.single” is not necessary, we can use “size” to control 32/64bits.

matlab #

‘b’ as the machinefmt (third) argument of ‘fopen’ means big-endian (‘l’ means little-endian), e.g. fopen('test.bin', 'r', 'l').

c/cpp/c++ #

#include<iostream>
#include<fstream>
#include<string>
using namespace std;

// Interactive round-trip demo for raw binary int files:
//   1. ask for a count N and N integers on stdin,
//   2. write them as raw native-endian ints to "<N>.bin",
//   3. read the file back (size taken from the file itself) and print its contents.
// Returns 0 on success, 1 on invalid input or any file error.
int main() {
    int count_out;
    cout << "多少个数据：" << endl;
    if (!(cin >> count_out) || count_out < 0) {
        cerr << "error: expected a non-negative integer count" << endl;
        return 1;
    }
    string filename = to_string(count_out) + ".bin";
    int* p_out_array = new int[count_out];
    for (int index = 0; index < count_out; index++) {
        cout << "输入数据" << to_string(index) << ": ";
        cin >> p_out_array[index];
    }

    cout << "正在写文件 ..." << endl;
    ofstream fd_outfile;
    fd_outfile.open(filename, ios::out | ios::binary);
    if (!fd_outfile) {
        cerr << "error: cannot open " << filename << " for writing" << endl;
        delete[] p_out_array;
        return 1;
    }
    streamsize out_file_bytes = static_cast<streamsize>(count_out * sizeof(p_out_array[0]));
    fd_outfile.write(reinterpret_cast<const char*>(p_out_array), out_file_bytes);
    fd_outfile.close();
    delete[] p_out_array;  // written data is no longer needed; verification uses the file only

    cout << "读文件内容：" << endl;
    ifstream fd_infile;
    fd_infile.open(filename, ios::in | ios::binary);
    if (!fd_infile) {
        cerr << "error: cannot open " << filename << " for reading" << endl;
        return 1;
    }
    // Determine the file size by seeking to the end, then rewind.
    fd_infile.seekg(0, ios::end);
    streamsize in_file_bytes = static_cast<streamsize>(fd_infile.tellg());
    fd_infile.clear();  // reset any eof/fail state before seeking back
    fd_infile.seekg(0);
    cout << "文件bytes：" << in_file_bytes << endl;
    int count_in = static_cast<int>(in_file_bytes / static_cast<streamsize>(sizeof(int)));
    cout << "文件count：" << count_in << endl;
    int* p_in_array = new int[count_in];
    if (!fd_infile.read(reinterpret_cast<char*>(p_in_array), in_file_bytes)) {
        cerr << "error: failed to read " << filename << endl;
        delete[] p_in_array;
        return 1;
    }
    // Print what was actually read from the file (not the array that was written).
    for (int index = 0; index < count_in; index++) {
        cout << p_in_array[index] << endl;
    }
    fd_infile.close();
    delete[] p_in_array;

    return 0;
}

.NPY #

py-numpy #

or .npz for compressed

# binary
np.save('var_name.npy', var_name)
t = np.load('var_name.npy')

# txt
np.savetxt('var_name.txt', var_name, fmt='%d')
t = np.loadtxt('var_name.txt', dtype=int)

ref

.PKL #

py-pandas #

data_frame: #

# write:
my_data_df.to_pickle('my_data_df.pkl')
# read:
my_data_df = pd.read_pickle('my_data_df.pkl')

general: #

# One-liner form: the file objects returned by open() are never explicitly
# closed here, so prefer the 'with' form below.
# NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
pickle.dump(model_fitted, open('my_model.pkl', 'wb'))
loaded_model = pickle.load(open('my_model.pkl', 'rb'))

# OR (preferred): 'with' closes the file as soon as the block ends.
with open('test.pkl', 'wb') as p_file:
  pickle.dump(my_var, p_file)
with open('test.pkl', 'rb') as p_file:
  my_var = pickle.load(p_file)

.MAT #

r #

install.packages('R.matlab')
library('R.matlab')

# write
writeMat('fileName.mat',
         variableNameInMat01 = variable.1.in.r,
         variableNameInMat02 = variable.2.in.r
         )

# read: Returns a named "list" structure containing all variables
data = readMat('fileName.mat')
str(data)

ref

cite: Henrik Bengtsson (2016). R.matlab: Read and Write MAT Files and Call MATLAB from
Within R. R package version 3.6.0-9000. https://github.com/HenrikBengtsson/R.matlab

py #

// see solo-cuda: http://localhost:8889/notebooks/impute-5-convert-to-r.ipynb
TODO.
Suggest: use .csv as a middle file.

.CSV #

r #

# write:
# comma-separated, with a header row, without row names:
write.table(MyDataDf, file='MyData.csv', sep=',', row.names=FALSE)
# same, but missing values are written as empty strings and no header row is written:
write.table(MyDataDf, file='MyData.csv', sep=',', row.names=FALSE, na='', col.names=FALSE)
#write.csv(MyDataDf, file='MyData.csv', sep=' ', row.names=FALSE) # does NOT work !!! write.csv fixes sep to ',' and ignores (with a warning) any attempt to change it; use write.table for other separators.

# read:
read.csv('MyData.csv') # WARN !!! OBS: in R < 4.0 strings/chars are read as factors (levels, internally int); pass stringsAsFactors=FALSE to keep them as character (the default since R 4.0.0).

py-pandas #

# write:
df.to_csv('my_data_df.csv', index=False) # do NOT save row indexes
df.to_csv('my_data_df.csv', index=False, sep='\t', encoding='utf-8')

# read:
pd.read_csv('my_data_df.csv') # sep is comma by default

ref overflow
ref doc

W/R .NPY in R #

Suggested: feather-format.

see also: R 和 Python 能做出什么新花样？

install.packages('RcppCNPy')
library(RcppCNPy)
variableFoo = npyLoad('fileName.npy') # read crap, NOT working !!!
variableFoo

ref

W/R .RDATA in Py #

Dated content below, see feather-format above.

see also: R 和 Python 能做出什么新花样？

Write:
rpy2 ( > v3.0 ) TODO.

~~rpy2 < v3.0~~
(Dated API)

from rpy2.robjects import r, pandas2ri
pandas2ri.activate() # important !!!

import numpy as np
import pandas as pd

# create
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, index=['one', 'two', 'three'])

# convert
r_dataframe = pandas2ri.py2ri(df)
print(type(r_dataframe))
print(r_dataframe)

# save
r.assign(r_variable_name_string, r_dataframe)
r("save(" + r_variable_name_string + ", file='foo.bar.rdata', compress=TRUE)")

// see solo-cuda: http://localhost:8889/notebooks/impute-5.2-newAPI-convert-to-r.ipynb

Read:
TODO.

HDFS in Py-Pandas #

(not tested, and should be deprecated by Ceph soon)

ref 1

better ref2