bipartite graph data pre-processing

The raw downloaded data is messy and the node ids are not contiguous, so this preprocessing script re-encodes the ids as consecutive integers, which allows the data to be loaded into memory faster.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import random
import argparse

# Absolute path of the current working directory with a trailing "/",
# captured once when the module is loaded.
filepath = os.getcwd() + "/"


def read_files(filename: str, sep: str, skip: int) -> list:
    """
    Read an edge list from a text file.

    Every kept line is split on ``sep``. The first two fields are converted
    to ``int``; any remaining fields are kept as strings, in their original
    order. Blank lines are ignored.

    :param filename: path of the input file; relative paths (including the
        ``./name`` form) are resolved against the current working directory
    :param sep: field separator handed to ``str.split``
    :param skip: number of leading lines (e.g. a header) to discard
    :return: list of rows shaped like ``[u, v, extra_1, extra_2, ...]``
    :raises ValueError: if one of the first two fields is not an integer
    :raises IndexError: if a non-blank line has fewer than two fields
    """
    # abspath copes with "./name", "name", "dir/name" and absolute paths
    # alike, instead of blindly dropping the first two characters.
    path = os.path.abspath(filename)

    data = []
    with open(path, 'r') as handle:
        for line in handle.readlines()[skip:]:
            # Remove the line terminator so it is not glued to the last
            # field, and ignore blank lines (e.g. a trailing empty line).
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split(sep)
            data.append([int(fields[0]), int(fields[1])] + fields[2:])

    return data


def parse_data(data):
    """
    Re-encode the node ids of an edge list as consecutive integers.

    The ids found in column 0 and the ids found in column 1 are numbered
    independently, each starting at 0, in order of first appearance.

    :param data: rows shaped like ``[u, v, extra_1, extra_2, ...]``
    :return: tuple ``(u_count, v_count, edge_count, text)`` where ``text``
        contains one newline-terminated line per row: the encoded ``u`` and
        ``v`` ids followed by the row's extra fields, separated by single
        spaces
    """
    u_node = {}
    v_node = {}
    lines = []

    for row in data:
        # setdefault hands out the next free id the first time a node is seen
        # and returns the already assigned id afterwards.
        u = u_node.setdefault(row[0], len(u_node))
        v = v_node.setdefault(row[1], len(v_node))

        fields = [str(u), str(v)]
        fields.extend(str(extra) for extra in row[2:])
        # Joining all fields puts a separator between the ids and the extra
        # columns; rstrip drops a line terminator that may still be attached
        # to the last extra field so no blank lines are emitted.
        lines.append(" ".join(fields).rstrip() + "\n")

    # Collect the pieces once instead of growing a string with "+=".
    return len(u_node), len(v_node), len(lines), "".join(lines)


def store_file(filename, data):
    """
    Write the encoded graph to ``<filename>.e`` and ``<filename>.meta``.

    The ``.e`` file receives ``data[3]`` verbatim; the ``.meta`` file
    receives ``data[0]``, ``data[1]`` and ``data[2]``, one value per line.

    :param filename: output base name, appended to the module-level
        ``filepath`` prefix
    :param data: indexable with four items, in the order returned by
        ``parse_data``
    :return: None
    """
    # Plain concatenation is kept on purpose: a name with a leading "/" still
    # ends up under ``filepath`` rather than at the filesystem root.
    meta = filepath + filename + '.meta'
    edge = filepath + filename + '.e'

    # ``with`` closes each file even when a write raises.
    with open(edge, 'w') as edge_file:
        edge_file.write(data[3])

    with open(meta, 'w') as meta_file:
        meta_file.write("{}\n{}\n{}\n".format(data[0], data[1], data[2]))


def _default_outfile(filename):
    """Derive an output base name from the input file name."""
    parts = str(filename).split('.')
    # Existing default, kept so that current invocations produce the same
    # output files: first three characters of the text after the first '.'.
    legacy = parts[1][:3] if len(parts) > 1 else ""
    if legacy.strip("/"):
        return legacy
    # The name has no '.' (previously an IndexError) or the existing rule
    # yields nothing usable: fall back to the input's base name without
    # its extension.
    return os.path.splitext(os.path.basename(str(filename)))[0]


def execute(args):
    """
    Run the pipeline: read the input, encode it, and store the result.

    :param args: namespace providing ``filename``, ``sep``, ``out`` and
        ``skip``; when ``out`` is None an output name is derived from
        ``filename``
    :return: None
    """
    filename = args.filename
    outfile = args.out

    if outfile is None:
        outfile = _default_outfile(filename)

    # ``skip`` may arrive as a string from the command line.
    data = read_files(filename, args.sep, int(args.skip))
    parsed_data = parse_data(data)
    store_file(outfile, parsed_data)


if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them, then run
    # the pipeline on the parsed arguments.
    cli = argparse.ArgumentParser(description='Processing csv data')
    option_specs = (
        ('--filename', dict(help='Please add file name', required=True)),
        ('--sep', dict(help='Separator', default=' ')),
        ('--out', dict(help='output file name', default=None)),
        ('--skip', dict(help='how many lines need to be skiped', default=1)),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    args = cli.parse_args()

    print("Start processing: ", args.filename)
    execute(args)
    print("Finished")

How to use it

You can get help information by using --help.

1
2
3
4
5
6
7
8
9
10
usage: data.py [-h] --filename FILENAME [--sep SEP] [--out OUT] [--skip SKIP]

Processing csv data

optional arguments:
-h, --help show this help message and exit
--filename FILENAME Please add file name
--sep SEP Separator
--out OUT output file name
--skip SKIP how many lines need to be skiped

The default value of --sep is a single space, and the default value of --skip is 1.

----- End Thanks for reading-----