Skip to content

API reference

build(config, corpus_data, all_elements, filter_file, out, output_format, include_private)

Run the pipeline extract -> filter -> export in one command.

Parameters:

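| Name | Type | Description | Default |
|------|------|-------------|---------|
| `config` | | Shared command configuration; `verbose` and `gl` are read from it. | required |
| `corpus_data` | | Corpus object handed to the extractor and the filter. | required |
| `all_elements` | `bool` | `--all-elements` / `-a` flag: get all elements available in the GitLab instance. | required |
| `filter_file` | `str` | `--filter-file` / `-f`: YAML file defining the filters to apply (CLI default `resources/filters.yaml`). | required |
| `out` | `str` | `--out` / `-o`: output file (CLI default `out/corpus.json`). | required |
| `output_format` | `str` | `--output-format` / `-F`: output format (CLI default `json`). | required |
| `include_private` | `bool` | `--include-private` / `-p` flag: also include GitLab projects with visibility private. | required |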
Source code in src/corpus/corpus.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
@cli.command()
@click.option(
    '--all-elements', '-a',
    is_flag=True,
    help='Get all elements available in the GitLab instance WARNING: This might take a long time and might '
         'cause problems for the server',
)
@click.option(
    '--filter-file', '-f',
    default='resources/filters.yaml',
    show_default=True,
    help='File in yaml format which defines the filters to be used on the corpus',
)
@click.option(
    '--out', '-o',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the output file',
)
@click.option(
    '--output-format', '-F',
    default='json',
    show_default=True,
    help='Specifies the output format',
)
@click.option(
    '--include-private', '-p',
    is_flag=True,
    help='If set, GitLab projects with visibility private will be included as well',
)
@corpus
@command_config
def build(config, corpus_data, all_elements, filter_file, out, output_format, include_private):
    """Run the pipeline extract -> filter -> export in one command.

    :param config: shared command configuration; ``config.verbose`` and ``config.gl`` are read here
        (presumably injected by the ``command_config`` decorator)
    :param corpus_data: corpus object handed to the extractor and the filter
        (presumably injected by the ``corpus`` decorator)
    :param all_elements: value of the ``--all-elements`` flag, forwarded to the extractor
    :param filter_file: path of the YAML file the filters are loaded from
    :param out: path of the file the exporter writes to
    :param output_format: format string passed to the exporter
    :param include_private: value of the ``--include-private`` flag, forwarded to the extractor

    """
    # Both stage objects are created up front, before any extraction runs.
    gitlab_extractor = Extractor(config.verbose, config.gl, corpus=corpus_data)
    filter_stage = Filter(config.verbose, corpus=corpus_data, from_file=False)

    # Stage 1: extract.
    gitlab_extractor.extract(all_elements=all_elements, include_private=include_private)

    # Stage 2: load the filter definitions, then apply them.
    filter_stage.load_filters(filter_file=filter_file)
    filter_stage.filter()

    # Stage 3: export the filtered corpus (not the raw one) in the requested format.
    Exporter(
        config,
        corpus=filter_stage.filtered_corpus,
        format_str=output_format,
        from_file=False,
    ).export(out=out)

cli(config, gl_config, neo4j_config, source, verbose)

Entry point to the corpus cli.

Parameters:

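| Name | Type | Description | Default |
|------|------|-------------|---------|
| `config` | | Shared command configuration; `gl`, `verbose` and `neo4j_config` are set on it. | required |
| `gl_config` | `str` | `--gl-config` / `-g`: path to the GitLab config file (CLI default `resources/gitlab.cfg`). | required |
| `neo4j_config` | `str` | `--neo4j-config` / `-n`: path to the Neo4J config file (CLI default `resources/neo4j.cfg`). | required |
| `source` | `str` | `--source` / `-s`: name of the GitLab instance to analyze, if not the default of your configuration. | required |
| `verbose` | `bool` | `--verbose` / `-v`: prints more output during execution. | required |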
Source code in src/corpus/corpus.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
@click.group()
@click.option('--gl-config', '-g', default='resources/gitlab.cfg',
              help='Path to the GitLab config file', show_default=True)
@click.option('--neo4j-config', '-n', default='resources/neo4j.cfg',
              help='Path to the Neo4J config file', show_default=True)
@click.option('--source', '-s',
              help='Name of the GitLab instance, you want to analyze, if not the default value of your configuration')
# is_flag turns -v into a plain switch, like the other boolean options of this CLI;
# with only default=False click expects an explicit value (e.g. ``-v true``).
@click.option('--verbose', '-v', is_flag=True, default=False,
              help='Prints more output during execution')
@command_config
def cli(config, gl_config, neo4j_config, source, verbose):
    """Entry point to the corpus cli.

    Stores the GitLab client, the verbosity setting and the loaded Neo4J
    configuration on ``config`` so the subcommands can read them.

    :param config: shared command configuration that is populated here
        (presumably injected by the ``command_config`` decorator)
    :param gl_config: path to the GitLab config file used to create the client
    :param neo4j_config: path to the Neo4J config file, read via ``load_neo4j_config``
    :param source: name of the GitLab instance to use; ``None`` when the option is omitted
    :param verbose: ``True`` when ``--verbose`` / ``-v`` was given

    """
    config.gl = gitlab.Gitlab.from_config(source, [gl_config])
    config.verbose = verbose
    config.neo4j_config = load_neo4j_config(neo4j_config)

export(config, corpus_data, input_file, out, output_format)

Export a previously extracted (and maybe filtered) corpus to another format.

Parameters:

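| Name | Type | Description | Default |
|------|------|-------------|---------|
| `config` | | Shared command configuration, passed on to the exporter. | required |
| `corpus_data` | | Corpus object handed to the exporter. | required |
| `input_file` | `str` | `--input-file` / `-i`: file to load the corpus from (CLI default `out/corpus.json`). | required |
| `out` | `str` | `--out` / `-o`: output file (CLI default `out/corpus.json`). | required |
| `output_format` | `str` | `--output-format` / `-F`: output format (CLI default `json`). | required |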
Source code in src/corpus/corpus.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
@cli.command()
@click.option(
    '--input-file', '-i',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the file to load the corpus from',
)
@click.option(
    '--out', '-o',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the output file',
)
@click.option(
    '--output-format', '-F',
    default='json',
    show_default=True,
    help='Specifies the output format',
)
@corpus
@command_config
def export(config, corpus_data, input_file, out, output_format):
    """Export a previously extracted (and maybe filtered) corpus to another format.

    :param config: shared command configuration, passed on to the exporter
        (presumably injected by the ``command_config`` decorator)
    :param corpus_data: corpus object handed to the exporter
        (presumably injected by the ``corpus`` decorator)
    :param input_file: path of the file the exporter loads the corpus from
    :param out: path of the file the exporter writes to
    :param output_format: format string passed to the exporter

    """
    # from_file=True together with file=... tells the exporter to read the corpus from disk.
    corpus_exporter = Exporter(
        config,
        corpus=corpus_data,
        format_str=output_format,
        from_file=True,
        file=input_file,
    )
    corpus_exporter.export(out=out)

extract(config, corpus_data, all_elements, out, include_private)

Extract projects from the specified GitLab instance and write the output to a file.

Parameters:

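| Name | Type | Description | Default |
|------|------|-------------|---------|
| `config` | | Shared command configuration; `verbose` and `gl` are read from it. | required |
| `corpus_data` | | Corpus object handed to the extractor and the exporter. | required |
| `all_elements` | `bool` | `--all-elements` / `-a` flag: get all elements available in the GitLab instance. | required |
| `out` | `str` | `--out` / `-o`: output file (CLI default `out/corpus.json`). | required |
| `include_private` | `bool` | `--include-private` / `-p` flag: also include GitLab projects with visibility private. | required |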
Source code in src/corpus/corpus.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@cli.command()
@click.option(
    '--all-elements', '-a',
    is_flag=True,
    help='Get all elements available in the GitLab instance WARNING: This might take a long time and might '
         'cause problems for the server',
)
@click.option(
    '--out', '-o',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the output file',
)
@click.option(
    '--include-private', '-p',
    is_flag=True,
    help='If set, GitLab projects with visibility private will be included as well',
)
@corpus
@command_config
def extract(config, corpus_data, all_elements, out, include_private):
    """Extract projects from the specified GitLab instance and write the output to a file.

    :param config: shared command configuration; ``config.verbose`` and ``config.gl`` are read here
        (presumably injected by the ``command_config`` decorator)
    :param corpus_data: corpus object shared by the extractor and the exporter
        (presumably injected by the ``corpus`` decorator)
    :param all_elements: value of the ``--all-elements`` flag, forwarded to the extractor
    :param out: path of the file the exporter writes to
    :param include_private: value of the ``--include-private`` flag, forwarded to the extractor

    """
    # Extractor and exporter are both built on the same corpus object before extraction starts.
    gitlab_extractor = Extractor(config.verbose, config.gl, corpus=corpus_data)
    # This command always exports as JSON; there is no --output-format option here.
    json_exporter = Exporter(config, corpus=corpus_data, format_str="json")

    gitlab_extractor.extract(
        all_elements=all_elements,
        include_private=include_private,
    )
    json_exporter.export(out=out)

filter(config, corpus_data, filter_file, input_file, out)

Apply filters on a previously extracted corpus.

Parameters:

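| Name | Type | Description | Default |
|------|------|-------------|---------|
| `config` | | Shared command configuration; `verbose` is read from it. | required |
| `corpus_data` | | Corpus object handed to the filter. | required |
| `filter_file` | `str` | `--filter-file` / `-f`: YAML file defining the filters to apply (CLI default `resources/filters.yaml`). | required |
| `input_file` | `str` | `--input-file` / `-i`: file to load the corpus from (CLI default `out/corpus.json`). | required |
| `out` | `str` | `--out` / `-o`: output file (CLI default `out/corpus.json`). | required |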
Source code in src/corpus/corpus.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@cli.command()
@click.option(
    '--filter-file', '-f',
    default='resources/filters.yaml',
    show_default=True,
    help='File in yaml format which defines the filters to be used on the corpus',
)
@click.option(
    '--input-file', '-i',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the file to load the corpus from',
)
@click.option(
    '--out', '-o',
    default='out/corpus.json',
    show_default=True,
    help='Specifies the output file',
)
@corpus
@command_config
def filter(config, corpus_data, filter_file, input_file, out):
    """Apply filters on a previously extracted corpus.

    :param config: shared command configuration; ``config.verbose`` is read here
        (presumably injected by the ``command_config`` decorator)
    :param corpus_data: corpus object handed to the filter
        (presumably injected by the ``corpus`` decorator)
    :param filter_file: path of the YAML file the filters are loaded from
    :param input_file: path of the file the filter loads the corpus from
    :param out: path of the file the exporter writes to

    """
    # from_file=True together with file=... tells the filter to read the corpus from disk.
    filter_stage = Filter(
        config.verbose,
        corpus=corpus_data,
        from_file=True,
        file=input_file,
    )
    filter_stage.load_filters(filter_file=filter_file)
    filter_stage.filter()

    # The filtered result is always written as JSON by this command.
    Exporter(config, corpus=filter_stage.filtered_corpus, format_str="json").export(out=out)