ufal.udpipe

Bindings to UDPipe library

mpl-2.0 9 个版本
Milan Straka <straka@ufal.mff.cuni.cz>
安装
pip install ufal.udpipe
poetry add ufal.udpipe
pipenv install ufal.udpipe
conda install ufal.udpipe
描述

ufal.udpipe

The ufal.udpipe package is a Python binding to the UDPipe library (http://ufal.mff.cuni.cz/udpipe).

The bindings are a straightforward conversion of the C++ bindings API. In Python 2, strings can be both unicode and UTF-8 encoded str, and the library always produces unicode. In Python 3, strings must be only str.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/udpipe/api-reference.

::

Helper Structures

typedef vector<int> Children;

typedef vector<uint8_t> Bytes;

typedef vector<string> Comments;

class ProcessingError {
public:
  bool occurred();
  string message;
};

class Token {
 public:
  string form;
  string misc;

  Token(const string& form = string(), const string& misc = string());

  // CoNLL-U defined SpaceAfter=No feature
  bool getSpaceAfter() const;
  void setSpaceAfter(bool space_after);

  // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
  string getSpacesBefore() const;
  void setSpacesBefore(const string& spaces_before);
  string getSpacesAfter() const;
  void setSpacesAfter(const string& spaces_after);
  string getSpacesInToken() const;
  void setSpacesInToken(const string& spaces_in_token);

  // UDPipe-specific TokenRange feature
  bool getTokenRange() const;
  size_t getTokenRangeStart() const;
  size_t getTokenRangeEnd() const;
  void setTokenRange(size_t start, size_t end);
};

class Word : public Token {
 public:
  // form and misc are inherited from token
  int id;         // 0 is root, >0 is sentence word, <0 is undefined
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  int head;       // head, 0 is root, <0 is undefined
  string deprel;  // dependency relation to the head
  string deps;    // secondary dependencies

  Children children;

  Word(int id = -1, const string& form = string());
};
typedef vector<Word> Words;

class MultiwordToken : public Token {
 public:
  // form and misc are inherited from token
  int idFirst, idLast;

  MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string());
};
typedef vector<MultiwordToken> MultiwordTokens;

class EmptyNode {
 public:
  int id;          // 0 is root, >0 is sentence word, <0 is undefined
  int index;       // index for the current id, should be numbered from 1, 0=undefined
  string form;     // form
  string lemma;    // lemma
  string upostag;  // universal part-of-speech tag
  string xpostag;  // language-specific part-of-speech tag
  string feats;    // list of morphological features
  string deps;     // secondary dependencies
  string misc;     // miscellaneous information

  EmptyNode(int id = -1, int index = 0) : id(id), index(index) {}
};
typedef vector<EmptyNode> EmptyNodes;

class Sentence {
 public:
  Sentence();

  Words words;
  MultiwordTokens multiwordTokens;
  EmptyNodes emptyNodes;
  Comments comments;
  static const string rootForm;

  // Basic sentence modifications
  bool empty();
  void clear();
  virtual Word& addWord(const char* form);
  void setHead(int id, int head, const string& deprel);
  void unlinkAllWords();

  // CoNLL-U defined comments
  bool getNewDoc() const;
  string getNewDocId() const;
  void setNewDoc(bool new_doc, const string& id = string());
  bool getNewPar() const;
  string getNewParId() const;
  void setNewPar(bool new_par, const string& id = string());

  string getSentId() const;
  void setSentId(const string& id);
  string getText() const;
  void setText(const string& text);
};
typedef vector<Sentence> Sentences;

Main Classes

class InputFormat {
 public:
  virtual void resetDocument(const string& id = string());
  virtual void setText(const char* text);
  virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr);

  static InputFormat* newInputFormat(const string& name);
  static InputFormat* newConlluInputFormat(const string& id = string());
  static InputFormat* newGenericTokenizerInputFormat(const string& id = string());
  static InputFormat* newHorizontalInputFormat(const string& id = string());
  static InputFormat* newVerticalInputFormat(const string& id = string());

  static InputFormat* newPresegmentedTokenizer(InputFormat* tokenizer);

  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
  static const string GENERIC_TOKENIZER_PRESEGMENTED;
  static const string GENERIC_TOKENIZER_RANGES;
};

class OutputFormat {
 public:
  virtual string writeSentence(const Sentence& s);
  virtual string finishDocument();

  static OutputFormat* newOutputFormat(const string& name);
  static OutputFormat* newConlluOutputFormat(const string& options = string());
  static OutputFormat* newEpeOutputFormat(const string& options = string());
  static OutputFormat* newMatxinOutputFormat(const string& options = string());
  static OutputFormat* newHorizontalOutputFormat(const string& options = string());
  static OutputFormat* newPlaintextOutputFormat(const string& options = string());
  static OutputFormat* newVerticalOutputFormat(const string& options = string());

  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string HORIZONTAL_PARAGRAPHS;
  static const string PLAINTEXT_NORMALIZED_SPACES;
  static const string VERTICAL_PARAGRAPHS;
};

class Model {
 public:
  static Model* load(const char* fname);

  virtual InputFormat* newTokenizer(const string& options) const;
  virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
  virtual bool parse(Sentence& s, const string& options, ProcessingError* error) const;

  static const string DEFAULT;
  static const string TOKENIZER_PRESEGMENTED;
};

class Pipeline {
 public:
  Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output);

  void setModel(const Model* m);
  void setInput(const string& input);
  void setTagger(const string& tagger);
  void setParser(const string& parser);
  void setOutput(const string& output);

  void setImmediate(bool immediate);
  void setDocumentId(const string& document_id);

  string process(const string& data, ProcessingError* error = nullptr) const;

  static const string DEFAULT;
  static const string NONE;
};

class Trainer {
 public:

  static Bytes* train(const string& method, const Sentences& train, const Sentences& heldout,
                      const string& tokenizer, const string& tagger, const string& parser,
                      ProcessingError* error = nullptr);

  static const string DEFAULT;
  static const string NONE;
};

class Evaluator {
 public:
  Evaluator(const Model* m, const string& tokenizer, const string& tagger, const string& parser);

  void setModel(const Model* m);
  void setTokenizer(const string& tokenizer);
  void setTagger(const string& tagger);
  void setParser(const string& parser);

  string evaluate(const string& data, ProcessingError* error = nullptr) const;

  static const string DEFAULT;
  static const string NONE;
};

class Version {
 public:
  unsigned major;
  unsigned minor;
  unsigned patch;
  string prerelease;

  // Returns current version.
  static Version current();
};

Examples

run_udpipe

Simple pipeline loading data (tokenizing on request), tagging, parsing and writing to specified output format::

import sys

from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) < 4:
    sys.stderr.write('Usage: %s input_format(tokenize|conllu|horizontal|vertical) output_format(conllu) model_file\n' % sys.argv[0])
    sys.exit(1)

sys.stderr.write('Loading model: ')
model = Model.load(sys.argv[3])
if not model:
    sys.stderr.write("Cannot load model from file '%s'\n" % sys.argv[3])
    sys.exit(1)
sys.stderr.write('done\n')

pipeline = Pipeline(model, sys.argv[1], Pipeline.DEFAULT, Pipeline.DEFAULT, sys.argv[2])
error = ProcessingError()

# Read whole input
text = ''.join(sys.stdin.readlines())

# Process data
processed = pipeline.process(text, error)
if error.occurred():
    sys.stderr.write("An error occurred when running run_udpipe: ")
    sys.stderr.write(error.message)
    sys.stderr.write("\n")
    sys.exit(1)
sys.stdout.write(processed)

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.