使用LTP进行分词及词性标注

1、下载并解压
https://github.com/HIT-SCIR/ltp/releases
一定要注意:LTP 默认以 UTF-8 编码输出,如果终端使用的不是 UTF-8 编码,直接输出到屏幕上会显示为乱码。

2、语法
要看文档来这里:http://ltp.readthedocs.org/zh_CN/latest/install.html

ltp_test in LTP 3.3.1 - (C) 2012-2015 HIT-SCIR
The console application for Language Technology Platform.

usage: ./ltp_test <options>

options:
  --threads arg           The number of threads [default=1].
  --last-stage arg        The last stage of analysis. This option can be used
                          when the user onlywants to perform early stage
                          analysis, like only segment without postagging.value
                          includes:
                          - ws: Chinese word segmentation
                          - pos: Part of speech tagging
                          - ner: Named entity recognization
                          - dp: Dependency parsing
                          - srl: Semantic role labeling (equals to all)
                          - all: The whole pipeline [default]
  --input arg             The path to the input file.
  --segmentor-model arg   The path to the segment model
                          [default=ltp_data/cws.model].
  --segmentor-lexicon arg The path to the external lexicon in segmentor
                          [optional].
  --postagger-model arg   The path to the postag model
                          [default=ltp_data/pos.model].
  --postagger-lexicon arg The path to the external lexicon in postagger
                          [optional].
  --ner-model arg         The path to the NER model [default=ltp_data/ner.model
                          ].
  --parser-model arg      The path to the parser model
                          [default=ltp_data/parser.model].
  --srl-data arg          The path to the SRL model directory
                          [default=ltp_data/srl_data/].
  --debug-level arg       The debug level.
  -h [ --help ]           Show help information

3、测试例子
en.txt

Don't ever let somebody tell you you can't do something, not even me. 
You got a dream, you gotta protect it. 
People can’t do something themselves, they wanna tell you you can’t do it. 
If you want something, go get it. 
Period.

zh.txt

别让别人告诉你你成不了才,即使是我也不行。
如果你有梦想的话,就要去捍卫它。
那些一事无成的人想告诉你你也成不了大器。
如果你有理想的话,就要去努力实现。
就这样。

4、执行语句

ltp_test --input en.txt > enout.txt
ltp_test --input zh.txt > zhout.txt

5、测试结果
enout.txt

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="Don&apos;t ever let somebody tell you you can&apos;t do something, not even me.">
                <word id="0" cont="Don&apos;t" pos="ws" ne="O" parent="1" relate="ATT" />
                <word id="1" cont="ever" pos="ws" ne="O" parent="7" relate="ATT" />
                <word id="2" cont="let" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="3" cont="somebody" pos="ws" ne="O" parent="4" relate="ATT" />
                <word id="4" cont="tell" pos="ws" ne="O" parent="5" relate="ATT" />
                <word id="5" cont="you" pos="ws" ne="O" parent="6" relate="ATT" />
                <word id="6" cont="you" pos="ws" ne="O" parent="7" relate="ATT" />
                <word id="7" cont="can&apos;t" pos="ws" ne="O" parent="8" relate="ATT" />
                <word id="8" cont="do" pos="ws" ne="O" parent="9" relate="ATT" />
                <word id="9" cont="something" pos="ws" ne="O" parent="-1" relate="HED" />
                <word id="10" cont="," pos="wp" ne="O" parent="9" relate="WP" />
                <word id="11" cont="not" pos="ws" ne="O" parent="13" relate="ATT" />
                <word id="12" cont="even" pos="ws" ne="O" parent="13" relate="ATT" />
                <word id="13" cont="me" pos="ws" ne="O" parent="9" relate="COO" />
                <word id="14" cont="." pos="wp" ne="O" parent="13" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="You got a dream, you gotta protect it.">
                <word id="0" cont="You" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="1" cont="got" pos="ws" ne="O" parent="2" relate="ATT" />
                <word id="2" cont="a" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="3" cont="dream" pos="ws" ne="O" parent="-1" relate="HED" />
                <word id="4" cont="," pos="wp" ne="O" parent="3" relate="WP" />
                <word id="5" cont="you" pos="ws" ne="O" parent="8" relate="ATT" />
                <word id="6" cont="gotta" pos="ws" ne="O" parent="8" relate="ATT" />
                <word id="7" cont="protect" pos="ws" ne="O" parent="8" relate="ATT" />
                <word id="8" cont="it" pos="ws" ne="O" parent="3" relate="COO" />
                <word id="9" cont="." pos="wp" ne="O" parent="3" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="People can’t do something themselves, they wanna tell you you can’t do it.">
                <word id="0" cont="People" pos="ws" ne="O" parent="1" relate="ATT" />
                <word id="1" cont="can" pos="ws" ne="O" parent="-1" relate="HED" />
                <word id="2" cont="’" pos="wp" ne="O" parent="6" relate="WP" />
                <word id="3" cont="t" pos="ws" ne="O" parent="4" relate="ATT" />
                <word id="4" cont="do" pos="ws" ne="O" parent="5" relate="ATT" />
                <word id="5" cont="something" pos="ws" ne="O" parent="6" relate="ATT" />
                <word id="6" cont="themselves" pos="ws" ne="O" parent="1" relate="COO" />
                <word id="7" cont="," pos="wp" ne="O" parent="6" relate="WP" />
                <word id="8" cont="they" pos="ws" ne="O" parent="9" relate="ATT" />
                <word id="9" cont="wanna" pos="ws" ne="O" parent="10" relate="ATT" />
                <word id="10" cont="tell" pos="ws" ne="O" parent="11" relate="ATT" />
                <word id="11" cont="you" pos="ws" ne="O" parent="12" relate="ATT" />
                <word id="12" cont="you" pos="ws" ne="O" parent="13" relate="ATT" />
                <word id="13" cont="can" pos="ws" ne="O" parent="6" relate="COO" />
                <word id="14" cont="’" pos="wp" ne="O" parent="17" relate="WP" />
                <word id="15" cont="t" pos="ws" ne="O" parent="17" relate="ATT" />
                <word id="16" cont="do" pos="ws" ne="O" parent="17" relate="ATT" />
                <word id="17" cont="it" pos="ws" ne="O" parent="13" relate="COO" />
                <word id="18" cont="." pos="wp" ne="O" parent="17" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="If you want something, go get it.">
                <word id="0" cont="If" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="1" cont="you" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="2" cont="want" pos="ws" ne="O" parent="3" relate="ATT" />
                <word id="3" cont="something" pos="ws" ne="O" parent="-1" relate="HED" />
                <word id="4" cont="," pos="wp" ne="O" parent="3" relate="WP" />
                <word id="5" cont="go" pos="ws" ne="O" parent="7" relate="ATT" />
                <word id="6" cont="get" pos="ws" ne="O" parent="7" relate="ATT" />
                <word id="7" cont="it" pos="ws" ne="O" parent="3" relate="COO" />
                <word id="8" cont="." pos="wp" ne="O" parent="3" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="Period.">
                <word id="0" cont="Period" pos="ws" ne="O" parent="-1" relate="HED" />
                <word id="1" cont="." pos="wp" ne="O" parent="0" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

zhout.txt

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="别让别人告诉你你成不了才,即使是我也不行。">
                <word id="0" cont="别" pos="d" ne="O" parent="1" relate="ADV" />
                <word id="1" cont="让" pos="v" ne="O" parent="-1" relate="HED">
                    <arg id="0" type="ù" beg="0" end="0" />
                    <arg id="1" type="" beg="2" end="2" />
                    <arg id="2" type="" beg="3" end="9" />
                </word>
                <word id="2" cont="别人" pos="r" ne="O" parent="1" relate="DBL" />
                <word id="3" cont="告诉" pos="v" ne="O" parent="1" relate="VOB">
                    <arg id="0" type="R&#x07;" beg="4" end="4" />
                    <arg id="1" type="" beg="5" end="9" />
                </word>
                <word id="4" cont="你" pos="r" ne="O" parent="3" relate="IOB" />
                <word id="5" cont="你" pos="r" ne="O" parent="6" relate="SBV" />
                <word id="6" cont="成" pos="v" ne="O" parent="3" relate="VOB">
                    <arg id="0" type="ˆ&#x07;" beg="5" end="5" />
                    <arg id="1" type="" beg="9" end="9" />
                </word>
                <word id="7" cont="不" pos="d" ne="O" parent="8" relate="ADV" />
                <word id="8" cont="了" pos="v" ne="O" parent="6" relate="CMP" />
                <word id="9" cont="才" pos="n" ne="O" parent="6" relate="VOB" />
                <word id="10" cont="," pos="wp" ne="O" parent="1" relate="WP" />
                <word id="11" cont="即使" pos="c" ne="O" parent="12" relate="ADV" />
                <word id="12" cont="是" pos="v" ne="O" parent="1" relate="COO">
                    <arg id="0" type="|&#x1C;S" beg="11" end="11" />
                </word>
                <word id="13" cont="我" pos="r" ne="O" parent="15" relate="SBV" />
                <word id="14" cont="也" pos="d" ne="O" parent="15" relate="ADV" />
                <word id="15" cont="不行" pos="a" ne="O" parent="12" relate="VOB">
                    <arg id="0" type="‘&#x07;" beg="13" end="13" />
                    <arg id="1" type="" beg="14" end="14" />
                </word>
                <word id="16" cont="。" pos="wp" ne="O" parent="1" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="如果你有梦想的话,就要去捍卫它。">
                <word id="0" cont="如果" pos="c" ne="O" parent="2" relate="ADV" />
                <word id="1" cont="你" pos="r" ne="O" parent="2" relate="SBV" />
                <word id="2" cont="有" pos="v" ne="O" parent="-1" relate="HED">
                    <arg id="0" type="&#x02;" beg="0" end="0" />
                    <arg id="1" type="" beg="1" end="1" />
                    <arg id="2" type="" beg="3" end="3" />
                    <arg id="3" type="" beg="5" end="5" />
                </word>
                <word id="3" cont="梦想" pos="n" ne="O" parent="2" relate="VOB" />
                <word id="4" cont="的" pos="u" ne="O" parent="2" relate="RAD" />
                <word id="5" cont="话" pos="n" ne="O" parent="2" relate="VOB" />
                <word id="6" cont="," pos="wp" ne="O" parent="2" relate="WP" />
                <word id="7" cont="就要" pos="d" ne="O" parent="9" relate="ADV" />
                <word id="8" cont="去" pos="v" ne="O" parent="9" relate="ADV" />
                <word id="9" cont="捍卫" pos="v" ne="O" parent="2" relate="COO">
                    <arg id="0" type="£&#x07;V" beg="7" end="7" />
                    <arg id="1" type="" beg="10" end="10" />
                </word>
                <word id="10" cont="它" pos="r" ne="O" parent="9" relate="VOB" />
                <word id="11" cont="。" pos="wp" ne="O" parent="2" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="那些一事无成的人想告诉你你也成不了大器。">
                <word id="0" cont="那些" pos="r" ne="O" parent="3" relate="ATT" />
                <word id="1" cont="一事无成" pos="i" ne="O" parent="3" relate="ATT" />
                <word id="2" cont="的" pos="u" ne="O" parent="1" relate="RAD" />
                <word id="3" cont="人" pos="n" ne="O" parent="4" relate="SBV" />
                <word id="4" cont="想" pos="v" ne="O" parent="-1" relate="HED">
                    <arg id="0" type="‘&#x07;" beg="0" end="3" />
                    <arg id="1" type="" beg="5" end="12" />
                </word>
                <word id="5" cont="告诉" pos="v" ne="O" parent="4" relate="VOB">
                    <arg id="0" type="â&#x07;" beg="6" end="6" />
                    <arg id="1" type="" beg="7" end="12" />
                </word>
                <word id="6" cont="你" pos="r" ne="O" parent="5" relate="IOB" />
                <word id="7" cont="你" pos="r" ne="O" parent="9" relate="SBV" />
                <word id="8" cont="也" pos="d" ne="O" parent="9" relate="ADV" />
                <word id="9" cont="成" pos="v" ne="O" parent="5" relate="VOB">
                    <arg id="0" type="ù" beg="7" end="7" />
                    <arg id="1" type="" beg="8" end="8" />
                    <arg id="2" type="" beg="12" end="12" />
                </word>
                <word id="10" cont="不" pos="d" ne="O" parent="11" relate="ADV" />
                <word id="11" cont="了" pos="v" ne="O" parent="9" relate="CMP" />
                <word id="12" cont="大器" pos="n" ne="O" parent="9" relate="VOB" />
                <word id="13" cont="。" pos="wp" ne="O" parent="4" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="如果你有理想的话,就要去努力实现。">
                <word id="0" cont="如果" pos="c" ne="O" parent="2" relate="ADV" />
                <word id="1" cont="你" pos="r" ne="O" parent="2" relate="SBV" />
                <word id="2" cont="有" pos="v" ne="O" parent="-1" relate="HED">
                    <arg id="0" type="&#x13;&#x01;S" beg="0" end="0" />
                    <arg id="1" type="" beg="1" end="1" />
                    <arg id="2" type="" beg="3" end="3" />
                </word>
                <word id="3" cont="理想" pos="n" ne="O" parent="2" relate="VOB" />
                <word id="4" cont="的话" pos="u" ne="O" parent="2" relate="RAD" />
                <word id="5" cont="," pos="wp" ne="O" parent="2" relate="WP" />
                <word id="6" cont="就要" pos="d" ne="O" parent="9" relate="ADV">
                    <arg id="0" type="" beg="4" end="4" />
                </word>
                <word id="7" cont="去" pos="v" ne="O" parent="9" relate="ADV">
                    <arg id="0" type="³&#x03;D" beg="4" end="4" />
                    <arg id="1" type="" beg="6" end="6" />
                </word>
                <word id="8" cont="努力" pos="a" ne="O" parent="9" relate="ADV" />
                <word id="9" cont="实现" pos="v" ne="O" parent="2" relate="COO">
                    <arg id="0" type=" &#x01;D" beg="4" end="4" />
                    <arg id="1" type="" beg="6" end="6" />
                    <arg id="2" type="" beg="8" end="8" />
                </word>
                <word id="10" cont="。" pos="wp" ne="O" parent="2" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

<?xml version="1.0" encoding="utf-8" ?>
<xml4nlp>
    <note sent="y" word="y" pos="y" ne="y" parser="y" wsd="n" srl="y" />
    <doc>
        <para id="0">
            <sent id="0" cont="就这样。">
                <word id="0" cont="就" pos="d" ne="O" parent="1" relate="ADV" />
                <word id="1" cont="这样" pos="r" ne="O" parent="-1" relate="HED" />
                <word id="2" cont="。" pos="wp" ne="O" parent="1" relate="WP" />
            </sent>
        </para>
    </doc>
</xml4nlp>

Comments are closed.